[clang] [llvm] [AArch64] Add Neon FP8 conversion intrinsics (PR #119033)
Momchil Velikov via llvm-commits
llvm-commits at lists.llvm.org
Mon Jan 6 08:53:06 PST 2025
https://github.com/momchil-velikov updated https://github.com/llvm/llvm-project/pull/119033
>From 4ca33ab1931d406fe34b04816eac53645e9877cf Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov at arm.com>
Date: Fri, 3 Jan 2025 18:39:13 +0000
Subject: [PATCH 01/11] Handle leading underscores in update_cc_test_checks.py
For some ABIs `update_cc_test_checks.py` is unable to generate
tests because of a mismatch between the mangled function names
reported by clang's `-ast-dump` and the function names in the LLVM IR.
This patch fixes it by stripping the leading underscore from
the mangled name of global functions when the data layout string says
they have one.
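
As an illustration only (a minimal sketch, not the patch itself; the
helper name and the exact data-layout handling here are assumptions),
the idea fits in a few lines of Python: the data layout's mangling
component `m:o` (Mach-O) or `m:x` (Windows x86 COFF) implies a leading
`_` on global symbol names, which the script can drop before matching
the AST-level name against the IR function name:

    import re

    def strip_global_underscore(mangled_name: str, data_layout: str) -> str:
        # Hypothetical helper: map an AST-level mangled name to the
        # name that appears on the IR function definition.
        m = re.search(r"m:([a-z])", data_layout)
        has_underscore_prefix = m is not None and m.group(1) in ("o", "x")
        if has_underscore_prefix and mangled_name.startswith("_"):
            return mangled_name[1:]
        return mangled_name

    # On thumbv7s-apple-darwin the AST dump reports "_test_vaba_s8",
    # while the IR defines "test_vaba_s8":
    assert strip_global_underscore("_test_vaba_s8", "e-m:o-p:32:32") == "test_vaba_s8"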
---
clang/test/CodeGen/arm_neon_intrinsics.c | 30844 +++++++++++++--------
llvm/utils/UpdateTestChecks/common.py | 15 +
llvm/utils/update_cc_test_checks.py | 17 +-
3 files changed, 18646 insertions(+), 12230 deletions(-)
diff --git a/clang/test/CodeGen/arm_neon_intrinsics.c b/clang/test/CodeGen/arm_neon_intrinsics.c
index 9f43dd2be5af58..1397e731d3da16 100644
--- a/clang/test/CodeGen/arm_neon_intrinsics.c
+++ b/clang/test/CodeGen/arm_neon_intrinsics.c
@@ -1,3 +1,4 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
// RUN: %clang_cc1 -triple thumbv7s-apple-darwin -target-abi apcs-gnu\
// RUN: -target-cpu swift \
// RUN: -target-feature +fullfp16 -ffreestanding \
@@ -9,20003 +10,26398 @@
#include <arm_neon.h>
-// CHECK-LABEL: @test_vaba_s8(
-// CHECK: [[VABD_V_I_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %b, <8 x i8> %c)
-// CHECK: [[ADD_I:%.*]] = add <8 x i8> %a, [[VABD_V_I_I]]
-// CHECK: ret <8 x i8> [[ADD_I]]
+// CHECK-LABEL: define <8 x i8> @test_vaba_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> [[B]], <8 x i8> [[C]])
+// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i8> [[A]], [[VABD_V_I]]
+// CHECK-NEXT: ret <8 x i8> [[ADD_I]]
+//
int8x8_t test_vaba_s8(int8x8_t a, int8x8_t b, int8x8_t c) {
return vaba_s8(a, b, c);
}
-// CHECK-LABEL: @test_vaba_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
-// CHECK: [[VABD_V2_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %b, <4 x i16> %c)
-// CHECK: [[VABD_V3_I_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I]] to <8 x i8>
-// CHECK: [[ADD_I:%.*]] = add <4 x i16> %a, [[VABD_V2_I_I]]
-// CHECK: ret <4 x i16> [[ADD_I]]
+// CHECK-LABEL: define <4 x i16> @test_vaba_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8>
+// CHECK-NEXT: [[VABD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> [[B]], <4 x i16> [[C]])
+// CHECK-NEXT: [[VABD_V3_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I]] to <8 x i8>
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i16> [[A]], [[VABD_V2_I]]
+// CHECK-NEXT: ret <4 x i16> [[ADD_I]]
+//
int16x4_t test_vaba_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
return vaba_s16(a, b, c);
}
-// CHECK-LABEL: @test_vaba_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
-// CHECK: [[VABD_V2_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %b, <2 x i32> %c)
-// CHECK: [[VABD_V3_I_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I]] to <8 x i8>
-// CHECK: [[ADD_I:%.*]] = add <2 x i32> %a, [[VABD_V2_I_I]]
-// CHECK: ret <2 x i32> [[ADD_I]]
+// CHECK-LABEL: define <2 x i32> @test_vaba_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8>
+// CHECK-NEXT: [[VABD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> [[B]], <2 x i32> [[C]])
+// CHECK-NEXT: [[VABD_V3_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I]] to <8 x i8>
+// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i32> [[A]], [[VABD_V2_I]]
+// CHECK-NEXT: ret <2 x i32> [[ADD_I]]
+//
int32x2_t test_vaba_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
return vaba_s32(a, b, c);
}
-// CHECK-LABEL: @test_vaba_u8(
-// CHECK: [[VABD_V_I_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %b, <8 x i8> %c)
-// CHECK: [[ADD_I:%.*]] = add <8 x i8> %a, [[VABD_V_I_I]]
-// CHECK: ret <8 x i8> [[ADD_I]]
+// CHECK-LABEL: define <8 x i8> @test_vaba_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> [[B]], <8 x i8> [[C]])
+// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i8> [[A]], [[VABD_V_I]]
+// CHECK-NEXT: ret <8 x i8> [[ADD_I]]
+//
uint8x8_t test_vaba_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) {
return vaba_u8(a, b, c);
}
-// CHECK-LABEL: @test_vaba_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
-// CHECK: [[VABD_V2_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %b, <4 x i16> %c)
-// CHECK: [[VABD_V3_I_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I]] to <8 x i8>
-// CHECK: [[ADD_I:%.*]] = add <4 x i16> %a, [[VABD_V2_I_I]]
-// CHECK: ret <4 x i16> [[ADD_I]]
+// CHECK-LABEL: define <4 x i16> @test_vaba_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8>
+// CHECK-NEXT: [[VABD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> [[B]], <4 x i16> [[C]])
+// CHECK-NEXT: [[VABD_V3_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I]] to <8 x i8>
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i16> [[A]], [[VABD_V2_I]]
+// CHECK-NEXT: ret <4 x i16> [[ADD_I]]
+//
uint16x4_t test_vaba_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) {
return vaba_u16(a, b, c);
}
-// CHECK-LABEL: @test_vaba_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
-// CHECK: [[VABD_V2_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %b, <2 x i32> %c)
-// CHECK: [[VABD_V3_I_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I]] to <8 x i8>
-// CHECK: [[ADD_I:%.*]] = add <2 x i32> %a, [[VABD_V2_I_I]]
-// CHECK: ret <2 x i32> [[ADD_I]]
+// CHECK-LABEL: define <2 x i32> @test_vaba_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8>
+// CHECK-NEXT: [[VABD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> [[B]], <2 x i32> [[C]])
+// CHECK-NEXT: [[VABD_V3_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I]] to <8 x i8>
+// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i32> [[A]], [[VABD_V2_I]]
+// CHECK-NEXT: ret <2 x i32> [[ADD_I]]
+//
uint32x2_t test_vaba_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) {
return vaba_u32(a, b, c);
}
-// CHECK-LABEL: @test_vabaq_s8(
-// CHECK: [[VABDQ_V_I_I:%.*]] = call <16 x i8> @llvm.arm.neon.vabds.v16i8(<16 x i8> %b, <16 x i8> %c)
-// CHECK: [[ADD_I:%.*]] = add <16 x i8> %a, [[VABDQ_V_I_I]]
-// CHECK: ret <16 x i8> [[ADD_I]]
+// CHECK-LABEL: define <16 x i8> @test_vabaq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]], <16 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vabds.v16i8(<16 x i8> [[B]], <16 x i8> [[C]])
+// CHECK-NEXT: [[ADD_I:%.*]] = add <16 x i8> [[A]], [[VABDQ_V_I]]
+// CHECK-NEXT: ret <16 x i8> [[ADD_I]]
+//
int8x16_t test_vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c) {
return vabaq_s8(a, b, c);
}
-// CHECK-LABEL: @test_vabaq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %c to <16 x i8>
-// CHECK: [[VABDQ_V2_I_I:%.*]] = call <8 x i16> @llvm.arm.neon.vabds.v8i16(<8 x i16> %b, <8 x i16> %c)
-// CHECK: [[VABDQ_V3_I_I:%.*]] = bitcast <8 x i16> [[VABDQ_V2_I_I]] to <16 x i8>
-// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VABDQ_V2_I_I]]
-// CHECK: ret <8 x i16> [[ADD_I]]
+// CHECK-LABEL: define <8 x i16> @test_vabaq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <8 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[C]] to <16 x i8>
+// CHECK-NEXT: [[VABDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vabds.v8i16(<8 x i16> [[B]], <8 x i16> [[C]])
+// CHECK-NEXT: [[VABDQ_V3_I:%.*]] = bitcast <8 x i16> [[VABDQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[A]], [[VABDQ_V2_I]]
+// CHECK-NEXT: ret <8 x i16> [[ADD_I]]
+//
int16x8_t test_vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c) {
return vabaq_s16(a, b, c);
}
-// CHECK-LABEL: @test_vabaq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %c to <16 x i8>
-// CHECK: [[VABDQ_V2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vabds.v4i32(<4 x i32> %b, <4 x i32> %c)
-// CHECK: [[VABDQ_V3_I_I:%.*]] = bitcast <4 x i32> [[VABDQ_V2_I_I]] to <16 x i8>
-// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VABDQ_V2_I_I]]
-// CHECK: ret <4 x i32> [[ADD_I]]
+// CHECK-LABEL: define <4 x i32> @test_vabaq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[C]] to <16 x i8>
+// CHECK-NEXT: [[VABDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vabds.v4i32(<4 x i32> [[B]], <4 x i32> [[C]])
+// CHECK-NEXT: [[VABDQ_V3_I:%.*]] = bitcast <4 x i32> [[VABDQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A]], [[VABDQ_V2_I]]
+// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
+//
int32x4_t test_vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c) {
return vabaq_s32(a, b, c);
}
-// CHECK-LABEL: @test_vabaq_u8(
-// CHECK: [[VABDQ_V_I_I:%.*]] = call <16 x i8> @llvm.arm.neon.vabdu.v16i8(<16 x i8> %b, <16 x i8> %c)
-// CHECK: [[ADD_I:%.*]] = add <16 x i8> %a, [[VABDQ_V_I_I]]
-// CHECK: ret <16 x i8> [[ADD_I]]
+// CHECK-LABEL: define <16 x i8> @test_vabaq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]], <16 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vabdu.v16i8(<16 x i8> [[B]], <16 x i8> [[C]])
+// CHECK-NEXT: [[ADD_I:%.*]] = add <16 x i8> [[A]], [[VABDQ_V_I]]
+// CHECK-NEXT: ret <16 x i8> [[ADD_I]]
+//
uint8x16_t test_vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) {
return vabaq_u8(a, b, c);
}
-// CHECK-LABEL: @test_vabaq_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %c to <16 x i8>
-// CHECK: [[VABDQ_V2_I_I:%.*]] = call <8 x i16> @llvm.arm.neon.vabdu.v8i16(<8 x i16> %b, <8 x i16> %c)
-// CHECK: [[VABDQ_V3_I_I:%.*]] = bitcast <8 x i16> [[VABDQ_V2_I_I]] to <16 x i8>
-// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VABDQ_V2_I_I]]
-// CHECK: ret <8 x i16> [[ADD_I]]
+// CHECK-LABEL: define <8 x i16> @test_vabaq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <8 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[C]] to <16 x i8>
+// CHECK-NEXT: [[VABDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vabdu.v8i16(<8 x i16> [[B]], <8 x i16> [[C]])
+// CHECK-NEXT: [[VABDQ_V3_I:%.*]] = bitcast <8 x i16> [[VABDQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[A]], [[VABDQ_V2_I]]
+// CHECK-NEXT: ret <8 x i16> [[ADD_I]]
+//
uint16x8_t test_vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) {
return vabaq_u16(a, b, c);
}
-// CHECK-LABEL: @test_vabaq_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %c to <16 x i8>
-// CHECK: [[VABDQ_V2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vabdu.v4i32(<4 x i32> %b, <4 x i32> %c)
-// CHECK: [[VABDQ_V3_I_I:%.*]] = bitcast <4 x i32> [[VABDQ_V2_I_I]] to <16 x i8>
-// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VABDQ_V2_I_I]]
-// CHECK: ret <4 x i32> [[ADD_I]]
+// CHECK-LABEL: define <4 x i32> @test_vabaq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[C]] to <16 x i8>
+// CHECK-NEXT: [[VABDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vabdu.v4i32(<4 x i32> [[B]], <4 x i32> [[C]])
+// CHECK-NEXT: [[VABDQ_V3_I:%.*]] = bitcast <4 x i32> [[VABDQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A]], [[VABDQ_V2_I]]
+// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
+//
uint32x4_t test_vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) {
return vabaq_u32(a, b, c);
}
-// CHECK-LABEL: @test_vabal_s8(
-// CHECK: [[VABD_V_I_I_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %b, <8 x i8> %c)
-// CHECK: [[VMOVL_I_I_I:%.*]] = zext <8 x i8> [[VABD_V_I_I_I]] to <8 x i16>
-// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I_I]]
-// CHECK: ret <8 x i16> [[ADD_I]]
+// CHECK-LABEL: define <8 x i16> @test_vabal_s8(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD_V_I_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> [[B]], <8 x i8> [[C]])
+// CHECK-NEXT: [[VMOVL_I:%.*]] = zext <8 x i8> [[VABD_V_I_I]] to <8 x i16>
+// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[A]], [[VMOVL_I]]
+// CHECK-NEXT: ret <8 x i16> [[ADD_I]]
+//
int16x8_t test_vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c) {
return vabal_s8(a, b, c);
}
-// CHECK-LABEL: @test_vabal_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
-// CHECK: [[VABD_V2_I_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %b, <4 x i16> %c)
-// CHECK: [[VABD_V3_I_I_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I_I]] to <8 x i8>
-// CHECK: [[VMOVL_I_I_I:%.*]] = zext <4 x i16> [[VABD_V2_I_I_I]] to <4 x i32>
-// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I_I]]
-// CHECK: ret <4 x i32> [[ADD_I]]
+// CHECK-LABEL: define <4 x i32> @test_vabal_s16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8>
+// CHECK-NEXT: [[VABD_V2_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> [[B]], <4 x i16> [[C]])
+// CHECK-NEXT: [[VABD_V3_I_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I]] to <8 x i8>
+// CHECK-NEXT: [[VMOVL_I:%.*]] = zext <4 x i16> [[VABD_V2_I_I]] to <4 x i32>
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A]], [[VMOVL_I]]
+// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
+//
int32x4_t test_vabal_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
return vabal_s16(a, b, c);
}
-// CHECK-LABEL: @test_vabal_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
-// CHECK: [[VABD_V2_I_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %b, <2 x i32> %c)
-// CHECK: [[VABD_V3_I_I_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I_I]] to <8 x i8>
-// CHECK: [[VMOVL_I_I_I:%.*]] = zext <2 x i32> [[VABD_V2_I_I_I]] to <2 x i64>
-// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I_I]]
-// CHECK: ret <2 x i64> [[ADD_I]]
+// CHECK-LABEL: define <2 x i64> @test_vabal_s32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8>
+// CHECK-NEXT: [[VABD_V2_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> [[B]], <2 x i32> [[C]])
+// CHECK-NEXT: [[VABD_V3_I_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I]] to <8 x i8>
+// CHECK-NEXT: [[VMOVL_I:%.*]] = zext <2 x i32> [[VABD_V2_I_I]] to <2 x i64>
+// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A]], [[VMOVL_I]]
+// CHECK-NEXT: ret <2 x i64> [[ADD_I]]
+//
int64x2_t test_vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
return vabal_s32(a, b, c);
}
-// CHECK-LABEL: @test_vabal_u8(
-// CHECK: [[VABD_V_I_I_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %b, <8 x i8> %c)
-// CHECK: [[VMOVL_I_I_I:%.*]] = zext <8 x i8> [[VABD_V_I_I_I]] to <8 x i16>
-// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I_I]]
-// CHECK: ret <8 x i16> [[ADD_I]]
+// CHECK-LABEL: define <8 x i16> @test_vabal_u8(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD_V_I_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> [[B]], <8 x i8> [[C]])
+// CHECK-NEXT: [[VMOVL_I:%.*]] = zext <8 x i8> [[VABD_V_I_I]] to <8 x i16>
+// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[A]], [[VMOVL_I]]
+// CHECK-NEXT: ret <8 x i16> [[ADD_I]]
+//
uint16x8_t test_vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) {
return vabal_u8(a, b, c);
}
-// CHECK-LABEL: @test_vabal_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
-// CHECK: [[VABD_V2_I_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %b, <4 x i16> %c)
-// CHECK: [[VABD_V3_I_I_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I_I]] to <8 x i8>
-// CHECK: [[VMOVL_I_I_I:%.*]] = zext <4 x i16> [[VABD_V2_I_I_I]] to <4 x i32>
-// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I_I]]
-// CHECK: ret <4 x i32> [[ADD_I]]
+// CHECK-LABEL: define <4 x i32> @test_vabal_u16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8>
+// CHECK-NEXT: [[VABD_V2_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> [[B]], <4 x i16> [[C]])
+// CHECK-NEXT: [[VABD_V3_I_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I]] to <8 x i8>
+// CHECK-NEXT: [[VMOVL_I:%.*]] = zext <4 x i16> [[VABD_V2_I_I]] to <4 x i32>
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A]], [[VMOVL_I]]
+// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
+//
uint32x4_t test_vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) {
return vabal_u16(a, b, c);
}
-// CHECK-LABEL: @test_vabal_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
-// CHECK: [[VABD_V2_I_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %b, <2 x i32> %c)
-// CHECK: [[VABD_V3_I_I_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I_I]] to <8 x i8>
-// CHECK: [[VMOVL_I_I_I:%.*]] = zext <2 x i32> [[VABD_V2_I_I_I]] to <2 x i64>
-// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I_I]]
-// CHECK: ret <2 x i64> [[ADD_I]]
+// CHECK-LABEL: define <2 x i64> @test_vabal_u32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8>
+// CHECK-NEXT: [[VABD_V2_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> [[B]], <2 x i32> [[C]])
+// CHECK-NEXT: [[VABD_V3_I_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I]] to <8 x i8>
+// CHECK-NEXT: [[VMOVL_I:%.*]] = zext <2 x i32> [[VABD_V2_I_I]] to <2 x i64>
+// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A]], [[VMOVL_I]]
+// CHECK-NEXT: ret <2 x i64> [[ADD_I]]
+//
uint64x2_t test_vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) {
return vabal_u32(a, b, c);
}
-// CHECK-LABEL: @test_vabd_s8(
-// CHECK: [[VABD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VABD_V_I]]
+// CHECK-LABEL: define <8 x i8> @test_vabd_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VABD_V_I]]
+//
int8x8_t test_vabd_s8(int8x8_t a, int8x8_t b) {
return vabd_s8(a, b);
}
-// CHECK-LABEL: @test_vabd_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VABD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VABD_V3_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VABD_V2_I]]
+// CHECK-LABEL: define <4 x i16> @test_vabd_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VABD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: [[VABD_V3_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <4 x i16> [[VABD_V2_I]]
+//
int16x4_t test_vabd_s16(int16x4_t a, int16x4_t b) {
return vabd_s16(a, b);
}
-// CHECK-LABEL: @test_vabd_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VABD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VABD_V3_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VABD_V2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vabd_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VABD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: [[VABD_V3_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <2 x i32> [[VABD_V2_I]]
+//
int32x2_t test_vabd_s32(int32x2_t a, int32x2_t b) {
return vabd_s32(a, b);
}
-// CHECK-LABEL: @test_vabd_u8(
-// CHECK: [[VABD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VABD_V_I]]
+// CHECK-LABEL: define <8 x i8> @test_vabd_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VABD_V_I]]
+//
uint8x8_t test_vabd_u8(uint8x8_t a, uint8x8_t b) {
return vabd_u8(a, b);
}
-// CHECK-LABEL: @test_vabd_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VABD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VABD_V3_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VABD_V2_I]]
+// CHECK-LABEL: define <4 x i16> @test_vabd_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VABD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: [[VABD_V3_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <4 x i16> [[VABD_V2_I]]
+//
uint16x4_t test_vabd_u16(uint16x4_t a, uint16x4_t b) {
return vabd_u16(a, b);
}
-// CHECK-LABEL: @test_vabd_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VABD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VABD_V3_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VABD_V2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vabd_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VABD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: [[VABD_V3_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <2 x i32> [[VABD_V2_I]]
+//
uint32x2_t test_vabd_u32(uint32x2_t a, uint32x2_t b) {
return vabd_u32(a, b);
}
-// CHECK-LABEL: @test_vabd_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
-// CHECK: [[VABD_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vabds.v2f32(<2 x float> %a, <2 x float> %b)
-// CHECK: [[VABD_V3_I:%.*]] = bitcast <2 x float> [[VABD_V2_I]] to <8 x i8>
-// CHECK: ret <2 x float> [[VABD_V2_I]]
+// CHECK-LABEL: define <2 x float> @test_vabd_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VABD_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vabds.v2f32(<2 x float> [[A]], <2 x float> [[B]])
+// CHECK-NEXT: [[VABD_V3_I:%.*]] = bitcast <2 x float> [[VABD_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <2 x float> [[VABD_V2_I]]
+//
float32x2_t test_vabd_f32(float32x2_t a, float32x2_t b) {
return vabd_f32(a, b);
}
-// CHECK-LABEL: @test_vabdq_s8(
-// CHECK: [[VABDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vabds.v16i8(<16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <16 x i8> [[VABDQ_V_I]]
+// CHECK-LABEL: define <16 x i8> @test_vabdq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vabds.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <16 x i8> [[VABDQ_V_I]]
+//
int8x16_t test_vabdq_s8(int8x16_t a, int8x16_t b) {
return vabdq_s8(a, b);
}
-// CHECK-LABEL: @test_vabdq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VABDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vabds.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: [[VABDQ_V3_I:%.*]] = bitcast <8 x i16> [[VABDQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VABDQ_V2_I]]
+// CHECK-LABEL: define <8 x i16> @test_vabdq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VABDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vabds.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: [[VABDQ_V3_I:%.*]] = bitcast <8 x i16> [[VABDQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: ret <8 x i16> [[VABDQ_V2_I]]
+//
int16x8_t test_vabdq_s16(int16x8_t a, int16x8_t b) {
return vabdq_s16(a, b);
}
-// CHECK-LABEL: @test_vabdq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VABDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vabds.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VABDQ_V3_I:%.*]] = bitcast <4 x i32> [[VABDQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VABDQ_V2_I]]
+// CHECK-LABEL: define <4 x i32> @test_vabdq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VABDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vabds.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: [[VABDQ_V3_I:%.*]] = bitcast <4 x i32> [[VABDQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: ret <4 x i32> [[VABDQ_V2_I]]
+//
int32x4_t test_vabdq_s32(int32x4_t a, int32x4_t b) {
return vabdq_s32(a, b);
}
-// CHECK-LABEL: @test_vabdq_u8(
-// CHECK: [[VABDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vabdu.v16i8(<16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <16 x i8> [[VABDQ_V_I]]
+// CHECK-LABEL: define <16 x i8> @test_vabdq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vabdu.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <16 x i8> [[VABDQ_V_I]]
+//
uint8x16_t test_vabdq_u8(uint8x16_t a, uint8x16_t b) {
return vabdq_u8(a, b);
}
-// CHECK-LABEL: @test_vabdq_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VABDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vabdu.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: [[VABDQ_V3_I:%.*]] = bitcast <8 x i16> [[VABDQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VABDQ_V2_I]]
+// CHECK-LABEL: define <8 x i16> @test_vabdq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VABDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vabdu.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: [[VABDQ_V3_I:%.*]] = bitcast <8 x i16> [[VABDQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: ret <8 x i16> [[VABDQ_V2_I]]
+//
uint16x8_t test_vabdq_u16(uint16x8_t a, uint16x8_t b) {
return vabdq_u16(a, b);
}
-// CHECK-LABEL: @test_vabdq_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VABDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vabdu.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VABDQ_V3_I:%.*]] = bitcast <4 x i32> [[VABDQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VABDQ_V2_I]]
+// CHECK-LABEL: define <4 x i32> @test_vabdq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VABDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vabdu.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: [[VABDQ_V3_I:%.*]] = bitcast <4 x i32> [[VABDQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: ret <4 x i32> [[VABDQ_V2_I]]
+//
uint32x4_t test_vabdq_u32(uint32x4_t a, uint32x4_t b) {
return vabdq_u32(a, b);
}
-// CHECK-LABEL: @test_vabdq_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
-// CHECK: [[VABDQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vabds.v4f32(<4 x float> %a, <4 x float> %b)
-// CHECK: [[VABDQ_V3_I:%.*]] = bitcast <4 x float> [[VABDQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x float> [[VABDQ_V2_I]]
+// CHECK-LABEL: define <4 x float> @test_vabdq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VABDQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vabds.v4f32(<4 x float> [[A]], <4 x float> [[B]])
+// CHECK-NEXT: [[VABDQ_V3_I:%.*]] = bitcast <4 x float> [[VABDQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: ret <4 x float> [[VABDQ_V2_I]]
+//
float32x4_t test_vabdq_f32(float32x4_t a, float32x4_t b) {
return vabdq_f32(a, b);
}
-// CHECK-LABEL: @test_vabdl_s8(
-// CHECK: [[VABD_V_I_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: [[VMOVL_I_I:%.*]] = zext <8 x i8> [[VABD_V_I_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[VMOVL_I_I]]
+// CHECK-LABEL: define <8 x i16> @test_vabdl_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD_V_I_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: [[VMOVL_I:%.*]] = zext <8 x i8> [[VABD_V_I_I]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[VMOVL_I]]
+//
int16x8_t test_vabdl_s8(int8x8_t a, int8x8_t b) {
return vabdl_s8(a, b);
}
-// CHECK-LABEL: @test_vabdl_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VABD_V2_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VABD_V3_I_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I]] to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> [[VABD_V2_I_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[VMOVL_I_I]]
+// CHECK-LABEL: define <4 x i32> @test_vabdl_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VABD_V2_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: [[VABD_V3_I_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I]] to <8 x i8>
+// CHECK-NEXT: [[VMOVL_I:%.*]] = zext <4 x i16> [[VABD_V2_I_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[VMOVL_I]]
+//
int32x4_t test_vabdl_s16(int16x4_t a, int16x4_t b) {
return vabdl_s16(a, b);
}
-// CHECK-LABEL: @test_vabdl_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VABD_V2_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VABD_V3_I_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I]] to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> [[VABD_V2_I_I]] to <2 x i64>
-// CHECK: ret <2 x i64> [[VMOVL_I_I]]
+// CHECK-LABEL: define <2 x i64> @test_vabdl_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VABD_V2_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: [[VABD_V3_I_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I]] to <8 x i8>
+// CHECK-NEXT: [[VMOVL_I:%.*]] = zext <2 x i32> [[VABD_V2_I_I]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[VMOVL_I]]
+//
int64x2_t test_vabdl_s32(int32x2_t a, int32x2_t b) {
return vabdl_s32(a, b);
}
-// CHECK-LABEL: @test_vabdl_u8(
-// CHECK: [[VABD_V_I_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: [[VMOVL_I_I:%.*]] = zext <8 x i8> [[VABD_V_I_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[VMOVL_I_I]]
+// CHECK-LABEL: define <8 x i16> @test_vabdl_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD_V_I_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: [[VMOVL_I:%.*]] = zext <8 x i8> [[VABD_V_I_I]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[VMOVL_I]]
+//
uint16x8_t test_vabdl_u8(uint8x8_t a, uint8x8_t b) {
return vabdl_u8(a, b);
}
-// CHECK-LABEL: @test_vabdl_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VABD_V2_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VABD_V3_I_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I]] to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> [[VABD_V2_I_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[VMOVL_I_I]]
+// CHECK-LABEL: define <4 x i32> @test_vabdl_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VABD_V2_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: [[VABD_V3_I_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I]] to <8 x i8>
+// CHECK-NEXT: [[VMOVL_I:%.*]] = zext <4 x i16> [[VABD_V2_I_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[VMOVL_I]]
+//
uint32x4_t test_vabdl_u16(uint16x4_t a, uint16x4_t b) {
return vabdl_u16(a, b);
}
-// CHECK-LABEL: @test_vabdl_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VABD_V2_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VABD_V3_I_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I]] to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> [[VABD_V2_I_I]] to <2 x i64>
-// CHECK: ret <2 x i64> [[VMOVL_I_I]]
+// CHECK-LABEL: define <2 x i64> @test_vabdl_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VABD_V2_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: [[VABD_V3_I_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I]] to <8 x i8>
+// CHECK-NEXT: [[VMOVL_I:%.*]] = zext <2 x i32> [[VABD_V2_I_I]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[VMOVL_I]]
+//
uint64x2_t test_vabdl_u32(uint32x2_t a, uint32x2_t b) {
return vabdl_u32(a, b);
}
-// CHECK-LABEL: @test_vabs_s8(
-// CHECK: [[VABS_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabs.v8i8(<8 x i8> %a)
-// CHECK: ret <8 x i8> [[VABS_I]]
+// CHECK-LABEL: define <8 x i8> @test_vabs_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABS_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabs.v8i8(<8 x i8> [[A]])
+// CHECK-NEXT: ret <8 x i8> [[VABS_I]]
+//
int8x8_t test_vabs_s8(int8x8_t a) {
return vabs_s8(a);
}
-// CHECK-LABEL: @test_vabs_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VABS1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabs.v4i16(<4 x i16> %a)
-// CHECK: ret <4 x i16> [[VABS1_I]]
+// CHECK-LABEL: define <4 x i16> @test_vabs_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VABS1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabs.v4i16(<4 x i16> [[A]])
+// CHECK-NEXT: ret <4 x i16> [[VABS1_I]]
+//
int16x4_t test_vabs_s16(int16x4_t a) {
return vabs_s16(a);
}
-// CHECK-LABEL: @test_vabs_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VABS1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabs.v2i32(<2 x i32> %a)
-// CHECK: ret <2 x i32> [[VABS1_I]]
+// CHECK-LABEL: define <2 x i32> @test_vabs_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VABS1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabs.v2i32(<2 x i32> [[A]])
+// CHECK-NEXT: ret <2 x i32> [[VABS1_I]]
+//
int32x2_t test_vabs_s32(int32x2_t a) {
return vabs_s32(a);
}
-// CHECK-LABEL: @test_vabs_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[VABS1_I:%.*]] = call <2 x float> @llvm.fabs.v2f32(<2 x float> %a)
-// CHECK: ret <2 x float> [[VABS1_I]]
+// CHECK-LABEL: define <2 x float> @test_vabs_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VABS1_I:%.*]] = call <2 x float> @llvm.fabs.v2f32(<2 x float> [[A]])
+// CHECK-NEXT: ret <2 x float> [[VABS1_I]]
+//
float32x2_t test_vabs_f32(float32x2_t a) {
return vabs_f32(a);
}
-// CHECK-LABEL: @test_vabsq_s8(
-// CHECK: [[VABS_I:%.*]] = call <16 x i8> @llvm.arm.neon.vabs.v16i8(<16 x i8> %a)
-// CHECK: ret <16 x i8> [[VABS_I]]
+// CHECK-LABEL: define <16 x i8> @test_vabsq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABS_I:%.*]] = call <16 x i8> @llvm.arm.neon.vabs.v16i8(<16 x i8> [[A]])
+// CHECK-NEXT: ret <16 x i8> [[VABS_I]]
+//
int8x16_t test_vabsq_s8(int8x16_t a) {
return vabsq_s8(a);
}
-// CHECK-LABEL: @test_vabsq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VABS1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vabs.v8i16(<8 x i16> %a)
-// CHECK: ret <8 x i16> [[VABS1_I]]
+// CHECK-LABEL: define <8 x i16> @test_vabsq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VABS1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vabs.v8i16(<8 x i16> [[A]])
+// CHECK-NEXT: ret <8 x i16> [[VABS1_I]]
+//
int16x8_t test_vabsq_s16(int16x8_t a) {
return vabsq_s16(a);
}
-// CHECK-LABEL: @test_vabsq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VABS1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vabs.v4i32(<4 x i32> %a)
-// CHECK: ret <4 x i32> [[VABS1_I]]
+// CHECK-LABEL: define <4 x i32> @test_vabsq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VABS1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vabs.v4i32(<4 x i32> [[A]])
+// CHECK-NEXT: ret <4 x i32> [[VABS1_I]]
+//
int32x4_t test_vabsq_s32(int32x4_t a) {
return vabsq_s32(a);
}
-// CHECK-LABEL: @test_vabsq_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[VABS1_I:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> %a)
-// CHECK: ret <4 x float> [[VABS1_I]]
+// CHECK-LABEL: define <4 x float> @test_vabsq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VABS1_I:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[A]])
+// CHECK-NEXT: ret <4 x float> [[VABS1_I]]
+//
float32x4_t test_vabsq_f32(float32x4_t a) {
return vabsq_f32(a);
}
-// CHECK-LABEL: @test_vadd_s8(
-// CHECK: [[ADD_I:%.*]] = add <8 x i8> %a, %b
-// CHECK: ret <8 x i8> [[ADD_I]]
+// CHECK-LABEL: define <8 x i8> @test_vadd_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i8> [[A]], [[B]]
+// CHECK-NEXT: ret <8 x i8> [[ADD_I]]
+//
int8x8_t test_vadd_s8(int8x8_t a, int8x8_t b) {
return vadd_s8(a, b);
}
-// CHECK-LABEL: @test_vadd_s16(
-// CHECK: [[ADD_I:%.*]] = add <4 x i16> %a, %b
-// CHECK: ret <4 x i16> [[ADD_I]]
+// CHECK-LABEL: define <4 x i16> @test_vadd_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i16> [[A]], [[B]]
+// CHECK-NEXT: ret <4 x i16> [[ADD_I]]
+//
int16x4_t test_vadd_s16(int16x4_t a, int16x4_t b) {
return vadd_s16(a, b);
}
-// CHECK-LABEL: @test_vadd_s32(
-// CHECK: [[ADD_I:%.*]] = add <2 x i32> %a, %b
-// CHECK: ret <2 x i32> [[ADD_I]]
+// CHECK-LABEL: define <2 x i32> @test_vadd_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i32> [[A]], [[B]]
+// CHECK-NEXT: ret <2 x i32> [[ADD_I]]
+//
int32x2_t test_vadd_s32(int32x2_t a, int32x2_t b) {
return vadd_s32(a, b);
}
-// CHECK-LABEL: @test_vadd_s64(
-// CHECK: [[ADD_I:%.*]] = add <1 x i64> %a, %b
-// CHECK: ret <1 x i64> [[ADD_I]]
+// CHECK-LABEL: define <1 x i64> @test_vadd_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <1 x i64> [[A]], [[B]]
+// CHECK-NEXT: ret <1 x i64> [[ADD_I]]
+//
int64x1_t test_vadd_s64(int64x1_t a, int64x1_t b) {
return vadd_s64(a, b);
}
-// CHECK-LABEL: @test_vadd_f32(
-// CHECK: [[ADD_I:%.*]] = fadd <2 x float> %a, %b
-// CHECK: ret <2 x float> [[ADD_I]]
+// CHECK-LABEL: define <2 x float> @test_vadd_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = fadd <2 x float> [[A]], [[B]]
+// CHECK-NEXT: ret <2 x float> [[ADD_I]]
+//
float32x2_t test_vadd_f32(float32x2_t a, float32x2_t b) {
return vadd_f32(a, b);
}
-// CHECK-LABEL: @test_vadd_u8(
-// CHECK: [[ADD_I:%.*]] = add <8 x i8> %a, %b
-// CHECK: ret <8 x i8> [[ADD_I]]
+// CHECK-LABEL: define <8 x i8> @test_vadd_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i8> [[A]], [[B]]
+// CHECK-NEXT: ret <8 x i8> [[ADD_I]]
+//
uint8x8_t test_vadd_u8(uint8x8_t a, uint8x8_t b) {
return vadd_u8(a, b);
}
-// CHECK-LABEL: @test_vadd_u16(
-// CHECK: [[ADD_I:%.*]] = add <4 x i16> %a, %b
-// CHECK: ret <4 x i16> [[ADD_I]]
+// CHECK-LABEL: define <4 x i16> @test_vadd_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i16> [[A]], [[B]]
+// CHECK-NEXT: ret <4 x i16> [[ADD_I]]
+//
uint16x4_t test_vadd_u16(uint16x4_t a, uint16x4_t b) {
return vadd_u16(a, b);
}
-// CHECK-LABEL: @test_vadd_u32(
-// CHECK: [[ADD_I:%.*]] = add <2 x i32> %a, %b
-// CHECK: ret <2 x i32> [[ADD_I]]
+// CHECK-LABEL: define <2 x i32> @test_vadd_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i32> [[A]], [[B]]
+// CHECK-NEXT: ret <2 x i32> [[ADD_I]]
+//
uint32x2_t test_vadd_u32(uint32x2_t a, uint32x2_t b) {
return vadd_u32(a, b);
}
-// CHECK-LABEL: @test_vadd_u64(
-// CHECK: [[ADD_I:%.*]] = add <1 x i64> %a, %b
-// CHECK: ret <1 x i64> [[ADD_I]]
+// CHECK-LABEL: define <1 x i64> @test_vadd_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <1 x i64> [[A]], [[B]]
+// CHECK-NEXT: ret <1 x i64> [[ADD_I]]
+//
uint64x1_t test_vadd_u64(uint64x1_t a, uint64x1_t b) {
return vadd_u64(a, b);
}
-// CHECK-LABEL: @test_vaddq_s8(
-// CHECK: [[ADD_I:%.*]] = add <16 x i8> %a, %b
-// CHECK: ret <16 x i8> [[ADD_I]]
+// CHECK-LABEL: define <16 x i8> @test_vaddq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <16 x i8> [[A]], [[B]]
+// CHECK-NEXT: ret <16 x i8> [[ADD_I]]
+//
int8x16_t test_vaddq_s8(int8x16_t a, int8x16_t b) {
return vaddq_s8(a, b);
}
-// CHECK-LABEL: @test_vaddq_s16(
-// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, %b
-// CHECK: ret <8 x i16> [[ADD_I]]
+// CHECK-LABEL: define <8 x i16> @test_vaddq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[A]], [[B]]
+// CHECK-NEXT: ret <8 x i16> [[ADD_I]]
+//
int16x8_t test_vaddq_s16(int16x8_t a, int16x8_t b) {
return vaddq_s16(a, b);
}
-// CHECK-LABEL: @test_vaddq_s32(
-// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, %b
-// CHECK: ret <4 x i32> [[ADD_I]]
+// CHECK-LABEL: define <4 x i32> @test_vaddq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A]], [[B]]
+// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
+//
int32x4_t test_vaddq_s32(int32x4_t a, int32x4_t b) {
return vaddq_s32(a, b);
}
-// CHECK-LABEL: @test_vaddq_s64(
-// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, %b
-// CHECK: ret <2 x i64> [[ADD_I]]
+// CHECK-LABEL: define <2 x i64> @test_vaddq_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A]], [[B]]
+// CHECK-NEXT: ret <2 x i64> [[ADD_I]]
+//
int64x2_t test_vaddq_s64(int64x2_t a, int64x2_t b) {
return vaddq_s64(a, b);
}
-// CHECK-LABEL: @test_vaddq_f32(
-// CHECK: [[ADD_I:%.*]] = fadd <4 x float> %a, %b
-// CHECK: ret <4 x float> [[ADD_I]]
+// CHECK-LABEL: define <4 x float> @test_vaddq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = fadd <4 x float> [[A]], [[B]]
+// CHECK-NEXT: ret <4 x float> [[ADD_I]]
+//
float32x4_t test_vaddq_f32(float32x4_t a, float32x4_t b) {
return vaddq_f32(a, b);
}
-// CHECK-LABEL: @test_vaddq_u8(
-// CHECK: [[ADD_I:%.*]] = add <16 x i8> %a, %b
-// CHECK: ret <16 x i8> [[ADD_I]]
+// CHECK-LABEL: define <16 x i8> @test_vaddq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <16 x i8> [[A]], [[B]]
+// CHECK-NEXT: ret <16 x i8> [[ADD_I]]
+//
uint8x16_t test_vaddq_u8(uint8x16_t a, uint8x16_t b) {
return vaddq_u8(a, b);
}
-// CHECK-LABEL: @test_vaddq_u16(
-// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, %b
-// CHECK: ret <8 x i16> [[ADD_I]]
+// CHECK-LABEL: define <8 x i16> @test_vaddq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[A]], [[B]]
+// CHECK-NEXT: ret <8 x i16> [[ADD_I]]
+//
uint16x8_t test_vaddq_u16(uint16x8_t a, uint16x8_t b) {
return vaddq_u16(a, b);
}
-// CHECK-LABEL: @test_vaddq_u32(
-// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, %b
-// CHECK: ret <4 x i32> [[ADD_I]]
+// CHECK-LABEL: define <4 x i32> @test_vaddq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A]], [[B]]
+// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
+//
uint32x4_t test_vaddq_u32(uint32x4_t a, uint32x4_t b) {
return vaddq_u32(a, b);
}
-// CHECK-LABEL: @test_vaddq_u64(
-// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, %b
-// CHECK: ret <2 x i64> [[ADD_I]]
+// CHECK-LABEL: define <2 x i64> @test_vaddq_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A]], [[B]]
+// CHECK-NEXT: ret <2 x i64> [[ADD_I]]
+//
uint64x2_t test_vaddq_u64(uint64x2_t a, uint64x2_t b) {
return vaddq_u64(a, b);
}
-// CHECK-LABEL: @test_vaddhn_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VADDHN_I:%.*]] = add <8 x i16> %a, %b
-// CHECK: [[VADDHN1_I:%.*]] = lshr <8 x i16> [[VADDHN_I]], splat (i16 8)
-// CHECK: [[VADDHN2_I:%.*]] = trunc <8 x i16> [[VADDHN1_I]] to <8 x i8>
-// CHECK: ret <8 x i8> [[VADDHN2_I]]
+// CHECK-LABEL: define <8 x i8> @test_vaddhn_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VADDHN_I:%.*]] = add <8 x i16> [[A]], [[B]]
+// CHECK-NEXT: [[VADDHN1_I:%.*]] = lshr <8 x i16> [[VADDHN_I]], splat (i16 8)
+// CHECK-NEXT: [[VADDHN2_I:%.*]] = trunc <8 x i16> [[VADDHN1_I]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[VADDHN2_I]]
+//
int8x8_t test_vaddhn_s16(int16x8_t a, int16x8_t b) {
return vaddhn_s16(a, b);
}
-// CHECK-LABEL: @test_vaddhn_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VADDHN_I:%.*]] = add <4 x i32> %a, %b
-// CHECK: [[VADDHN1_I:%.*]] = lshr <4 x i32> [[VADDHN_I]], splat (i32 16)
-// CHECK: [[VADDHN2_I:%.*]] = trunc <4 x i32> [[VADDHN1_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[VADDHN2_I]]
+// CHECK-LABEL: define <4 x i16> @test_vaddhn_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VADDHN_I:%.*]] = add <4 x i32> [[A]], [[B]]
+// CHECK-NEXT: [[VADDHN1_I:%.*]] = lshr <4 x i32> [[VADDHN_I]], splat (i32 16)
+// CHECK-NEXT: [[VADDHN2_I:%.*]] = trunc <4 x i32> [[VADDHN1_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[VADDHN2_I]]
+//
int16x4_t test_vaddhn_s32(int32x4_t a, int32x4_t b) {
return vaddhn_s32(a, b);
}
-// CHECK-LABEL: @test_vaddhn_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VADDHN_I:%.*]] = add <2 x i64> %a, %b
-// CHECK: [[VADDHN1_I:%.*]] = lshr <2 x i64> [[VADDHN_I]], splat (i64 32)
-// CHECK: [[VADDHN2_I:%.*]] = trunc <2 x i64> [[VADDHN1_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[VADDHN2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vaddhn_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VADDHN_I:%.*]] = add <2 x i64> [[A]], [[B]]
+// CHECK-NEXT: [[VADDHN1_I:%.*]] = lshr <2 x i64> [[VADDHN_I]], splat (i64 32)
+// CHECK-NEXT: [[VADDHN2_I:%.*]] = trunc <2 x i64> [[VADDHN1_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[VADDHN2_I]]
+//
int32x2_t test_vaddhn_s64(int64x2_t a, int64x2_t b) {
return vaddhn_s64(a, b);
}
-// CHECK-LABEL: @test_vaddhn_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VADDHN_I:%.*]] = add <8 x i16> %a, %b
-// CHECK: [[VADDHN1_I:%.*]] = lshr <8 x i16> [[VADDHN_I]], splat (i16 8)
-// CHECK: [[VADDHN2_I:%.*]] = trunc <8 x i16> [[VADDHN1_I]] to <8 x i8>
-// CHECK: ret <8 x i8> [[VADDHN2_I]]
+// CHECK-LABEL: define <8 x i8> @test_vaddhn_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VADDHN_I:%.*]] = add <8 x i16> [[A]], [[B]]
+// CHECK-NEXT: [[VADDHN1_I:%.*]] = lshr <8 x i16> [[VADDHN_I]], splat (i16 8)
+// CHECK-NEXT: [[VADDHN2_I:%.*]] = trunc <8 x i16> [[VADDHN1_I]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[VADDHN2_I]]
+//
uint8x8_t test_vaddhn_u16(uint16x8_t a, uint16x8_t b) {
return vaddhn_u16(a, b);
}
-// CHECK-LABEL: @test_vaddhn_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VADDHN_I:%.*]] = add <4 x i32> %a, %b
-// CHECK: [[VADDHN1_I:%.*]] = lshr <4 x i32> [[VADDHN_I]], splat (i32 16)
-// CHECK: [[VADDHN2_I:%.*]] = trunc <4 x i32> [[VADDHN1_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[VADDHN2_I]]
+// CHECK-LABEL: define <4 x i16> @test_vaddhn_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VADDHN_I:%.*]] = add <4 x i32> [[A]], [[B]]
+// CHECK-NEXT: [[VADDHN1_I:%.*]] = lshr <4 x i32> [[VADDHN_I]], splat (i32 16)
+// CHECK-NEXT: [[VADDHN2_I:%.*]] = trunc <4 x i32> [[VADDHN1_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[VADDHN2_I]]
+//
uint16x4_t test_vaddhn_u32(uint32x4_t a, uint32x4_t b) {
return vaddhn_u32(a, b);
}
-// CHECK-LABEL: @test_vaddhn_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VADDHN_I:%.*]] = add <2 x i64> %a, %b
-// CHECK: [[VADDHN1_I:%.*]] = lshr <2 x i64> [[VADDHN_I]], splat (i64 32)
-// CHECK: [[VADDHN2_I:%.*]] = trunc <2 x i64> [[VADDHN1_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[VADDHN2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vaddhn_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VADDHN_I:%.*]] = add <2 x i64> [[A]], [[B]]
+// CHECK-NEXT: [[VADDHN1_I:%.*]] = lshr <2 x i64> [[VADDHN_I]], splat (i64 32)
+// CHECK-NEXT: [[VADDHN2_I:%.*]] = trunc <2 x i64> [[VADDHN1_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[VADDHN2_I]]
+//
uint32x2_t test_vaddhn_u64(uint64x2_t a, uint64x2_t b) {
return vaddhn_u64(a, b);
}
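// Reviewer note: the vaddhn checks above all share one shape -- a plain
// `add`, an `lshr` by half the element width (hence the splat (i16 8) /
// (i32 16) / (i64 32) constants), then a `trunc`. A minimal sketch of those
// semantics; the helper name is invented, nothing beyond <arm_neon.h> is
// assumed:
#include <arm_neon.h>
// Lane i of the result is the high half of a[i] + b[i].
int8x8_t addhn_sketch(int16x8_t a, int16x8_t b) {
  return vaddhn_s16(a, b);  // == (int8_t)((a[i] + b[i]) >> 8) per lane
}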
-// CHECK-LABEL: @test_vaddl_s8(
-// CHECK: [[VMOVL_I_I:%.*]] = sext <8 x i8> %a to <8 x i16>
-// CHECK: [[VMOVL_I4_I:%.*]] = sext <8 x i8> %b to <8 x i16>
-// CHECK: [[ADD_I:%.*]] = add <8 x i16> [[VMOVL_I_I]], [[VMOVL_I4_I]]
-// CHECK: ret <8 x i16> [[ADD_I]]
+// CHECK-LABEL: define <8 x i16> @test_vaddl_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I5:%.*]] = sext <8 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: [[VMOVL_I:%.*]] = sext <8 x i8> [[B]] to <8 x i16>
+// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[VMOVL_I5]], [[VMOVL_I]]
+// CHECK-NEXT: ret <8 x i16> [[ADD_I]]
+//
int16x8_t test_vaddl_s8(int8x8_t a, int8x8_t b) {
return vaddl_s8(a, b);
}
-// CHECK-LABEL: @test_vaddl_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = sext <4 x i16> %a to <4 x i32>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VMOVL_I4_I:%.*]] = sext <4 x i16> %b to <4 x i32>
-// CHECK: [[ADD_I:%.*]] = add <4 x i32> [[VMOVL_I_I]], [[VMOVL_I4_I]]
-// CHECK: ret <4 x i32> [[ADD_I]]
+// CHECK-LABEL: define <4 x i32> @test_vaddl_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VMOVL_I5:%.*]] = sext <4 x i16> [[A]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VMOVL_I:%.*]] = sext <4 x i16> [[B]] to <4 x i32>
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[VMOVL_I5]], [[VMOVL_I]]
+// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
+//
int32x4_t test_vaddl_s16(int16x4_t a, int16x4_t b) {
return vaddl_s16(a, b);
}
-// CHECK-LABEL: @test_vaddl_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = sext <2 x i32> %a to <2 x i64>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VMOVL_I4_I:%.*]] = sext <2 x i32> %b to <2 x i64>
-// CHECK: [[ADD_I:%.*]] = add <2 x i64> [[VMOVL_I_I]], [[VMOVL_I4_I]]
-// CHECK: ret <2 x i64> [[ADD_I]]
+// CHECK-LABEL: define <2 x i64> @test_vaddl_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VMOVL_I5:%.*]] = sext <2 x i32> [[A]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VMOVL_I:%.*]] = sext <2 x i32> [[B]] to <2 x i64>
+// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[VMOVL_I5]], [[VMOVL_I]]
+// CHECK-NEXT: ret <2 x i64> [[ADD_I]]
+//
int64x2_t test_vaddl_s32(int32x2_t a, int32x2_t b) {
return vaddl_s32(a, b);
}
-// CHECK-LABEL: @test_vaddl_u8(
-// CHECK: [[VMOVL_I_I:%.*]] = zext <8 x i8> %a to <8 x i16>
-// CHECK: [[VMOVL_I4_I:%.*]] = zext <8 x i8> %b to <8 x i16>
-// CHECK: [[ADD_I:%.*]] = add <8 x i16> [[VMOVL_I_I]], [[VMOVL_I4_I]]
-// CHECK: ret <8 x i16> [[ADD_I]]
+// CHECK-LABEL: define <8 x i16> @test_vaddl_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I5:%.*]] = zext <8 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: [[VMOVL_I:%.*]] = zext <8 x i8> [[B]] to <8 x i16>
+// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[VMOVL_I5]], [[VMOVL_I]]
+// CHECK-NEXT: ret <8 x i16> [[ADD_I]]
+//
uint16x8_t test_vaddl_u8(uint8x8_t a, uint8x8_t b) {
return vaddl_u8(a, b);
}
-// CHECK-LABEL: @test_vaddl_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> %a to <4 x i32>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VMOVL_I4_I:%.*]] = zext <4 x i16> %b to <4 x i32>
-// CHECK: [[ADD_I:%.*]] = add <4 x i32> [[VMOVL_I_I]], [[VMOVL_I4_I]]
-// CHECK: ret <4 x i32> [[ADD_I]]
+// CHECK-LABEL: define <4 x i32> @test_vaddl_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VMOVL_I5:%.*]] = zext <4 x i16> [[A]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VMOVL_I:%.*]] = zext <4 x i16> [[B]] to <4 x i32>
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[VMOVL_I5]], [[VMOVL_I]]
+// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
+//
uint32x4_t test_vaddl_u16(uint16x4_t a, uint16x4_t b) {
return vaddl_u16(a, b);
}
-// CHECK-LABEL: @test_vaddl_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> %a to <2 x i64>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VMOVL_I4_I:%.*]] = zext <2 x i32> %b to <2 x i64>
-// CHECK: [[ADD_I:%.*]] = add <2 x i64> [[VMOVL_I_I]], [[VMOVL_I4_I]]
-// CHECK: ret <2 x i64> [[ADD_I]]
+// CHECK-LABEL: define <2 x i64> @test_vaddl_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VMOVL_I5:%.*]] = zext <2 x i32> [[A]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VMOVL_I:%.*]] = zext <2 x i32> [[B]] to <2 x i64>
+// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[VMOVL_I5]], [[VMOVL_I]]
+// CHECK-NEXT: ret <2 x i64> [[ADD_I]]
+//
uint64x2_t test_vaddl_u32(uint32x2_t a, uint32x2_t b) {
return vaddl_u32(a, b);
}
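// Reviewer note: vaddl_* widens both operands (sext for _s*, zext for _u*)
// before a full-width add, which is exactly the sext/zext + add pairs the
// regenerated checks show. Sketch of the equivalence; the helper name is
// invented:
#include <arm_neon.h>
int16x8_t addl_sketch(int8x8_t a, int8x8_t b) {
  // Same result as vaddl_s8(a, b): widen each half, then add at 16 bits.
  return vaddq_s16(vmovl_s8(a), vmovl_s8(b));
}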
-// CHECK-LABEL: @test_vaddw_s8(
-// CHECK: [[VMOVL_I_I:%.*]] = sext <8 x i8> %b to <8 x i16>
-// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I]]
-// CHECK: ret <8 x i16> [[ADD_I]]
+// CHECK-LABEL: define <8 x i16> @test_vaddw_s8(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I:%.*]] = sext <8 x i8> [[B]] to <8 x i16>
+// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[A]], [[VMOVL_I]]
+// CHECK-NEXT: ret <8 x i16> [[ADD_I]]
+//
int16x8_t test_vaddw_s8(int16x8_t a, int8x8_t b) {
return vaddw_s8(a, b);
}
-// CHECK-LABEL: @test_vaddw_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = sext <4 x i16> %b to <4 x i32>
-// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I]]
-// CHECK: ret <4 x i32> [[ADD_I]]
+// CHECK-LABEL: define <4 x i32> @test_vaddw_s16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VMOVL_I:%.*]] = sext <4 x i16> [[B]] to <4 x i32>
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A]], [[VMOVL_I]]
+// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
+//
int32x4_t test_vaddw_s16(int32x4_t a, int16x4_t b) {
return vaddw_s16(a, b);
}
-// CHECK-LABEL: @test_vaddw_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = sext <2 x i32> %b to <2 x i64>
-// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I]]
-// CHECK: ret <2 x i64> [[ADD_I]]
+// CHECK-LABEL: define <2 x i64> @test_vaddw_s32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VMOVL_I:%.*]] = sext <2 x i32> [[B]] to <2 x i64>
+// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A]], [[VMOVL_I]]
+// CHECK-NEXT: ret <2 x i64> [[ADD_I]]
+//
int64x2_t test_vaddw_s32(int64x2_t a, int32x2_t b) {
return vaddw_s32(a, b);
}
-// CHECK-LABEL: @test_vaddw_u8(
-// CHECK: [[VMOVL_I_I:%.*]] = zext <8 x i8> %b to <8 x i16>
-// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I]]
-// CHECK: ret <8 x i16> [[ADD_I]]
+// CHECK-LABEL: define <8 x i16> @test_vaddw_u8(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I:%.*]] = zext <8 x i8> [[B]] to <8 x i16>
+// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[A]], [[VMOVL_I]]
+// CHECK-NEXT: ret <8 x i16> [[ADD_I]]
+//
uint16x8_t test_vaddw_u8(uint16x8_t a, uint8x8_t b) {
return vaddw_u8(a, b);
}
-// CHECK-LABEL: @test_vaddw_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> %b to <4 x i32>
-// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I]]
-// CHECK: ret <4 x i32> [[ADD_I]]
+// CHECK-LABEL: define <4 x i32> @test_vaddw_u16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VMOVL_I:%.*]] = zext <4 x i16> [[B]] to <4 x i32>
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A]], [[VMOVL_I]]
+// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
+//
uint32x4_t test_vaddw_u16(uint32x4_t a, uint16x4_t b) {
return vaddw_u16(a, b);
}
-// CHECK-LABEL: @test_vaddw_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> %b to <2 x i64>
-// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I]]
-// CHECK: ret <2 x i64> [[ADD_I]]
+// CHECK-LABEL: define <2 x i64> @test_vaddw_u32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VMOVL_I:%.*]] = zext <2 x i32> [[B]] to <2 x i64>
+// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A]], [[VMOVL_I]]
+// CHECK-NEXT: ret <2 x i64> [[ADD_I]]
+//
uint64x2_t test_vaddw_u32(uint64x2_t a, uint32x2_t b) {
return vaddw_u32(a, b);
}
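// Reviewer note: vaddw_* differs from vaddl_* only in that the first operand
// is already wide, so the checks extend just [[B]]. Equivalent sketch; the
// helper name is invented:
#include <arm_neon.h>
int16x8_t addw_sketch(int16x8_t a, int8x8_t b) {
  return vaddq_s16(a, vmovl_s8(b));  // same lanes as vaddw_s8(a, b)
}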
-// CHECK-LABEL: @test_vand_s8(
-// CHECK: [[AND_I:%.*]] = and <8 x i8> %a, %b
-// CHECK: ret <8 x i8> [[AND_I]]
+// CHECK-LABEL: define <8 x i8> @test_vand_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[AND_I:%.*]] = and <8 x i8> [[A]], [[B]]
+// CHECK-NEXT: ret <8 x i8> [[AND_I]]
+//
int8x8_t test_vand_s8(int8x8_t a, int8x8_t b) {
return vand_s8(a, b);
}
-// CHECK-LABEL: @test_vand_s16(
-// CHECK: [[AND_I:%.*]] = and <4 x i16> %a, %b
-// CHECK: ret <4 x i16> [[AND_I]]
+// CHECK-LABEL: define <4 x i16> @test_vand_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[AND_I:%.*]] = and <4 x i16> [[A]], [[B]]
+// CHECK-NEXT: ret <4 x i16> [[AND_I]]
+//
int16x4_t test_vand_s16(int16x4_t a, int16x4_t b) {
return vand_s16(a, b);
}
-// CHECK-LABEL: @test_vand_s32(
-// CHECK: [[AND_I:%.*]] = and <2 x i32> %a, %b
-// CHECK: ret <2 x i32> [[AND_I]]
+// CHECK-LABEL: define <2 x i32> @test_vand_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[AND_I:%.*]] = and <2 x i32> [[A]], [[B]]
+// CHECK-NEXT: ret <2 x i32> [[AND_I]]
+//
int32x2_t test_vand_s32(int32x2_t a, int32x2_t b) {
return vand_s32(a, b);
}
-// CHECK-LABEL: @test_vand_s64(
-// CHECK: [[AND_I:%.*]] = and <1 x i64> %a, %b
-// CHECK: ret <1 x i64> [[AND_I]]
+// CHECK-LABEL: define <1 x i64> @test_vand_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[AND_I:%.*]] = and <1 x i64> [[A]], [[B]]
+// CHECK-NEXT: ret <1 x i64> [[AND_I]]
+//
int64x1_t test_vand_s64(int64x1_t a, int64x1_t b) {
return vand_s64(a, b);
}
-// CHECK-LABEL: @test_vand_u8(
-// CHECK: [[AND_I:%.*]] = and <8 x i8> %a, %b
-// CHECK: ret <8 x i8> [[AND_I]]
+// CHECK-LABEL: define <8 x i8> @test_vand_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[AND_I:%.*]] = and <8 x i8> [[A]], [[B]]
+// CHECK-NEXT: ret <8 x i8> [[AND_I]]
+//
uint8x8_t test_vand_u8(uint8x8_t a, uint8x8_t b) {
return vand_u8(a, b);
}
-// CHECK-LABEL: @test_vand_u16(
-// CHECK: [[AND_I:%.*]] = and <4 x i16> %a, %b
-// CHECK: ret <4 x i16> [[AND_I]]
+// CHECK-LABEL: define <4 x i16> @test_vand_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[AND_I:%.*]] = and <4 x i16> [[A]], [[B]]
+// CHECK-NEXT: ret <4 x i16> [[AND_I]]
+//
uint16x4_t test_vand_u16(uint16x4_t a, uint16x4_t b) {
return vand_u16(a, b);
}
-// CHECK-LABEL: @test_vand_u32(
-// CHECK: [[AND_I:%.*]] = and <2 x i32> %a, %b
-// CHECK: ret <2 x i32> [[AND_I]]
+// CHECK-LABEL: define <2 x i32> @test_vand_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[AND_I:%.*]] = and <2 x i32> [[A]], [[B]]
+// CHECK-NEXT: ret <2 x i32> [[AND_I]]
+//
uint32x2_t test_vand_u32(uint32x2_t a, uint32x2_t b) {
return vand_u32(a, b);
}
-// CHECK-LABEL: @test_vand_u64(
-// CHECK: [[AND_I:%.*]] = and <1 x i64> %a, %b
-// CHECK: ret <1 x i64> [[AND_I]]
+// CHECK-LABEL: define <1 x i64> @test_vand_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[AND_I:%.*]] = and <1 x i64> [[A]], [[B]]
+// CHECK-NEXT: ret <1 x i64> [[AND_I]]
+//
uint64x1_t test_vand_u64(uint64x1_t a, uint64x1_t b) {
return vand_u64(a, b);
}
-// CHECK-LABEL: @test_vandq_s8(
-// CHECK: [[AND_I:%.*]] = and <16 x i8> %a, %b
-// CHECK: ret <16 x i8> [[AND_I]]
+// CHECK-LABEL: define <16 x i8> @test_vandq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[AND_I:%.*]] = and <16 x i8> [[A]], [[B]]
+// CHECK-NEXT: ret <16 x i8> [[AND_I]]
+//
int8x16_t test_vandq_s8(int8x16_t a, int8x16_t b) {
return vandq_s8(a, b);
}
-// CHECK-LABEL: @test_vandq_s16(
-// CHECK: [[AND_I:%.*]] = and <8 x i16> %a, %b
-// CHECK: ret <8 x i16> [[AND_I]]
+// CHECK-LABEL: define <8 x i16> @test_vandq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[AND_I:%.*]] = and <8 x i16> [[A]], [[B]]
+// CHECK-NEXT: ret <8 x i16> [[AND_I]]
+//
int16x8_t test_vandq_s16(int16x8_t a, int16x8_t b) {
return vandq_s16(a, b);
}
-// CHECK-LABEL: @test_vandq_s32(
-// CHECK: [[AND_I:%.*]] = and <4 x i32> %a, %b
-// CHECK: ret <4 x i32> [[AND_I]]
+// CHECK-LABEL: define <4 x i32> @test_vandq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[AND_I:%.*]] = and <4 x i32> [[A]], [[B]]
+// CHECK-NEXT: ret <4 x i32> [[AND_I]]
+//
int32x4_t test_vandq_s32(int32x4_t a, int32x4_t b) {
return vandq_s32(a, b);
}
-// CHECK-LABEL: @test_vandq_s64(
-// CHECK: [[AND_I:%.*]] = and <2 x i64> %a, %b
-// CHECK: ret <2 x i64> [[AND_I]]
+// CHECK-LABEL: define <2 x i64> @test_vandq_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[AND_I:%.*]] = and <2 x i64> [[A]], [[B]]
+// CHECK-NEXT: ret <2 x i64> [[AND_I]]
+//
int64x2_t test_vandq_s64(int64x2_t a, int64x2_t b) {
return vandq_s64(a, b);
}
-// CHECK-LABEL: @test_vandq_u8(
-// CHECK: [[AND_I:%.*]] = and <16 x i8> %a, %b
-// CHECK: ret <16 x i8> [[AND_I]]
+// CHECK-LABEL: define <16 x i8> @test_vandq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[AND_I:%.*]] = and <16 x i8> [[A]], [[B]]
+// CHECK-NEXT: ret <16 x i8> [[AND_I]]
+//
uint8x16_t test_vandq_u8(uint8x16_t a, uint8x16_t b) {
return vandq_u8(a, b);
}
-// CHECK-LABEL: @test_vandq_u16(
-// CHECK: [[AND_I:%.*]] = and <8 x i16> %a, %b
-// CHECK: ret <8 x i16> [[AND_I]]
+// CHECK-LABEL: define <8 x i16> @test_vandq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[AND_I:%.*]] = and <8 x i16> [[A]], [[B]]
+// CHECK-NEXT: ret <8 x i16> [[AND_I]]
+//
uint16x8_t test_vandq_u16(uint16x8_t a, uint16x8_t b) {
return vandq_u16(a, b);
}
-// CHECK-LABEL: @test_vandq_u32(
-// CHECK: [[AND_I:%.*]] = and <4 x i32> %a, %b
-// CHECK: ret <4 x i32> [[AND_I]]
+// CHECK-LABEL: define <4 x i32> @test_vandq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[AND_I:%.*]] = and <4 x i32> [[A]], [[B]]
+// CHECK-NEXT: ret <4 x i32> [[AND_I]]
+//
uint32x4_t test_vandq_u32(uint32x4_t a, uint32x4_t b) {
return vandq_u32(a, b);
}
-// CHECK-LABEL: @test_vandq_u64(
-// CHECK: [[AND_I:%.*]] = and <2 x i64> %a, %b
-// CHECK: ret <2 x i64> [[AND_I]]
+// CHECK-LABEL: define <2 x i64> @test_vandq_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[AND_I:%.*]] = and <2 x i64> [[A]], [[B]]
+// CHECK-NEXT: ret <2 x i64> [[AND_I]]
+//
uint64x2_t test_vandq_u64(uint64x2_t a, uint64x2_t b) {
return vandq_u64(a, b);
}
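// Reviewer note: vand/vandq lower to a bare IR `and`, so each regenerated
// check body is one instruction plus the return. A typical use is lane
// masking; sketch below, helper name invented, only <arm_neon.h> assumed:
#include <arm_neon.h>
uint8x8_t low_nibbles(uint8x8_t v) {
  return vand_u8(v, vdup_n_u8(0x0f));  // keep bits 3:0 of every lane
}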
-// CHECK-LABEL: @test_vbic_s8(
-// CHECK: [[NEG_I:%.*]] = xor <8 x i8> %b, splat (i8 -1)
-// CHECK: [[AND_I:%.*]] = and <8 x i8> %a, [[NEG_I]]
-// CHECK: ret <8 x i8> [[AND_I]]
+// CHECK-LABEL: define <8 x i8> @test_vbic_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <8 x i8> [[B]], splat (i8 -1)
+// CHECK-NEXT: [[AND_I:%.*]] = and <8 x i8> [[A]], [[NOT_I]]
+// CHECK-NEXT: ret <8 x i8> [[AND_I]]
+//
int8x8_t test_vbic_s8(int8x8_t a, int8x8_t b) {
return vbic_s8(a, b);
}
-// CHECK-LABEL: @test_vbic_s16(
-// CHECK: [[NEG_I:%.*]] = xor <4 x i16> %b, splat (i16 -1)
-// CHECK: [[AND_I:%.*]] = and <4 x i16> %a, [[NEG_I]]
-// CHECK: ret <4 x i16> [[AND_I]]
+// CHECK-LABEL: define <4 x i16> @test_vbic_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <4 x i16> [[B]], splat (i16 -1)
+// CHECK-NEXT: [[AND_I:%.*]] = and <4 x i16> [[A]], [[NOT_I]]
+// CHECK-NEXT: ret <4 x i16> [[AND_I]]
+//
int16x4_t test_vbic_s16(int16x4_t a, int16x4_t b) {
return vbic_s16(a, b);
}
-// CHECK-LABEL: @test_vbic_s32(
-// CHECK: [[NEG_I:%.*]] = xor <2 x i32> %b, splat (i32 -1)
-// CHECK: [[AND_I:%.*]] = and <2 x i32> %a, [[NEG_I]]
-// CHECK: ret <2 x i32> [[AND_I]]
+// CHECK-LABEL: define <2 x i32> @test_vbic_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <2 x i32> [[B]], splat (i32 -1)
+// CHECK-NEXT: [[AND_I:%.*]] = and <2 x i32> [[A]], [[NOT_I]]
+// CHECK-NEXT: ret <2 x i32> [[AND_I]]
+//
int32x2_t test_vbic_s32(int32x2_t a, int32x2_t b) {
return vbic_s32(a, b);
}
-// CHECK-LABEL: @test_vbic_s64(
-// CHECK: [[NEG_I:%.*]] = xor <1 x i64> %b, splat (i64 -1)
-// CHECK: [[AND_I:%.*]] = and <1 x i64> %a, [[NEG_I]]
-// CHECK: ret <1 x i64> [[AND_I]]
+// CHECK-LABEL: define <1 x i64> @test_vbic_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <1 x i64> [[B]], splat (i64 -1)
+// CHECK-NEXT: [[AND_I:%.*]] = and <1 x i64> [[A]], [[NOT_I]]
+// CHECK-NEXT: ret <1 x i64> [[AND_I]]
+//
int64x1_t test_vbic_s64(int64x1_t a, int64x1_t b) {
return vbic_s64(a, b);
}
-// CHECK-LABEL: @test_vbic_u8(
-// CHECK: [[NEG_I:%.*]] = xor <8 x i8> %b, splat (i8 -1)
-// CHECK: [[AND_I:%.*]] = and <8 x i8> %a, [[NEG_I]]
-// CHECK: ret <8 x i8> [[AND_I]]
+// CHECK-LABEL: define <8 x i8> @test_vbic_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <8 x i8> [[B]], splat (i8 -1)
+// CHECK-NEXT: [[AND_I:%.*]] = and <8 x i8> [[A]], [[NOT_I]]
+// CHECK-NEXT: ret <8 x i8> [[AND_I]]
+//
uint8x8_t test_vbic_u8(uint8x8_t a, uint8x8_t b) {
return vbic_u8(a, b);
}
-// CHECK-LABEL: @test_vbic_u16(
-// CHECK: [[NEG_I:%.*]] = xor <4 x i16> %b, splat (i16 -1)
-// CHECK: [[AND_I:%.*]] = and <4 x i16> %a, [[NEG_I]]
-// CHECK: ret <4 x i16> [[AND_I]]
+// CHECK-LABEL: define <4 x i16> @test_vbic_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <4 x i16> [[B]], splat (i16 -1)
+// CHECK-NEXT: [[AND_I:%.*]] = and <4 x i16> [[A]], [[NOT_I]]
+// CHECK-NEXT: ret <4 x i16> [[AND_I]]
+//
uint16x4_t test_vbic_u16(uint16x4_t a, uint16x4_t b) {
return vbic_u16(a, b);
}
-// CHECK-LABEL: @test_vbic_u32(
-// CHECK: [[NEG_I:%.*]] = xor <2 x i32> %b, splat (i32 -1)
-// CHECK: [[AND_I:%.*]] = and <2 x i32> %a, [[NEG_I]]
-// CHECK: ret <2 x i32> [[AND_I]]
+// CHECK-LABEL: define <2 x i32> @test_vbic_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <2 x i32> [[B]], splat (i32 -1)
+// CHECK-NEXT: [[AND_I:%.*]] = and <2 x i32> [[A]], [[NOT_I]]
+// CHECK-NEXT: ret <2 x i32> [[AND_I]]
+//
uint32x2_t test_vbic_u32(uint32x2_t a, uint32x2_t b) {
return vbic_u32(a, b);
}
-// CHECK-LABEL: @test_vbic_u64(
-// CHECK: [[NEG_I:%.*]] = xor <1 x i64> %b, splat (i64 -1)
-// CHECK: [[AND_I:%.*]] = and <1 x i64> %a, [[NEG_I]]
-// CHECK: ret <1 x i64> [[AND_I]]
+// CHECK-LABEL: define <1 x i64> @test_vbic_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <1 x i64> [[B]], splat (i64 -1)
+// CHECK-NEXT: [[AND_I:%.*]] = and <1 x i64> [[A]], [[NOT_I]]
+// CHECK-NEXT: ret <1 x i64> [[AND_I]]
+//
uint64x1_t test_vbic_u64(uint64x1_t a, uint64x1_t b) {
return vbic_u64(a, b);
}
-// CHECK-LABEL: @test_vbicq_s8(
-// CHECK: [[NEG_I:%.*]] = xor <16 x i8> %b, splat (i8 -1)
-// CHECK: [[AND_I:%.*]] = and <16 x i8> %a, [[NEG_I]]
-// CHECK: ret <16 x i8> [[AND_I]]
+// CHECK-LABEL: define <16 x i8> @test_vbicq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <16 x i8> [[B]], splat (i8 -1)
+// CHECK-NEXT: [[AND_I:%.*]] = and <16 x i8> [[A]], [[NOT_I]]
+// CHECK-NEXT: ret <16 x i8> [[AND_I]]
+//
int8x16_t test_vbicq_s8(int8x16_t a, int8x16_t b) {
return vbicq_s8(a, b);
}
-// CHECK-LABEL: @test_vbicq_s16(
-// CHECK: [[NEG_I:%.*]] = xor <8 x i16> %b, splat (i16 -1)
-// CHECK: [[AND_I:%.*]] = and <8 x i16> %a, [[NEG_I]]
-// CHECK: ret <8 x i16> [[AND_I]]
+// CHECK-LABEL: define <8 x i16> @test_vbicq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <8 x i16> [[B]], splat (i16 -1)
+// CHECK-NEXT: [[AND_I:%.*]] = and <8 x i16> [[A]], [[NOT_I]]
+// CHECK-NEXT: ret <8 x i16> [[AND_I]]
+//
int16x8_t test_vbicq_s16(int16x8_t a, int16x8_t b) {
return vbicq_s16(a, b);
}
-// CHECK-LABEL: @test_vbicq_s32(
-// CHECK: [[NEG_I:%.*]] = xor <4 x i32> %b, splat (i32 -1)
-// CHECK: [[AND_I:%.*]] = and <4 x i32> %a, [[NEG_I]]
-// CHECK: ret <4 x i32> [[AND_I]]
+// CHECK-LABEL: define <4 x i32> @test_vbicq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <4 x i32> [[B]], splat (i32 -1)
+// CHECK-NEXT: [[AND_I:%.*]] = and <4 x i32> [[A]], [[NOT_I]]
+// CHECK-NEXT: ret <4 x i32> [[AND_I]]
+//
int32x4_t test_vbicq_s32(int32x4_t a, int32x4_t b) {
return vbicq_s32(a, b);
}
-// CHECK-LABEL: @test_vbicq_s64(
-// CHECK: [[NEG_I:%.*]] = xor <2 x i64> %b, splat (i64 -1)
-// CHECK: [[AND_I:%.*]] = and <2 x i64> %a, [[NEG_I]]
-// CHECK: ret <2 x i64> [[AND_I]]
+// CHECK-LABEL: define <2 x i64> @test_vbicq_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <2 x i64> [[B]], splat (i64 -1)
+// CHECK-NEXT: [[AND_I:%.*]] = and <2 x i64> [[A]], [[NOT_I]]
+// CHECK-NEXT: ret <2 x i64> [[AND_I]]
+//
int64x2_t test_vbicq_s64(int64x2_t a, int64x2_t b) {
return vbicq_s64(a, b);
}
-// CHECK-LABEL: @test_vbicq_u8(
-// CHECK: [[NEG_I:%.*]] = xor <16 x i8> %b, splat (i8 -1)
-// CHECK: [[AND_I:%.*]] = and <16 x i8> %a, [[NEG_I]]
-// CHECK: ret <16 x i8> [[AND_I]]
+// CHECK-LABEL: define <16 x i8> @test_vbicq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <16 x i8> [[B]], splat (i8 -1)
+// CHECK-NEXT: [[AND_I:%.*]] = and <16 x i8> [[A]], [[NOT_I]]
+// CHECK-NEXT: ret <16 x i8> [[AND_I]]
+//
uint8x16_t test_vbicq_u8(uint8x16_t a, uint8x16_t b) {
return vbicq_u8(a, b);
}
-// CHECK-LABEL: @test_vbicq_u16(
-// CHECK: [[NEG_I:%.*]] = xor <8 x i16> %b, splat (i16 -1)
-// CHECK: [[AND_I:%.*]] = and <8 x i16> %a, [[NEG_I]]
-// CHECK: ret <8 x i16> [[AND_I]]
+// CHECK-LABEL: define <8 x i16> @test_vbicq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <8 x i16> [[B]], splat (i16 -1)
+// CHECK-NEXT: [[AND_I:%.*]] = and <8 x i16> [[A]], [[NOT_I]]
+// CHECK-NEXT: ret <8 x i16> [[AND_I]]
+//
uint16x8_t test_vbicq_u16(uint16x8_t a, uint16x8_t b) {
return vbicq_u16(a, b);
}
-// CHECK-LABEL: @test_vbicq_u32(
-// CHECK: [[NEG_I:%.*]] = xor <4 x i32> %b, splat (i32 -1)
-// CHECK: [[AND_I:%.*]] = and <4 x i32> %a, [[NEG_I]]
-// CHECK: ret <4 x i32> [[AND_I]]
+// CHECK-LABEL: define <4 x i32> @test_vbicq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <4 x i32> [[B]], splat (i32 -1)
+// CHECK-NEXT: [[AND_I:%.*]] = and <4 x i32> [[A]], [[NOT_I]]
+// CHECK-NEXT: ret <4 x i32> [[AND_I]]
+//
uint32x4_t test_vbicq_u32(uint32x4_t a, uint32x4_t b) {
return vbicq_u32(a, b);
}
-// CHECK-LABEL: @test_vbicq_u64(
-// CHECK: [[NEG_I:%.*]] = xor <2 x i64> %b, splat (i64 -1)
-// CHECK: [[AND_I:%.*]] = and <2 x i64> %a, [[NEG_I]]
-// CHECK: ret <2 x i64> [[AND_I]]
+// CHECK-LABEL: define <2 x i64> @test_vbicq_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <2 x i64> [[B]], splat (i64 -1)
+// CHECK-NEXT: [[AND_I:%.*]] = and <2 x i64> [[A]], [[NOT_I]]
+// CHECK-NEXT: ret <2 x i64> [[AND_I]]
+//
uint64x2_t test_vbicq_u64(uint64x2_t a, uint64x2_t b) {
return vbicq_u64(a, b);
}
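// Reviewer note: vbic computes a & ~b ("bit clear"), which the checks spell
// as xor-with-splat(-1) followed by and -- note the regenerated lines name
// the inverted value [[NOT_I]] where the old checks had [[NEG_I]].
// Equivalent sketch; the helper name is invented:
#include <arm_neon.h>
int8x8_t bic_sketch(int8x8_t a, int8x8_t b) {
  return vand_s8(a, vmvn_s8(b));  // same lanes as vbic_s8(a, b)
}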
-// CHECK-LABEL: @test_vbsl_s8(
-// CHECK: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c)
-// CHECK: ret <8 x i8> [[VBSL_V_I]]
+// CHECK-LABEL: define <8 x i8> @test_vbsl_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[A]], <8 x i8> [[B]], <8 x i8> [[C]])
+// CHECK-NEXT: ret <8 x i8> [[VBSL_V_I]]
+//
int8x8_t test_vbsl_s8(uint8x8_t a, int8x8_t b, int8x8_t c) {
return vbsl_s8(a, b, c);
}
-// CHECK-LABEL: @test_vbsl_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %c to <8 x i8>
-// CHECK: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP3]]
+// CHECK-LABEL: define <4 x i16> @test_vbsl_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8>
+// CHECK-NEXT: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP3]]
+//
int16x4_t test_vbsl_s16(uint16x4_t a, int16x4_t b, int16x4_t c) {
return vbsl_s16(a, b, c);
}
-// CHECK-LABEL: @test_vbsl_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %c to <8 x i8>
-// CHECK: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP3]]
+// CHECK-LABEL: define <2 x i32> @test_vbsl_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8>
+// CHECK-NEXT: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP3]]
+//
int32x2_t test_vbsl_s32(uint32x2_t a, int32x2_t b, int32x2_t c) {
return vbsl_s32(a, b, c);
}
-// CHECK-LABEL: @test_vbsl_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <1 x i64> %c to <8 x i8>
-// CHECK: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP3]]
+// CHECK-LABEL: define <1 x i64> @test_vbsl_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]], <1 x i64> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i64> [[C]] to <8 x i8>
+// CHECK-NEXT: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[TMP3]]
+//
int64x1_t test_vbsl_s64(uint64x1_t a, int64x1_t b, int64x1_t c) {
return vbsl_s64(a, b, c);
}
-// CHECK-LABEL: @test_vbsl_u8(
-// CHECK: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c)
-// CHECK: ret <8 x i8> [[VBSL_V_I]]
+// CHECK-LABEL: define <8 x i8> @test_vbsl_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[A]], <8 x i8> [[B]], <8 x i8> [[C]])
+// CHECK-NEXT: ret <8 x i8> [[VBSL_V_I]]
+//
uint8x8_t test_vbsl_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) {
return vbsl_u8(a, b, c);
}
-// CHECK-LABEL: @test_vbsl_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %c to <8 x i8>
-// CHECK: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP3]]
+// CHECK-LABEL: define <4 x i16> @test_vbsl_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8>
+// CHECK-NEXT: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP3]]
+//
uint16x4_t test_vbsl_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) {
return vbsl_u16(a, b, c);
}
-// CHECK-LABEL: @test_vbsl_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %c to <8 x i8>
-// CHECK: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP3]]
+// CHECK-LABEL: define <2 x i32> @test_vbsl_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8>
+// CHECK-NEXT: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP3]]
+//
uint32x2_t test_vbsl_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) {
return vbsl_u32(a, b, c);
}
-// CHECK-LABEL: @test_vbsl_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <1 x i64> %c to <8 x i8>
-// CHECK: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP3]]
+// CHECK-LABEL: define <1 x i64> @test_vbsl_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]], <1 x i64> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i64> [[C]] to <8 x i8>
+// CHECK-NEXT: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[TMP3]]
+//
uint64x1_t test_vbsl_u64(uint64x1_t a, uint64x1_t b, uint64x1_t c) {
return vbsl_u64(a, b, c);
}
-// CHECK-LABEL: @test_vbsl_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %c to <8 x i8>
-// CHECK: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <2 x float>
-// CHECK: ret <2 x float> [[TMP3]]
+// CHECK-LABEL: define <2 x float> @test_vbsl_f32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]], <2 x float> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[C]] to <8 x i8>
+// CHECK-NEXT: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[TMP3]]
+//
float32x2_t test_vbsl_f32(uint32x2_t a, float32x2_t b, float32x2_t c) {
return vbsl_f32(a, b, c);
}
-// CHECK-LABEL: @test_vbsl_p8(
-// CHECK: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c)
-// CHECK: ret <8 x i8> [[VBSL_V_I]]
+// CHECK-LABEL: define <8 x i8> @test_vbsl_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[A]], <8 x i8> [[B]], <8 x i8> [[C]])
+// CHECK-NEXT: ret <8 x i8> [[VBSL_V_I]]
+//
poly8x8_t test_vbsl_p8(uint8x8_t a, poly8x8_t b, poly8x8_t c) {
return vbsl_p8(a, b, c);
}
-// CHECK-LABEL: @test_vbsl_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %c to <8 x i8>
-// CHECK: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP3]]
+// CHECK-LABEL: define <4 x i16> @test_vbsl_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8>
+// CHECK-NEXT: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP3]]
+//
poly16x4_t test_vbsl_p16(uint16x4_t a, poly16x4_t b, poly16x4_t c) {
return vbsl_p16(a, b, c);
}
-// CHECK-LABEL: @test_vbslq_s8(
-// CHECK: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c)
-// CHECK: ret <16 x i8> [[VBSLQ_V_I]]
+// CHECK-LABEL: define <16 x i8> @test_vbslq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]], <16 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]])
+// CHECK-NEXT: ret <16 x i8> [[VBSLQ_V_I]]
+//
int8x16_t test_vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c) {
return vbslq_s8(a, b, c);
}
-// CHECK-LABEL: @test_vbslq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %c to <16 x i8>
-// CHECK: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP3]]
+// CHECK-LABEL: define <8 x i16> @test_vbslq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <8 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[C]] to <16 x i8>
+// CHECK-NEXT: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP3]]
+//
int16x8_t test_vbslq_s16(uint16x8_t a, int16x8_t b, int16x8_t c) {
return vbslq_s16(a, b, c);
}
-// CHECK-LABEL: @test_vbslq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %c to <16 x i8>
-// CHECK: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP3]]
+// CHECK-LABEL: define <4 x i32> @test_vbslq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[C]] to <16 x i8>
+// CHECK-NEXT: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP3]]
+//
int32x4_t test_vbslq_s32(uint32x4_t a, int32x4_t b, int32x4_t c) {
return vbslq_s32(a, b, c);
}
-// CHECK-LABEL: @test_vbslq_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i64> %c to <16 x i8>
-// CHECK: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP3]]
+// CHECK-LABEL: define <2 x i64> @test_vbslq_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]], <2 x i64> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[C]] to <16 x i8>
+// CHECK-NEXT: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP3]]
+//
int64x2_t test_vbslq_s64(uint64x2_t a, int64x2_t b, int64x2_t c) {
return vbslq_s64(a, b, c);
}
-// CHECK-LABEL: @test_vbslq_u8(
-// CHECK: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c)
-// CHECK: ret <16 x i8> [[VBSLQ_V_I]]
+// CHECK-LABEL: define <16 x i8> @test_vbslq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]], <16 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]])
+// CHECK-NEXT: ret <16 x i8> [[VBSLQ_V_I]]
+//
uint8x16_t test_vbslq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) {
return vbslq_u8(a, b, c);
}
-// CHECK-LABEL: @test_vbslq_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %c to <16 x i8>
-// CHECK: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP3]]
+// CHECK-LABEL: define <8 x i16> @test_vbslq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <8 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[C]] to <16 x i8>
+// CHECK-NEXT: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP3]]
+//
uint16x8_t test_vbslq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) {
return vbslq_u16(a, b, c);
}
-// CHECK-LABEL: @test_vbslq_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %c to <16 x i8>
-// CHECK: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP3]]
+// CHECK-LABEL: define <4 x i32> @test_vbslq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[C]] to <16 x i8>
+// CHECK-NEXT: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP3]]
+//
uint32x4_t test_vbslq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) {
return vbslq_u32(a, b, c);
}
-// CHECK-LABEL: @test_vbslq_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i64> %c to <16 x i8>
-// CHECK: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP3]]
+// CHECK-LABEL: define <2 x i64> @test_vbslq_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]], <2 x i64> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[C]] to <16 x i8>
+// CHECK-NEXT: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP3]]
+//
uint64x2_t test_vbslq_u64(uint64x2_t a, uint64x2_t b, uint64x2_t c) {
return vbslq_u64(a, b, c);
}
-// CHECK-LABEL: @test_vbslq_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %c to <16 x i8>
-// CHECK: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <4 x float>
-// CHECK: ret <4 x float> [[TMP3]]
+// CHECK-LABEL: define <4 x float> @test_vbslq_f32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]], <4 x float> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[C]] to <16 x i8>
+// CHECK-NEXT: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[TMP3]]
+//
float32x4_t test_vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c) {
return vbslq_f32(a, b, c);
}
-// CHECK-LABEL: @test_vbslq_p8(
-// CHECK: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c)
-// CHECK: ret <16 x i8> [[VBSLQ_V_I]]
+// CHECK-LABEL: define <16 x i8> @test_vbslq_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]], <16 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]])
+// CHECK-NEXT: ret <16 x i8> [[VBSLQ_V_I]]
+//
poly8x16_t test_vbslq_p8(uint8x16_t a, poly8x16_t b, poly8x16_t c) {
return vbslq_p8(a, b, c);
}
-// CHECK-LABEL: @test_vbslq_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %c to <16 x i8>
-// CHECK: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP3]]
+// CHECK-LABEL: define <8 x i16> @test_vbslq_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <8 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[C]] to <16 x i8>
+// CHECK-NEXT: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP3]]
+//
poly16x8_t test_vbslq_p16(uint16x8_t a, poly16x8_t b, poly16x8_t c) {
return vbslq_p16(a, b, c);
}
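// Reviewer note: vbsl is a bitwise (not lane-wise) select: each result bit
// comes from b where the corresponding mask bit is 1 and from c where it is
// 0, i.e. (m & b) | (~m & c); the d- and q-register variants above differ
// only in the llvm.arm.neon.vbsl vector width. Equivalent sketch; the helper
// name is invented:
#include <arm_neon.h>
uint8x8_t bsl_sketch(uint8x8_t m, uint8x8_t b, uint8x8_t c) {
  return vorr_u8(vand_u8(m, b), vbic_u8(c, m));  // == vbsl_u8(m, b, c)
}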
-// CHECK-LABEL: @test_vcage_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
-// CHECK: [[VCAGE_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vacge.v2i32.v2f32(<2 x float> %a, <2 x float> %b)
-// CHECK: ret <2 x i32> [[VCAGE_V2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vcage_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VCAGE_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vacge.v2i32.v2f32(<2 x float> [[A]], <2 x float> [[B]])
+// CHECK-NEXT: ret <2 x i32> [[VCAGE_V2_I]]
+//
uint32x2_t test_vcage_f32(float32x2_t a, float32x2_t b) {
return vcage_f32(a, b);
}
-// CHECK-LABEL: @test_vcageq_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
-// CHECK: [[VCAGEQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vacge.v4i32.v4f32(<4 x float> %a, <4 x float> %b)
-// CHECK: ret <4 x i32> [[VCAGEQ_V2_I]]
+// CHECK-LABEL: define <4 x i32> @test_vcageq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VCAGEQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vacge.v4i32.v4f32(<4 x float> [[A]], <4 x float> [[B]])
+// CHECK-NEXT: ret <4 x i32> [[VCAGEQ_V2_I]]
+//
uint32x4_t test_vcageq_f32(float32x4_t a, float32x4_t b) {
return vcageq_f32(a, b);
}
-// CHECK-LABEL: @test_vcagt_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
-// CHECK: [[VCAGT_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vacgt.v2i32.v2f32(<2 x float> %a, <2 x float> %b)
-// CHECK: ret <2 x i32> [[VCAGT_V2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vcagt_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VCAGT_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vacgt.v2i32.v2f32(<2 x float> [[A]], <2 x float> [[B]])
+// CHECK-NEXT: ret <2 x i32> [[VCAGT_V2_I]]
+//
uint32x2_t test_vcagt_f32(float32x2_t a, float32x2_t b) {
return vcagt_f32(a, b);
}
-// CHECK-LABEL: @test_vcagtq_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
-// CHECK: [[VCAGTQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vacgt.v4i32.v4f32(<4 x float> %a, <4 x float> %b)
-// CHECK: ret <4 x i32> [[VCAGTQ_V2_I]]
+// CHECK-LABEL: define <4 x i32> @test_vcagtq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VCAGTQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vacgt.v4i32.v4f32(<4 x float> [[A]], <4 x float> [[B]])
+// CHECK-NEXT: ret <4 x i32> [[VCAGTQ_V2_I]]
+//
uint32x4_t test_vcagtq_f32(float32x4_t a, float32x4_t b) {
return vcagtq_f32(a, b);
}
-// CHECK-LABEL: @test_vcale_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
-// CHECK: [[VCALE_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vacge.v2i32.v2f32(<2 x float> %b, <2 x float> %a)
-// CHECK: ret <2 x i32> [[VCALE_V2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vcale_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VCALE_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vacge.v2i32.v2f32(<2 x float> [[B]], <2 x float> [[A]])
+// CHECK-NEXT: ret <2 x i32> [[VCALE_V2_I]]
+//
uint32x2_t test_vcale_f32(float32x2_t a, float32x2_t b) {
return vcale_f32(a, b);
}
-// CHECK-LABEL: @test_vcaleq_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
-// CHECK: [[VCALEQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vacge.v4i32.v4f32(<4 x float> %b, <4 x float> %a)
-// CHECK: ret <4 x i32> [[VCALEQ_V2_I]]
+// CHECK-LABEL: define <4 x i32> @test_vcaleq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VCALEQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vacge.v4i32.v4f32(<4 x float> [[B]], <4 x float> [[A]])
+// CHECK-NEXT: ret <4 x i32> [[VCALEQ_V2_I]]
+//
uint32x4_t test_vcaleq_f32(float32x4_t a, float32x4_t b) {
return vcaleq_f32(a, b);
}
-// CHECK-LABEL: @test_vcalt_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
-// CHECK: [[VCALT_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vacgt.v2i32.v2f32(<2 x float> %b, <2 x float> %a)
-// CHECK: ret <2 x i32> [[VCALT_V2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vcalt_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VCALT_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vacgt.v2i32.v2f32(<2 x float> [[B]], <2 x float> [[A]])
+// CHECK-NEXT: ret <2 x i32> [[VCALT_V2_I]]
+//
uint32x2_t test_vcalt_f32(float32x2_t a, float32x2_t b) {
return vcalt_f32(a, b);
}
-// CHECK-LABEL: @test_vcaltq_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
-// CHECK: [[VCALTQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vacgt.v4i32.v4f32(<4 x float> %b, <4 x float> %a)
-// CHECK: ret <4 x i32> [[VCALTQ_V2_I]]
+// CHECK-LABEL: define <4 x i32> @test_vcaltq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VCALTQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vacgt.v4i32.v4f32(<4 x float> [[B]], <4 x float> [[A]])
+// CHECK-NEXT: ret <4 x i32> [[VCALTQ_V2_I]]
+//
uint32x4_t test_vcaltq_f32(float32x4_t a, float32x4_t b) {
return vcaltq_f32(a, b);
}
-// CHECK-LABEL: @test_vceq_s8(
-// CHECK: [[CMP_I:%.*]] = icmp eq <8 x i8> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
-// CHECK: ret <8 x i8> [[SEXT_I]]
+// CHECK-LABEL: define <8 x i8> @test_vceq_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <8 x i8> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[SEXT_I]]
+//
uint8x8_t test_vceq_s8(int8x8_t a, int8x8_t b) {
return vceq_s8(a, b);
}
-// CHECK-LABEL: @test_vceq_s16(
-// CHECK: [[CMP_I:%.*]] = icmp eq <4 x i16> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[SEXT_I]]
+// CHECK-LABEL: define <4 x i16> @test_vceq_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <4 x i16> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[SEXT_I]]
+//
uint16x4_t test_vceq_s16(int16x4_t a, int16x4_t b) {
return vceq_s16(a, b);
}
-// CHECK-LABEL: @test_vceq_s32(
-// CHECK: [[CMP_I:%.*]] = icmp eq <2 x i32> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[SEXT_I]]
+// CHECK-LABEL: define <2 x i32> @test_vceq_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <2 x i32> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[SEXT_I]]
+//
uint32x2_t test_vceq_s32(int32x2_t a, int32x2_t b) {
return vceq_s32(a, b);
}
-// CHECK-LABEL: @test_vceq_f32(
-// CHECK: [[CMP_I:%.*]] = fcmp oeq <2 x float> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[SEXT_I]]
+// CHECK-LABEL: define <2 x i32> @test_vceq_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = fcmp oeq <2 x float> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[SEXT_I]]
+//
uint32x2_t test_vceq_f32(float32x2_t a, float32x2_t b) {
return vceq_f32(a, b);
}
-// CHECK-LABEL: @test_vceq_u8(
-// CHECK: [[CMP_I:%.*]] = icmp eq <8 x i8> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
-// CHECK: ret <8 x i8> [[SEXT_I]]
+// CHECK-LABEL: define <8 x i8> @test_vceq_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <8 x i8> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[SEXT_I]]
+//
uint8x8_t test_vceq_u8(uint8x8_t a, uint8x8_t b) {
return vceq_u8(a, b);
}
-// CHECK-LABEL: @test_vceq_u16(
-// CHECK: [[CMP_I:%.*]] = icmp eq <4 x i16> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[SEXT_I]]
+// CHECK-LABEL: define <4 x i16> @test_vceq_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <4 x i16> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[SEXT_I]]
+//
uint16x4_t test_vceq_u16(uint16x4_t a, uint16x4_t b) {
return vceq_u16(a, b);
}
-// CHECK-LABEL: @test_vceq_u32(
-// CHECK: [[CMP_I:%.*]] = icmp eq <2 x i32> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[SEXT_I]]
+// CHECK-LABEL: define <2 x i32> @test_vceq_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <2 x i32> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[SEXT_I]]
+//
uint32x2_t test_vceq_u32(uint32x2_t a, uint32x2_t b) {
return vceq_u32(a, b);
}
-// CHECK-LABEL: @test_vceq_p8(
-// CHECK: [[CMP_I:%.*]] = icmp eq <8 x i8> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
-// CHECK: ret <8 x i8> [[SEXT_I]]
+// CHECK-LABEL: define <8 x i8> @test_vceq_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <8 x i8> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[SEXT_I]]
+//
uint8x8_t test_vceq_p8(poly8x8_t a, poly8x8_t b) {
return vceq_p8(a, b);
}
-// CHECK-LABEL: @test_vceqq_s8(
-// CHECK: [[CMP_I:%.*]] = icmp eq <16 x i8> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
-// CHECK: ret <16 x i8> [[SEXT_I]]
+// CHECK-LABEL: define <16 x i8> @test_vceqq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <16 x i8> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[SEXT_I]]
+//
uint8x16_t test_vceqq_s8(int8x16_t a, int8x16_t b) {
return vceqq_s8(a, b);
}
-// CHECK-LABEL: @test_vceqq_s16(
-// CHECK: [[CMP_I:%.*]] = icmp eq <8 x i16> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[SEXT_I]]
+// CHECK-LABEL: define <8 x i16> @test_vceqq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <8 x i16> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[SEXT_I]]
+//
uint16x8_t test_vceqq_s16(int16x8_t a, int16x8_t b) {
return vceqq_s16(a, b);
}
-// CHECK-LABEL: @test_vceqq_s32(
-// CHECK: [[CMP_I:%.*]] = icmp eq <4 x i32> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[SEXT_I]]
+// CHECK-LABEL: define <4 x i32> @test_vceqq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <4 x i32> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SEXT_I]]
+//
uint32x4_t test_vceqq_s32(int32x4_t a, int32x4_t b) {
return vceqq_s32(a, b);
}
-// CHECK-LABEL: @test_vceqq_f32(
-// CHECK: [[CMP_I:%.*]] = fcmp oeq <4 x float> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[SEXT_I]]
+// CHECK-LABEL: define <4 x i32> @test_vceqq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = fcmp oeq <4 x float> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SEXT_I]]
+//
uint32x4_t test_vceqq_f32(float32x4_t a, float32x4_t b) {
return vceqq_f32(a, b);
}
-// CHECK-LABEL: @test_vceqq_u8(
-// CHECK: [[CMP_I:%.*]] = icmp eq <16 x i8> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
-// CHECK: ret <16 x i8> [[SEXT_I]]
+// CHECK-LABEL: define <16 x i8> @test_vceqq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <16 x i8> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[SEXT_I]]
+//
uint8x16_t test_vceqq_u8(uint8x16_t a, uint8x16_t b) {
return vceqq_u8(a, b);
}
-// CHECK-LABEL: @test_vceqq_u16(
-// CHECK: [[CMP_I:%.*]] = icmp eq <8 x i16> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[SEXT_I]]
+// CHECK-LABEL: define <8 x i16> @test_vceqq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <8 x i16> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[SEXT_I]]
+//
uint16x8_t test_vceqq_u16(uint16x8_t a, uint16x8_t b) {
return vceqq_u16(a, b);
}
-// CHECK-LABEL: @test_vceqq_u32(
-// CHECK: [[CMP_I:%.*]] = icmp eq <4 x i32> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[SEXT_I]]
+// CHECK-LABEL: define <4 x i32> @test_vceqq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <4 x i32> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SEXT_I]]
+//
uint32x4_t test_vceqq_u32(uint32x4_t a, uint32x4_t b) {
return vceqq_u32(a, b);
}
-// CHECK-LABEL: @test_vceqq_p8(
-// CHECK: [[CMP_I:%.*]] = icmp eq <16 x i8> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
-// CHECK: ret <16 x i8> [[SEXT_I]]
+// CHECK-LABEL: define <16 x i8> @test_vceqq_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <16 x i8> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[SEXT_I]]
+//
uint8x16_t test_vceqq_p8(poly8x16_t a, poly8x16_t b) {
return vceqq_p8(a, b);
}
-// CHECK-LABEL: @test_vcge_s8(
-// CHECK: [[CMP_I:%.*]] = icmp sge <8 x i8> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
-// CHECK: ret <8 x i8> [[SEXT_I]]
+// CHECK-LABEL: define <8 x i8> @test_vcge_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp sge <8 x i8> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[SEXT_I]]
+//
uint8x8_t test_vcge_s8(int8x8_t a, int8x8_t b) {
return vcge_s8(a, b);
}
-// CHECK-LABEL: @test_vcge_s16(
-// CHECK: [[CMP_I:%.*]] = icmp sge <4 x i16> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[SEXT_I]]
+// CHECK-LABEL: define <4 x i16> @test_vcge_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp sge <4 x i16> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[SEXT_I]]
+//
uint16x4_t test_vcge_s16(int16x4_t a, int16x4_t b) {
return vcge_s16(a, b);
}
-// CHECK-LABEL: @test_vcge_s32(
-// CHECK: [[CMP_I:%.*]] = icmp sge <2 x i32> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[SEXT_I]]
+// CHECK-LABEL: define <2 x i32> @test_vcge_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp sge <2 x i32> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[SEXT_I]]
+//
uint32x2_t test_vcge_s32(int32x2_t a, int32x2_t b) {
return vcge_s32(a, b);
}
-// CHECK-LABEL: @test_vcge_f32(
-// CHECK: [[CMP_I:%.*]] = fcmp oge <2 x float> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[SEXT_I]]
+// CHECK-LABEL: define <2 x i32> @test_vcge_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = fcmp oge <2 x float> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[SEXT_I]]
+//
uint32x2_t test_vcge_f32(float32x2_t a, float32x2_t b) {
return vcge_f32(a, b);
}
-// CHECK-LABEL: @test_vcge_u8(
-// CHECK: [[CMP_I:%.*]] = icmp uge <8 x i8> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
-// CHECK: ret <8 x i8> [[SEXT_I]]
+// CHECK-LABEL: define <8 x i8> @test_vcge_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp uge <8 x i8> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[SEXT_I]]
+//
uint8x8_t test_vcge_u8(uint8x8_t a, uint8x8_t b) {
return vcge_u8(a, b);
}
-// CHECK-LABEL: @test_vcge_u16(
-// CHECK: [[CMP_I:%.*]] = icmp uge <4 x i16> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[SEXT_I]]
+// CHECK-LABEL: define <4 x i16> @test_vcge_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp uge <4 x i16> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[SEXT_I]]
+//
uint16x4_t test_vcge_u16(uint16x4_t a, uint16x4_t b) {
return vcge_u16(a, b);
}
-// CHECK-LABEL: @test_vcge_u32(
-// CHECK: [[CMP_I:%.*]] = icmp uge <2 x i32> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[SEXT_I]]
+// CHECK-LABEL: define <2 x i32> @test_vcge_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp uge <2 x i32> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[SEXT_I]]
+//
uint32x2_t test_vcge_u32(uint32x2_t a, uint32x2_t b) {
return vcge_u32(a, b);
}
-// CHECK-LABEL: @test_vcgeq_s8(
-// CHECK: [[CMP_I:%.*]] = icmp sge <16 x i8> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
-// CHECK: ret <16 x i8> [[SEXT_I]]
+// CHECK-LABEL: define <16 x i8> @test_vcgeq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp sge <16 x i8> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[SEXT_I]]
+//
uint8x16_t test_vcgeq_s8(int8x16_t a, int8x16_t b) {
return vcgeq_s8(a, b);
}
-// CHECK-LABEL: @test_vcgeq_s16(
-// CHECK: [[CMP_I:%.*]] = icmp sge <8 x i16> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[SEXT_I]]
+// CHECK-LABEL: define <8 x i16> @test_vcgeq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp sge <8 x i16> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[SEXT_I]]
+//
uint16x8_t test_vcgeq_s16(int16x8_t a, int16x8_t b) {
return vcgeq_s16(a, b);
}
-// CHECK-LABEL: @test_vcgeq_s32(
-// CHECK: [[CMP_I:%.*]] = icmp sge <4 x i32> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[SEXT_I]]
+// CHECK-LABEL: define <4 x i32> @test_vcgeq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp sge <4 x i32> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SEXT_I]]
+//
uint32x4_t test_vcgeq_s32(int32x4_t a, int32x4_t b) {
return vcgeq_s32(a, b);
}
-// CHECK-LABEL: @test_vcgeq_f32(
-// CHECK: [[CMP_I:%.*]] = fcmp oge <4 x float> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[SEXT_I]]
+// CHECK-LABEL: define <4 x i32> @test_vcgeq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = fcmp oge <4 x float> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SEXT_I]]
+//
uint32x4_t test_vcgeq_f32(float32x4_t a, float32x4_t b) {
return vcgeq_f32(a, b);
}
-// CHECK-LABEL: @test_vcgeq_u8(
-// CHECK: [[CMP_I:%.*]] = icmp uge <16 x i8> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
-// CHECK: ret <16 x i8> [[SEXT_I]]
+// CHECK-LABEL: define <16 x i8> @test_vcgeq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp uge <16 x i8> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[SEXT_I]]
+//
uint8x16_t test_vcgeq_u8(uint8x16_t a, uint8x16_t b) {
return vcgeq_u8(a, b);
}
-// CHECK-LABEL: @test_vcgeq_u16(
-// CHECK: [[CMP_I:%.*]] = icmp uge <8 x i16> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[SEXT_I]]
+// CHECK-LABEL: define <8 x i16> @test_vcgeq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp uge <8 x i16> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[SEXT_I]]
+//
uint16x8_t test_vcgeq_u16(uint16x8_t a, uint16x8_t b) {
return vcgeq_u16(a, b);
}
-// CHECK-LABEL: @test_vcgeq_u32(
-// CHECK: [[CMP_I:%.*]] = icmp uge <4 x i32> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[SEXT_I]]
+// CHECK-LABEL: define <4 x i32> @test_vcgeq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp uge <4 x i32> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SEXT_I]]
+//
uint32x4_t test_vcgeq_u32(uint32x4_t a, uint32x4_t b) {
return vcgeq_u32(a, b);
}
-// CHECK-LABEL: @test_vcgt_s8(
-// CHECK: [[CMP_I:%.*]] = icmp sgt <8 x i8> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
-// CHECK: ret <8 x i8> [[SEXT_I]]
+// CHECK-LABEL: define <8 x i8> @test_vcgt_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp sgt <8 x i8> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[SEXT_I]]
+//
uint8x8_t test_vcgt_s8(int8x8_t a, int8x8_t b) {
return vcgt_s8(a, b);
}
-// CHECK-LABEL: @test_vcgt_s16(
-// CHECK: [[CMP_I:%.*]] = icmp sgt <4 x i16> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[SEXT_I]]
+// CHECK-LABEL: define <4 x i16> @test_vcgt_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp sgt <4 x i16> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[SEXT_I]]
+//
uint16x4_t test_vcgt_s16(int16x4_t a, int16x4_t b) {
return vcgt_s16(a, b);
}
-// CHECK-LABEL: @test_vcgt_s32(
-// CHECK: [[CMP_I:%.*]] = icmp sgt <2 x i32> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[SEXT_I]]
+// CHECK-LABEL: define <2 x i32> @test_vcgt_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp sgt <2 x i32> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[SEXT_I]]
+//
uint32x2_t test_vcgt_s32(int32x2_t a, int32x2_t b) {
return vcgt_s32(a, b);
}
-// CHECK-LABEL: @test_vcgt_f32(
-// CHECK: [[CMP_I:%.*]] = fcmp ogt <2 x float> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[SEXT_I]]
+// CHECK-LABEL: define <2 x i32> @test_vcgt_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = fcmp ogt <2 x float> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[SEXT_I]]
+//
uint32x2_t test_vcgt_f32(float32x2_t a, float32x2_t b) {
return vcgt_f32(a, b);
}
-// CHECK-LABEL: @test_vcgt_u8(
-// CHECK: [[CMP_I:%.*]] = icmp ugt <8 x i8> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
-// CHECK: ret <8 x i8> [[SEXT_I]]
+// CHECK-LABEL: define <8 x i8> @test_vcgt_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp ugt <8 x i8> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[SEXT_I]]
+//
uint8x8_t test_vcgt_u8(uint8x8_t a, uint8x8_t b) {
return vcgt_u8(a, b);
}
-// CHECK-LABEL: @test_vcgt_u16(
-// CHECK: [[CMP_I:%.*]] = icmp ugt <4 x i16> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[SEXT_I]]
+// CHECK-LABEL: define <4 x i16> @test_vcgt_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp ugt <4 x i16> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[SEXT_I]]
+//
uint16x4_t test_vcgt_u16(uint16x4_t a, uint16x4_t b) {
return vcgt_u16(a, b);
}
-// CHECK-LABEL: @test_vcgt_u32(
-// CHECK: [[CMP_I:%.*]] = icmp ugt <2 x i32> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[SEXT_I]]
+// CHECK-LABEL: define <2 x i32> @test_vcgt_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp ugt <2 x i32> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[SEXT_I]]
+//
uint32x2_t test_vcgt_u32(uint32x2_t a, uint32x2_t b) {
return vcgt_u32(a, b);
}
-// CHECK-LABEL: @test_vcgtq_s8(
-// CHECK: [[CMP_I:%.*]] = icmp sgt <16 x i8> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
-// CHECK: ret <16 x i8> [[SEXT_I]]
+// CHECK-LABEL: define <16 x i8> @test_vcgtq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp sgt <16 x i8> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[SEXT_I]]
+//
uint8x16_t test_vcgtq_s8(int8x16_t a, int8x16_t b) {
return vcgtq_s8(a, b);
}
-// CHECK-LABEL: @test_vcgtq_s16(
-// CHECK: [[CMP_I:%.*]] = icmp sgt <8 x i16> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[SEXT_I]]
+// CHECK-LABEL: define <8 x i16> @test_vcgtq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp sgt <8 x i16> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[SEXT_I]]
+//
uint16x8_t test_vcgtq_s16(int16x8_t a, int16x8_t b) {
return vcgtq_s16(a, b);
}
-// CHECK-LABEL: @test_vcgtq_s32(
-// CHECK: [[CMP_I:%.*]] = icmp sgt <4 x i32> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[SEXT_I]]
+// CHECK-LABEL: define <4 x i32> @test_vcgtq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp sgt <4 x i32> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SEXT_I]]
+//
uint32x4_t test_vcgtq_s32(int32x4_t a, int32x4_t b) {
return vcgtq_s32(a, b);
}
-// CHECK-LABEL: @test_vcgtq_f32(
-// CHECK: [[CMP_I:%.*]] = fcmp ogt <4 x float> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[SEXT_I]]
+// CHECK-LABEL: define <4 x i32> @test_vcgtq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = fcmp ogt <4 x float> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SEXT_I]]
+//
uint32x4_t test_vcgtq_f32(float32x4_t a, float32x4_t b) {
return vcgtq_f32(a, b);
}
-// CHECK-LABEL: @test_vcgtq_u8(
-// CHECK: [[CMP_I:%.*]] = icmp ugt <16 x i8> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
-// CHECK: ret <16 x i8> [[SEXT_I]]
+// CHECK-LABEL: define <16 x i8> @test_vcgtq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp ugt <16 x i8> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[SEXT_I]]
+//
uint8x16_t test_vcgtq_u8(uint8x16_t a, uint8x16_t b) {
return vcgtq_u8(a, b);
}
-// CHECK-LABEL: @test_vcgtq_u16(
-// CHECK: [[CMP_I:%.*]] = icmp ugt <8 x i16> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[SEXT_I]]
+// CHECK-LABEL: define <8 x i16> @test_vcgtq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp ugt <8 x i16> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[SEXT_I]]
+//
uint16x8_t test_vcgtq_u16(uint16x8_t a, uint16x8_t b) {
return vcgtq_u16(a, b);
}
-// CHECK-LABEL: @test_vcgtq_u32(
-// CHECK: [[CMP_I:%.*]] = icmp ugt <4 x i32> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[SEXT_I]]
+// CHECK-LABEL: define <4 x i32> @test_vcgtq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp ugt <4 x i32> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SEXT_I]]
+//
uint32x4_t test_vcgtq_u32(uint32x4_t a, uint32x4_t b) {
return vcgtq_u32(a, b);
}
-// CHECK-LABEL: @test_vcle_s8(
-// CHECK: [[CMP_I:%.*]] = icmp sle <8 x i8> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
-// CHECK: ret <8 x i8> [[SEXT_I]]
+// CHECK-LABEL: define <8 x i8> @test_vcle_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp sle <8 x i8> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[SEXT_I]]
+//
uint8x8_t test_vcle_s8(int8x8_t a, int8x8_t b) {
return vcle_s8(a, b);
}
-// CHECK-LABEL: @test_vcle_s16(
-// CHECK: [[CMP_I:%.*]] = icmp sle <4 x i16> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[SEXT_I]]
+// CHECK-LABEL: define <4 x i16> @test_vcle_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp sle <4 x i16> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[SEXT_I]]
+//
uint16x4_t test_vcle_s16(int16x4_t a, int16x4_t b) {
return vcle_s16(a, b);
}
-// CHECK-LABEL: @test_vcle_s32(
-// CHECK: [[CMP_I:%.*]] = icmp sle <2 x i32> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[SEXT_I]]
+// CHECK-LABEL: define <2 x i32> @test_vcle_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp sle <2 x i32> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[SEXT_I]]
+//
uint32x2_t test_vcle_s32(int32x2_t a, int32x2_t b) {
return vcle_s32(a, b);
}
-// CHECK-LABEL: @test_vcle_f32(
-// CHECK: [[CMP_I:%.*]] = fcmp ole <2 x float> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[SEXT_I]]
+// CHECK-LABEL: define <2 x i32> @test_vcle_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = fcmp ole <2 x float> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[SEXT_I]]
+//
uint32x2_t test_vcle_f32(float32x2_t a, float32x2_t b) {
return vcle_f32(a, b);
}
-// CHECK-LABEL: @test_vcle_u8(
-// CHECK: [[CMP_I:%.*]] = icmp ule <8 x i8> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
-// CHECK: ret <8 x i8> [[SEXT_I]]
+// CHECK-LABEL: define <8 x i8> @test_vcle_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp ule <8 x i8> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[SEXT_I]]
+//
uint8x8_t test_vcle_u8(uint8x8_t a, uint8x8_t b) {
return vcle_u8(a, b);
}
-// CHECK-LABEL: @test_vcle_u16(
-// CHECK: [[CMP_I:%.*]] = icmp ule <4 x i16> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[SEXT_I]]
+// CHECK-LABEL: define <4 x i16> @test_vcle_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp ule <4 x i16> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[SEXT_I]]
+//
uint16x4_t test_vcle_u16(uint16x4_t a, uint16x4_t b) {
return vcle_u16(a, b);
}
-// CHECK-LABEL: @test_vcle_u32(
-// CHECK: [[CMP_I:%.*]] = icmp ule <2 x i32> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[SEXT_I]]
+// CHECK-LABEL: define <2 x i32> @test_vcle_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp ule <2 x i32> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[SEXT_I]]
+//
uint32x2_t test_vcle_u32(uint32x2_t a, uint32x2_t b) {
return vcle_u32(a, b);
}
-// CHECK-LABEL: @test_vcleq_s8(
-// CHECK: [[CMP_I:%.*]] = icmp sle <16 x i8> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
-// CHECK: ret <16 x i8> [[SEXT_I]]
+// CHECK-LABEL: define <16 x i8> @test_vcleq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp sle <16 x i8> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[SEXT_I]]
+//
uint8x16_t test_vcleq_s8(int8x16_t a, int8x16_t b) {
return vcleq_s8(a, b);
}
-// CHECK-LABEL: @test_vcleq_s16(
-// CHECK: [[CMP_I:%.*]] = icmp sle <8 x i16> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[SEXT_I]]
+// CHECK-LABEL: define <8 x i16> @test_vcleq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp sle <8 x i16> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[SEXT_I]]
+//
uint16x8_t test_vcleq_s16(int16x8_t a, int16x8_t b) {
return vcleq_s16(a, b);
}
-// CHECK-LABEL: @test_vcleq_s32(
-// CHECK: [[CMP_I:%.*]] = icmp sle <4 x i32> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[SEXT_I]]
+// CHECK-LABEL: define <4 x i32> @test_vcleq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp sle <4 x i32> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SEXT_I]]
+//
uint32x4_t test_vcleq_s32(int32x4_t a, int32x4_t b) {
return vcleq_s32(a, b);
}
-// CHECK-LABEL: @test_vcleq_f32(
-// CHECK: [[CMP_I:%.*]] = fcmp ole <4 x float> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[SEXT_I]]
+// CHECK-LABEL: define <4 x i32> @test_vcleq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = fcmp ole <4 x float> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SEXT_I]]
+//
uint32x4_t test_vcleq_f32(float32x4_t a, float32x4_t b) {
return vcleq_f32(a, b);
}
-// CHECK-LABEL: @test_vcleq_u8(
-// CHECK: [[CMP_I:%.*]] = icmp ule <16 x i8> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
-// CHECK: ret <16 x i8> [[SEXT_I]]
+// CHECK-LABEL: define <16 x i8> @test_vcleq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp ule <16 x i8> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[SEXT_I]]
+//
uint8x16_t test_vcleq_u8(uint8x16_t a, uint8x16_t b) {
return vcleq_u8(a, b);
}
-// CHECK-LABEL: @test_vcleq_u16(
-// CHECK: [[CMP_I:%.*]] = icmp ule <8 x i16> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[SEXT_I]]
+// CHECK-LABEL: define <8 x i16> @test_vcleq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp ule <8 x i16> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[SEXT_I]]
+//
uint16x8_t test_vcleq_u16(uint16x8_t a, uint16x8_t b) {
return vcleq_u16(a, b);
}
-// CHECK-LABEL: @test_vcleq_u32(
-// CHECK: [[CMP_I:%.*]] = icmp ule <4 x i32> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[SEXT_I]]
+// CHECK-LABEL: define <4 x i32> @test_vcleq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp ule <4 x i32> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SEXT_I]]
+//
uint32x4_t test_vcleq_u32(uint32x4_t a, uint32x4_t b) {
return vcleq_u32(a, b);
}
-// CHECK-LABEL: @test_vcls_s8(
-// CHECK: [[VCLS_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vcls.v8i8(<8 x i8> %a)
-// CHECK: ret <8 x i8> [[VCLS_V_I]]
+// CHECK-LABEL: define <8 x i8> @test_vcls_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCLS_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vcls.v8i8(<8 x i8> [[A]])
+// CHECK-NEXT: ret <8 x i8> [[VCLS_V_I]]
+//
int8x8_t test_vcls_s8(int8x8_t a) {
return vcls_s8(a);
}
-// CHECK-LABEL: @test_vcls_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VCLS_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16> %a)
-// CHECK: [[VCLS_V2_I:%.*]] = bitcast <4 x i16> [[VCLS_V1_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VCLS_V1_I]]
+// CHECK-LABEL: define <4 x i16> @test_vcls_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VCLS_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16> [[A]])
+// CHECK-NEXT: [[VCLS_V2_I:%.*]] = bitcast <4 x i16> [[VCLS_V1_I]] to <8 x i8>
+// CHECK-NEXT: ret <4 x i16> [[VCLS_V1_I]]
+//
int16x4_t test_vcls_s16(int16x4_t a) {
return vcls_s16(a);
}
-// CHECK-LABEL: @test_vcls_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VCLS_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32> %a)
-// CHECK: [[VCLS_V2_I:%.*]] = bitcast <2 x i32> [[VCLS_V1_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VCLS_V1_I]]
+// CHECK-LABEL: define <2 x i32> @test_vcls_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VCLS_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32> [[A]])
+// CHECK-NEXT: [[VCLS_V2_I:%.*]] = bitcast <2 x i32> [[VCLS_V1_I]] to <8 x i8>
+// CHECK-NEXT: ret <2 x i32> [[VCLS_V1_I]]
+//
int32x2_t test_vcls_s32(int32x2_t a) {
return vcls_s32(a);
}
-// CHECK-LABEL: @test_vcls_u8(
-// CHECK: [[VCLS_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vcls.v8i8(<8 x i8> %a)
-// CHECK: ret <8 x i8> [[VCLS_V_I]]
+// CHECK-LABEL: define <8 x i8> @test_vcls_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCLS_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vcls.v8i8(<8 x i8> [[A]])
+// CHECK-NEXT: ret <8 x i8> [[VCLS_V_I]]
+//
int8x8_t test_vcls_u8(uint8x8_t a) {
return vcls_u8(a);
}
-// CHECK-LABEL: @test_vcls_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VCLS_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16> %a)
-// CHECK: [[VCLS_V2_I:%.*]] = bitcast <4 x i16> [[VCLS_V1_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VCLS_V1_I]]
+// CHECK-LABEL: define <4 x i16> @test_vcls_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VCLS_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16> [[A]])
+// CHECK-NEXT: [[VCLS_V2_I:%.*]] = bitcast <4 x i16> [[VCLS_V1_I]] to <8 x i8>
+// CHECK-NEXT: ret <4 x i16> [[VCLS_V1_I]]
+//
int16x4_t test_vcls_u16(uint16x4_t a) {
return vcls_u16(a);
}
-// CHECK-LABEL: @test_vcls_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VCLS_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32> %a)
-// CHECK: [[VCLS_V2_I:%.*]] = bitcast <2 x i32> [[VCLS_V1_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VCLS_V1_I]]
+// CHECK-LABEL: define <2 x i32> @test_vcls_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VCLS_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32> [[A]])
+// CHECK-NEXT: [[VCLS_V2_I:%.*]] = bitcast <2 x i32> [[VCLS_V1_I]] to <8 x i8>
+// CHECK-NEXT: ret <2 x i32> [[VCLS_V1_I]]
+//
int32x2_t test_vcls_u32(uint32x2_t a) {
return vcls_u32(a);
}
-// CHECK-LABEL: @test_vclsq_s8(
-// CHECK: [[VCLSQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8> %a)
-// CHECK: ret <16 x i8> [[VCLSQ_V_I]]
+// CHECK-LABEL: define <16 x i8> @test_vclsq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCLSQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8> [[A]])
+// CHECK-NEXT: ret <16 x i8> [[VCLSQ_V_I]]
+//
int8x16_t test_vclsq_s8(int8x16_t a) {
return vclsq_s8(a);
}
-// CHECK-LABEL: @test_vclsq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VCLSQ_V1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16> %a)
-// CHECK: [[VCLSQ_V2_I:%.*]] = bitcast <8 x i16> [[VCLSQ_V1_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VCLSQ_V1_I]]
+// CHECK-LABEL: define <8 x i16> @test_vclsq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VCLSQ_V1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16> [[A]])
+// CHECK-NEXT: [[VCLSQ_V2_I:%.*]] = bitcast <8 x i16> [[VCLSQ_V1_I]] to <16 x i8>
+// CHECK-NEXT: ret <8 x i16> [[VCLSQ_V1_I]]
+//
int16x8_t test_vclsq_s16(int16x8_t a) {
return vclsq_s16(a);
}
-// CHECK-LABEL: @test_vclsq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VCLSQ_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32> %a)
-// CHECK: [[VCLSQ_V2_I:%.*]] = bitcast <4 x i32> [[VCLSQ_V1_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VCLSQ_V1_I]]
+// CHECK-LABEL: define <4 x i32> @test_vclsq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VCLSQ_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32> [[A]])
+// CHECK-NEXT: [[VCLSQ_V2_I:%.*]] = bitcast <4 x i32> [[VCLSQ_V1_I]] to <16 x i8>
+// CHECK-NEXT: ret <4 x i32> [[VCLSQ_V1_I]]
+//
int32x4_t test_vclsq_s32(int32x4_t a) {
return vclsq_s32(a);
}
-// CHECK-LABEL: @test_vclsq_u8(
-// CHECK: [[VCLSQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8> %a)
-// CHECK: ret <16 x i8> [[VCLSQ_V_I]]
+// CHECK-LABEL: define <16 x i8> @test_vclsq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCLSQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8> [[A]])
+// CHECK-NEXT: ret <16 x i8> [[VCLSQ_V_I]]
+//
int8x16_t test_vclsq_u8(uint8x16_t a) {
return vclsq_u8(a);
}
-// CHECK-LABEL: @test_vclsq_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VCLSQ_V1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16> %a)
-// CHECK: [[VCLSQ_V2_I:%.*]] = bitcast <8 x i16> [[VCLSQ_V1_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VCLSQ_V1_I]]
+// CHECK-LABEL: define <8 x i16> @test_vclsq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VCLSQ_V1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16> [[A]])
+// CHECK-NEXT: [[VCLSQ_V2_I:%.*]] = bitcast <8 x i16> [[VCLSQ_V1_I]] to <16 x i8>
+// CHECK-NEXT: ret <8 x i16> [[VCLSQ_V1_I]]
+//
int16x8_t test_vclsq_u16(uint16x8_t a) {
return vclsq_u16(a);
}
-// CHECK-LABEL: @test_vclsq_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VCLSQ_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32> %a)
-// CHECK: [[VCLSQ_V2_I:%.*]] = bitcast <4 x i32> [[VCLSQ_V1_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VCLSQ_V1_I]]
+// CHECK-LABEL: define <4 x i32> @test_vclsq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VCLSQ_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32> [[A]])
+// CHECK-NEXT: [[VCLSQ_V2_I:%.*]] = bitcast <4 x i32> [[VCLSQ_V1_I]] to <16 x i8>
+// CHECK-NEXT: ret <4 x i32> [[VCLSQ_V1_I]]
+//
int32x4_t test_vclsq_u32(uint32x4_t a) {
return vclsq_u32(a);
}
-// CHECK-LABEL: @test_vclt_s8(
-// CHECK: [[CMP_I:%.*]] = icmp slt <8 x i8> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
-// CHECK: ret <8 x i8> [[SEXT_I]]
+// CHECK-LABEL: define <8 x i8> @test_vclt_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp slt <8 x i8> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[SEXT_I]]
+//
uint8x8_t test_vclt_s8(int8x8_t a, int8x8_t b) {
return vclt_s8(a, b);
}
-// CHECK-LABEL: @test_vclt_s16(
-// CHECK: [[CMP_I:%.*]] = icmp slt <4 x i16> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[SEXT_I]]
+// CHECK-LABEL: define <4 x i16> @test_vclt_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp slt <4 x i16> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[SEXT_I]]
+//
uint16x4_t test_vclt_s16(int16x4_t a, int16x4_t b) {
return vclt_s16(a, b);
}
-// CHECK-LABEL: @test_vclt_s32(
-// CHECK: [[CMP_I:%.*]] = icmp slt <2 x i32> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[SEXT_I]]
+// CHECK-LABEL: define <2 x i32> @test_vclt_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp slt <2 x i32> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[SEXT_I]]
+//
uint32x2_t test_vclt_s32(int32x2_t a, int32x2_t b) {
return vclt_s32(a, b);
}
-// CHECK-LABEL: @test_vclt_f32(
-// CHECK: [[CMP_I:%.*]] = fcmp olt <2 x float> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[SEXT_I]]
+// CHECK-LABEL: define <2 x i32> @test_vclt_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = fcmp olt <2 x float> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[SEXT_I]]
+//
uint32x2_t test_vclt_f32(float32x2_t a, float32x2_t b) {
return vclt_f32(a, b);
}
-// CHECK-LABEL: @test_vclt_u8(
-// CHECK: [[CMP_I:%.*]] = icmp ult <8 x i8> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
-// CHECK: ret <8 x i8> [[SEXT_I]]
+// CHECK-LABEL: define <8 x i8> @test_vclt_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp ult <8 x i8> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[SEXT_I]]
+//
uint8x8_t test_vclt_u8(uint8x8_t a, uint8x8_t b) {
return vclt_u8(a, b);
}
-// CHECK-LABEL: @test_vclt_u16(
-// CHECK: [[CMP_I:%.*]] = icmp ult <4 x i16> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[SEXT_I]]
+// CHECK-LABEL: define <4 x i16> @test_vclt_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp ult <4 x i16> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[SEXT_I]]
+//
uint16x4_t test_vclt_u16(uint16x4_t a, uint16x4_t b) {
return vclt_u16(a, b);
}
-// CHECK-LABEL: @test_vclt_u32(
-// CHECK: [[CMP_I:%.*]] = icmp ult <2 x i32> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[SEXT_I]]
+// CHECK-LABEL: define <2 x i32> @test_vclt_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp ult <2 x i32> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[SEXT_I]]
+//
uint32x2_t test_vclt_u32(uint32x2_t a, uint32x2_t b) {
return vclt_u32(a, b);
}
-// CHECK-LABEL: @test_vcltq_s8(
-// CHECK: [[CMP_I:%.*]] = icmp slt <16 x i8> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
-// CHECK: ret <16 x i8> [[SEXT_I]]
+// CHECK-LABEL: define <16 x i8> @test_vcltq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp slt <16 x i8> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[SEXT_I]]
+//
uint8x16_t test_vcltq_s8(int8x16_t a, int8x16_t b) {
return vcltq_s8(a, b);
}
-// CHECK-LABEL: @test_vcltq_s16(
-// CHECK: [[CMP_I:%.*]] = icmp slt <8 x i16> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[SEXT_I]]
+// CHECK-LABEL: define <8 x i16> @test_vcltq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp slt <8 x i16> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[SEXT_I]]
+//
uint16x8_t test_vcltq_s16(int16x8_t a, int16x8_t b) {
return vcltq_s16(a, b);
}
-// CHECK-LABEL: @test_vcltq_s32(
-// CHECK: [[CMP_I:%.*]] = icmp slt <4 x i32> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[SEXT_I]]
+// CHECK-LABEL: define <4 x i32> @test_vcltq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp slt <4 x i32> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SEXT_I]]
+//
uint32x4_t test_vcltq_s32(int32x4_t a, int32x4_t b) {
return vcltq_s32(a, b);
}
-// CHECK-LABEL: @test_vcltq_f32(
-// CHECK: [[CMP_I:%.*]] = fcmp olt <4 x float> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[SEXT_I]]
+// CHECK-LABEL: define <4 x i32> @test_vcltq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = fcmp olt <4 x float> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SEXT_I]]
+//
uint32x4_t test_vcltq_f32(float32x4_t a, float32x4_t b) {
return vcltq_f32(a, b);
}
-// CHECK-LABEL: @test_vcltq_u8(
-// CHECK: [[CMP_I:%.*]] = icmp ult <16 x i8> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
-// CHECK: ret <16 x i8> [[SEXT_I]]
+// CHECK-LABEL: define <16 x i8> @test_vcltq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp ult <16 x i8> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[SEXT_I]]
+//
uint8x16_t test_vcltq_u8(uint8x16_t a, uint8x16_t b) {
return vcltq_u8(a, b);
}
-// CHECK-LABEL: @test_vcltq_u16(
-// CHECK: [[CMP_I:%.*]] = icmp ult <8 x i16> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[SEXT_I]]
+// CHECK-LABEL: define <8 x i16> @test_vcltq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp ult <8 x i16> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[SEXT_I]]
+//
uint16x8_t test_vcltq_u16(uint16x8_t a, uint16x8_t b) {
return vcltq_u16(a, b);
}
-// CHECK-LABEL: @test_vcltq_u32(
-// CHECK: [[CMP_I:%.*]] = icmp ult <4 x i32> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[SEXT_I]]
+// CHECK-LABEL: define <4 x i32> @test_vcltq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp ult <4 x i32> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SEXT_I]]
+//
uint32x4_t test_vcltq_u32(uint32x4_t a, uint32x4_t b) {
return vcltq_u32(a, b);
}
-// CHECK-LABEL: @test_vclz_s8(
-// CHECK: [[VCLZ_V_I:%.*]] = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %a, i1 false)
-// CHECK: ret <8 x i8> [[VCLZ_V_I]]
+// CHECK-LABEL: define <8 x i8> @test_vclz_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCLZ_V_I:%.*]] = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> [[A]], i1 false)
+// CHECK-NEXT: ret <8 x i8> [[VCLZ_V_I]]
+//
int8x8_t test_vclz_s8(int8x8_t a) {
return vclz_s8(a);
}
-// CHECK-LABEL: @test_vclz_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> %a, i1 false)
-// CHECK: [[VCLZ_V2_I:%.*]] = bitcast <4 x i16> [[VCLZ_V1_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VCLZ_V1_I]]
+// CHECK-LABEL: define <4 x i16> @test_vclz_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> [[A]], i1 false)
+// CHECK-NEXT: [[VCLZ_V2_I:%.*]] = bitcast <4 x i16> [[VCLZ_V1_I]] to <8 x i8>
+// CHECK-NEXT: ret <4 x i16> [[VCLZ_V1_I]]
+//
int16x4_t test_vclz_s16(int16x4_t a) {
return vclz_s16(a);
}
-// CHECK-LABEL: @test_vclz_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %a, i1 false)
-// CHECK: [[VCLZ_V2_I:%.*]] = bitcast <2 x i32> [[VCLZ_V1_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VCLZ_V1_I]]
+// CHECK-LABEL: define <2 x i32> @test_vclz_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[A]], i1 false)
+// CHECK-NEXT: [[VCLZ_V2_I:%.*]] = bitcast <2 x i32> [[VCLZ_V1_I]] to <8 x i8>
+// CHECK-NEXT: ret <2 x i32> [[VCLZ_V1_I]]
+//
int32x2_t test_vclz_s32(int32x2_t a) {
return vclz_s32(a);
}
-// CHECK-LABEL: @test_vclz_u8(
-// CHECK: [[VCLZ_V_I:%.*]] = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %a, i1 false)
-// CHECK: ret <8 x i8> [[VCLZ_V_I]]
+// CHECK-LABEL: define <8 x i8> @test_vclz_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCLZ_V_I:%.*]] = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> [[A]], i1 false)
+// CHECK-NEXT: ret <8 x i8> [[VCLZ_V_I]]
+//
uint8x8_t test_vclz_u8(uint8x8_t a) {
return vclz_u8(a);
}
-// CHECK-LABEL: @test_vclz_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> %a, i1 false)
-// CHECK: [[VCLZ_V2_I:%.*]] = bitcast <4 x i16> [[VCLZ_V1_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VCLZ_V1_I]]
+// CHECK-LABEL: define <4 x i16> @test_vclz_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> [[A]], i1 false)
+// CHECK-NEXT: [[VCLZ_V2_I:%.*]] = bitcast <4 x i16> [[VCLZ_V1_I]] to <8 x i8>
+// CHECK-NEXT: ret <4 x i16> [[VCLZ_V1_I]]
+//
uint16x4_t test_vclz_u16(uint16x4_t a) {
return vclz_u16(a);
}
-// CHECK-LABEL: @test_vclz_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %a, i1 false)
-// CHECK: [[VCLZ_V2_I:%.*]] = bitcast <2 x i32> [[VCLZ_V1_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VCLZ_V1_I]]
+// CHECK-LABEL: define <2 x i32> @test_vclz_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[A]], i1 false)
+// CHECK-NEXT: [[VCLZ_V2_I:%.*]] = bitcast <2 x i32> [[VCLZ_V1_I]] to <8 x i8>
+// CHECK-NEXT: ret <2 x i32> [[VCLZ_V1_I]]
+//
uint32x2_t test_vclz_u32(uint32x2_t a) {
return vclz_u32(a);
}
-// CHECK-LABEL: @test_vclzq_s8(
-// CHECK: [[VCLZQ_V_I:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 false)
-// CHECK: ret <16 x i8> [[VCLZQ_V_I]]
+// CHECK-LABEL: define <16 x i8> @test_vclzq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCLZQ_V_I:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> [[A]], i1 false)
+// CHECK-NEXT: ret <16 x i8> [[VCLZQ_V_I]]
+//
int8x16_t test_vclzq_s8(int8x16_t a) {
return vclzq_s8(a);
}
-// CHECK-LABEL: @test_vclzq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VCLZQ_V1_I:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 false)
-// CHECK: [[VCLZQ_V2_I:%.*]] = bitcast <8 x i16> [[VCLZQ_V1_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VCLZQ_V1_I]]
+// CHECK-LABEL: define <8 x i16> @test_vclzq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VCLZQ_V1_I:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[A]], i1 false)
+// CHECK-NEXT: [[VCLZQ_V2_I:%.*]] = bitcast <8 x i16> [[VCLZQ_V1_I]] to <16 x i8>
+// CHECK-NEXT: ret <8 x i16> [[VCLZQ_V1_I]]
+//
int16x8_t test_vclzq_s16(int16x8_t a) {
return vclzq_s16(a);
}
-// CHECK-LABEL: @test_vclzq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VCLZQ_V1_I:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 false)
-// CHECK: [[VCLZQ_V2_I:%.*]] = bitcast <4 x i32> [[VCLZQ_V1_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VCLZQ_V1_I]]
+// CHECK-LABEL: define <4 x i32> @test_vclzq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VCLZQ_V1_I:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[A]], i1 false)
+// CHECK-NEXT: [[VCLZQ_V2_I:%.*]] = bitcast <4 x i32> [[VCLZQ_V1_I]] to <16 x i8>
+// CHECK-NEXT: ret <4 x i32> [[VCLZQ_V1_I]]
+//
int32x4_t test_vclzq_s32(int32x4_t a) {
return vclzq_s32(a);
}
-// CHECK-LABEL: @test_vclzq_u8(
-// CHECK: [[VCLZQ_V_I:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 false)
-// CHECK: ret <16 x i8> [[VCLZQ_V_I]]
+// CHECK-LABEL: define <16 x i8> @test_vclzq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCLZQ_V_I:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> [[A]], i1 false)
+// CHECK-NEXT: ret <16 x i8> [[VCLZQ_V_I]]
+//
uint8x16_t test_vclzq_u8(uint8x16_t a) {
return vclzq_u8(a);
}
-// CHECK-LABEL: @test_vclzq_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VCLZQ_V1_I:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 false)
-// CHECK: [[VCLZQ_V2_I:%.*]] = bitcast <8 x i16> [[VCLZQ_V1_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VCLZQ_V1_I]]
+// CHECK-LABEL: define <8 x i16> @test_vclzq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VCLZQ_V1_I:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[A]], i1 false)
+// CHECK-NEXT: [[VCLZQ_V2_I:%.*]] = bitcast <8 x i16> [[VCLZQ_V1_I]] to <16 x i8>
+// CHECK-NEXT: ret <8 x i16> [[VCLZQ_V1_I]]
+//
uint16x8_t test_vclzq_u16(uint16x8_t a) {
return vclzq_u16(a);
}
-// CHECK-LABEL: @test_vclzq_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VCLZQ_V1_I:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 false)
-// CHECK: [[VCLZQ_V2_I:%.*]] = bitcast <4 x i32> [[VCLZQ_V1_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VCLZQ_V1_I]]
+// CHECK-LABEL: define <4 x i32> @test_vclzq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VCLZQ_V1_I:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[A]], i1 false)
+// CHECK-NEXT: [[VCLZQ_V2_I:%.*]] = bitcast <4 x i32> [[VCLZQ_V1_I]] to <16 x i8>
+// CHECK-NEXT: ret <4 x i32> [[VCLZQ_V1_I]]
+//
uint32x4_t test_vclzq_u32(uint32x4_t a) {
return vclzq_u32(a);
}
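
All of the vclz* checks funnel into @llvm.ctlz with an `i1 false` flag, meaning a zero input is well defined (yielding the lane width) rather than poison. A rough scalar model of one 8-bit lane (`clz8_ref` is a hypothetical helper, not part of the patch):

#include <stdint.h>
// Count leading zeros of one 8-bit lane; clz8_ref(0) == 8 because the IR
// above passes `i1 false` (zero input is defined, not poison).
static inline uint8_t clz8_ref(uint8_t x) {
  uint8_t n = 0;
  while (n < 8 && !(x & (0x80u >> n)))  // scan from the MSB down
    ++n;
  return n;
}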
-// CHECK-LABEL: @test_vcnt_u8(
-// CHECK: [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %a)
-// CHECK: ret <8 x i8> [[VCNT_V_I]]
+// CHECK-LABEL: define <8 x i8> @test_vcnt_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> [[A]])
+// CHECK-NEXT: ret <8 x i8> [[VCNT_V_I]]
+//
uint8x8_t test_vcnt_u8(uint8x8_t a) {
return vcnt_u8(a);
}
-// CHECK-LABEL: @test_vcnt_s8(
-// CHECK: [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %a)
-// CHECK: ret <8 x i8> [[VCNT_V_I]]
+// CHECK-LABEL: define <8 x i8> @test_vcnt_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> [[A]])
+// CHECK-NEXT: ret <8 x i8> [[VCNT_V_I]]
+//
int8x8_t test_vcnt_s8(int8x8_t a) {
return vcnt_s8(a);
}
-// CHECK-LABEL: @test_vcnt_p8(
-// CHECK: [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %a)
-// CHECK: ret <8 x i8> [[VCNT_V_I]]
+// CHECK-LABEL: define <8 x i8> @test_vcnt_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> [[A]])
+// CHECK-NEXT: ret <8 x i8> [[VCNT_V_I]]
+//
poly8x8_t test_vcnt_p8(poly8x8_t a) {
return vcnt_p8(a);
}
-// CHECK-LABEL: @test_vcntq_u8(
-// CHECK: [[VCNTQ_V_I:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a)
-// CHECK: ret <16 x i8> [[VCNTQ_V_I]]
+// CHECK-LABEL: define <16 x i8> @test_vcntq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCNTQ_V_I:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> [[A]])
+// CHECK-NEXT: ret <16 x i8> [[VCNTQ_V_I]]
+//
uint8x16_t test_vcntq_u8(uint8x16_t a) {
return vcntq_u8(a);
}
-// CHECK-LABEL: @test_vcntq_s8(
-// CHECK: [[VCNTQ_V_I:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a)
-// CHECK: ret <16 x i8> [[VCNTQ_V_I]]
+// CHECK-LABEL: define <16 x i8> @test_vcntq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCNTQ_V_I:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> [[A]])
+// CHECK-NEXT: ret <16 x i8> [[VCNTQ_V_I]]
+//
int8x16_t test_vcntq_s8(int8x16_t a) {
return vcntq_s8(a);
}
-// CHECK-LABEL: @test_vcntq_p8(
-// CHECK: [[VCNTQ_V_I:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a)
-// CHECK: ret <16 x i8> [[VCNTQ_V_I]]
+// CHECK-LABEL: define <16 x i8> @test_vcntq_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCNTQ_V_I:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> [[A]])
+// CHECK-NEXT: ret <16 x i8> [[VCNTQ_V_I]]
+//
poly8x16_t test_vcntq_p8(poly8x16_t a) {
return vcntq_p8(a);
}
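
The vcnt* intrinsics lower to @llvm.ctpop per the checks above. A per-lane scalar equivalent, sketched with Kernighan's clear-lowest-set-bit trick (`popcount8_ref` is a hypothetical helper, not part of the patch):

#include <stdint.h>
// Population count of one 8-bit lane, matching @llvm.ctpop.v8i8 above.
static inline uint8_t popcount8_ref(uint8_t x) {
  uint8_t n = 0;
  for (; x; x &= (uint8_t)(x - 1))  // clear the lowest set bit each pass
    ++n;
  return n;
}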
-// CHECK-LABEL: @test_vcombine_s8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define <16 x i8> @test_vcombine_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
int8x16_t test_vcombine_s8(int8x8_t a, int8x8_t b) {
return vcombine_s8(a, b);
}
-// CHECK-LABEL: @test_vcombine_s16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define <8 x i16> @test_vcombine_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
int16x8_t test_vcombine_s16(int16x4_t a, int16x4_t b) {
return vcombine_s16(a, b);
}
-// CHECK-LABEL: @test_vcombine_s32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK: ret <4 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define <4 x i32> @test_vcombine_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]]
+//
int32x4_t test_vcombine_s32(int32x2_t a, int32x2_t b) {
return vcombine_s32(a, b);
}
-// CHECK-LABEL: @test_vcombine_s64(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <1 x i64> %a, <1 x i64> %b, <2 x i32> <i32 0, i32 1>
-// CHECK: ret <2 x i64> [[SHUFFLE_I]]
+// CHECK-LABEL: define <2 x i64> @test_vcombine_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <1 x i64> [[A]], <1 x i64> [[B]], <2 x i32> <i32 0, i32 1>
+// CHECK-NEXT: ret <2 x i64> [[SHUFFLE_I]]
+//
int64x2_t test_vcombine_s64(int64x1_t a, int64x1_t b) {
return vcombine_s64(a, b);
}
-// CHECK-LABEL: @test_vcombine_f16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x half> %a, <4 x half> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-// CHECK: ret <8 x half> [[SHUFFLE_I]]
+// CHECK-LABEL: define <8 x half> @test_vcombine_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[B]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: ret <8 x half> [[SHUFFLE_I]]
+//
float16x8_t test_vcombine_f16(float16x4_t a, float16x4_t b) {
return vcombine_f16(a, b);
}
-// CHECK-LABEL: @test_vcombine_f32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK: ret <4 x float> [[SHUFFLE_I]]
+// CHECK-LABEL: define <4 x float> @test_vcombine_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[B]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-NEXT: ret <4 x float> [[SHUFFLE_I]]
+//
float32x4_t test_vcombine_f32(float32x2_t a, float32x2_t b) {
return vcombine_f32(a, b);
}
-// CHECK-LABEL: @test_vcombine_u8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define <16 x i8> @test_vcombine_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
uint8x16_t test_vcombine_u8(uint8x8_t a, uint8x8_t b) {
return vcombine_u8(a, b);
}
-// CHECK-LABEL: @test_vcombine_u16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define <8 x i16> @test_vcombine_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
uint16x8_t test_vcombine_u16(uint16x4_t a, uint16x4_t b) {
return vcombine_u16(a, b);
}
-// CHECK-LABEL: @test_vcombine_u32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK: ret <4 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define <4 x i32> @test_vcombine_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]]
+//
uint32x4_t test_vcombine_u32(uint32x2_t a, uint32x2_t b) {
return vcombine_u32(a, b);
}
-// CHECK-LABEL: @test_vcombine_u64(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <1 x i64> %a, <1 x i64> %b, <2 x i32> <i32 0, i32 1>
-// CHECK: ret <2 x i64> [[SHUFFLE_I]]
+// CHECK-LABEL: define <2 x i64> @test_vcombine_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <1 x i64> [[A]], <1 x i64> [[B]], <2 x i32> <i32 0, i32 1>
+// CHECK-NEXT: ret <2 x i64> [[SHUFFLE_I]]
+//
uint64x2_t test_vcombine_u64(uint64x1_t a, uint64x1_t b) {
return vcombine_u64(a, b);
}
-// CHECK-LABEL: @test_vcombine_p8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define <16 x i8> @test_vcombine_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
poly8x16_t test_vcombine_p8(poly8x8_t a, poly8x8_t b) {
return vcombine_p8(a, b);
}
-// CHECK-LABEL: @test_vcombine_p16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define <8 x i16> @test_vcombine_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
poly16x8_t test_vcombine_p16(poly16x4_t a, poly16x4_t b) {
return vcombine_p16(a, b);
}
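
Every vcombine_* variant lowers to a single shufflevector whose mask <0 .. 2N-1> is plain concatenation: lanes 0..N-1 come from `a`, lanes N..2N-1 from `b`. A scalar sketch of the 8-bit case (`combine8_ref` is a hypothetical reference, not part of arm_neon.h or this patch):

#include <stdint.h>
// Scalar model of vcombine_s8: result lane i is a[i] for i < 8, else b[i-8].
static void combine8_ref(const int8_t a[8], const int8_t b[8], int8_t r[16]) {
  for (int i = 0; i < 16; ++i)
    r[i] = (i < 8) ? a[i] : b[i - 8];
}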
-// CHECK-LABEL: @test_vcreate_s8(
-// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <8 x i8>
-// CHECK: [[VCLZ_V_I:%.*]] = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> [[TMP0]], i1 false)
-// CHECK: ret <8 x i8> [[VCLZ_V_I]]
+// CHECK-LABEL: define <8 x i8> @test_vcreate_s8(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[A]] to <8 x i8>
+// CHECK-NEXT: [[VCLZ_V_I:%.*]] = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> [[TMP0]], i1 false)
+// CHECK-NEXT: ret <8 x i8> [[VCLZ_V_I]]
+//
int8x8_t test_vcreate_s8(uint64_t a) {
return vclz_s8(vcreate_s8(a));
}
-// CHECK-LABEL: @test_vcreate_imm
-// CHECK: [[RES:%.*]] = bitcast i64 0 to <4 x i16>
-// CHECK: ret <4 x i16> [[RES]]
+// CHECK-LABEL: define <4 x i16> @test_vcreate_imm(
+// CHECK-SAME: ) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 0 to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
int16x4_t test_vcreate_imm(void) {
return vcreate_s16(0);
}
-// CHECK-LABEL: @test_vcreate_s16(
-// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <4 x i16>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK: [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> [[TMP0]], i1 false)
-// CHECK: [[VCLZ_V2_I:%.*]] = bitcast <4 x i16> [[VCLZ_V1_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VCLZ_V1_I]]
+// CHECK-LABEL: define <4 x i16> @test_vcreate_s16(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[A]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> [[TMP0]], i1 false)
+// CHECK-NEXT: [[VCLZ_V2_I:%.*]] = bitcast <4 x i16> [[VCLZ_V1_I]] to <8 x i8>
+// CHECK-NEXT: ret <4 x i16> [[VCLZ_V1_I]]
+//
int16x4_t test_vcreate_s16(uint64_t a) {
return vclz_s16(vcreate_s16(a));
}
-// CHECK-LABEL: @test_vcreate_s32(
-// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <2 x i32>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
-// CHECK: [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[TMP0]], i1 false)
-// CHECK: [[VCLZ_V2_I:%.*]] = bitcast <2 x i32> [[VCLZ_V1_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VCLZ_V1_I]]
+// CHECK-LABEL: define <2 x i32> @test_vcreate_s32(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[A]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[TMP0]], i1 false)
+// CHECK-NEXT: [[VCLZ_V2_I:%.*]] = bitcast <2 x i32> [[VCLZ_V1_I]] to <8 x i8>
+// CHECK-NEXT: ret <2 x i32> [[VCLZ_V1_I]]
+//
int32x2_t test_vcreate_s32(uint64_t a) {
return vclz_s32(vcreate_s32(a));
}
-// CHECK-LABEL: @test_vcreate_f16(
-// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <4 x half>
-// CHECK: ret <4 x half> [[TMP0]]
+// CHECK-LABEL: define <4 x half> @test_vcreate_f16(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[A]] to <4 x half>
+// CHECK-NEXT: ret <4 x half> [[TMP0]]
+//
float16x4_t test_vcreate_f16(uint64_t a) {
return vcreate_f16(a);
}
-// CHECK-LABEL: @test_vcreate_f32(
-// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <2 x float>
-// CHECK: ret <2 x float> [[TMP0]]
+// CHECK-LABEL: define <2 x float> @test_vcreate_f32(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[A]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[TMP0]]
+//
float32x2_t test_vcreate_f32(uint64_t a) {
return vcreate_f32(a);
}
-// CHECK-LABEL: @test_vcreate_u8(
-// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <8 x i8>
-// CHECK: [[VCLZ_V_I:%.*]] = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> [[TMP0]], i1 false)
-// CHECK: ret <8 x i8> [[VCLZ_V_I]]
+// CHECK-LABEL: define <8 x i8> @test_vcreate_u8(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[A]] to <8 x i8>
+// CHECK-NEXT: [[VCLZ_V_I:%.*]] = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> [[TMP0]], i1 false)
+// CHECK-NEXT: ret <8 x i8> [[VCLZ_V_I]]
+//
int8x8_t test_vcreate_u8(uint64_t a) {
return vclz_s8((int8x8_t)vcreate_u8(a));
}
-// CHECK-LABEL: @test_vcreate_u16(
-// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <4 x i16>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK: [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> [[TMP0]], i1 false)
-// CHECK: [[VCLZ_V2_I:%.*]] = bitcast <4 x i16> [[VCLZ_V1_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VCLZ_V1_I]]
+// CHECK-LABEL: define <4 x i16> @test_vcreate_u16(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[A]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> [[TMP0]], i1 false)
+// CHECK-NEXT: [[VCLZ_V2_I:%.*]] = bitcast <4 x i16> [[VCLZ_V1_I]] to <8 x i8>
+// CHECK-NEXT: ret <4 x i16> [[VCLZ_V1_I]]
+//
int16x4_t test_vcreate_u16(uint64_t a) {
return vclz_s16((int16x4_t)vcreate_u16(a));
}
-// CHECK-LABEL: @test_vcreate_u32(
-// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <2 x i32>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
-// CHECK: [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[TMP0]], i1 false)
-// CHECK: [[VCLZ_V2_I:%.*]] = bitcast <2 x i32> [[VCLZ_V1_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VCLZ_V1_I]]
+// CHECK-LABEL: define <2 x i32> @test_vcreate_u32(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[A]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[TMP0]], i1 false)
+// CHECK-NEXT: [[VCLZ_V2_I:%.*]] = bitcast <2 x i32> [[VCLZ_V1_I]] to <8 x i8>
+// CHECK-NEXT: ret <2 x i32> [[VCLZ_V1_I]]
+//
int32x2_t test_vcreate_u32(uint64_t a) {
return vclz_s32((int32x2_t)vcreate_u32(a));
}
-// CHECK-LABEL: @test_vcreate_u64(
-// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <1 x i64>
-// CHECK: [[ADD_I:%.*]] = add <1 x i64> [[TMP0]], [[TMP0]]
-// CHECK: ret <1 x i64> [[ADD_I]]
+// CHECK-LABEL: define <1 x i64> @test_vcreate_u64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[A]] to <1 x i64>
+// CHECK-NEXT: [[ADD_I:%.*]] = add <1 x i64> [[TMP0]], [[TMP0]]
+// CHECK-NEXT: ret <1 x i64> [[ADD_I]]
+//
uint64x1_t test_vcreate_u64(uint64_t a) {
uint64x1_t tmp = vcreate_u64(a);
return vadd_u64(tmp, tmp);
}
-// CHECK-LABEL: @test_vcreate_p8(
-// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <8 x i8>
-// CHECK: [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> [[TMP0]])
-// CHECK: ret <8 x i8> [[VCNT_V_I]]
+// CHECK-LABEL: define <8 x i8> @test_vcreate_p8(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[A]] to <8 x i8>
+// CHECK-NEXT: [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> [[TMP0]])
+// CHECK-NEXT: ret <8 x i8> [[VCNT_V_I]]
+//
poly8x8_t test_vcreate_p8(uint64_t a) {
return vcnt_p8(vcreate_p8(a));
}
-// CHECK-LABEL: @test_vcreate_p16(
-// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <4 x i16>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]])
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP4]]
+// CHECK-LABEL: define <4 x i16> @test_vcreate_p16(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[A]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP4]]
+//
poly16x4_t test_vcreate_p16(uint64_t a) {
poly16x4_t tmp = vcreate_p16(a);
return vbsl_p16((uint16x4_t)tmp, tmp, tmp);
}
-// CHECK-LABEL: @test_vcreate_s64(
-// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <1 x i64>
-// CHECK: [[ADD_I:%.*]] = add <1 x i64> [[TMP0]], [[TMP0]]
-// CHECK: ret <1 x i64> [[ADD_I]]
+// CHECK-LABEL: define <1 x i64> @test_vcreate_s64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[A]] to <1 x i64>
+// CHECK-NEXT: [[ADD_I:%.*]] = add <1 x i64> [[TMP0]], [[TMP0]]
+// CHECK-NEXT: ret <1 x i64> [[ADD_I]]
+//
int64x1_t test_vcreate_s64(uint64_t a) {
int64x1_t tmp = vcreate_s64(a);
return vadd_s64(tmp, tmp);
}
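
vcreate_* is a pure reinterpretation: the `bitcast i64 ... to <...>` in the new checks has no runtime cost. On a little-endian target such as the thumbv7s triple used here it behaves like the following sketch (`create8_ref` is a hypothetical helper, not part of the patch):

#include <stdint.h>
#include <string.h>
// Scalar model of vcreate_u8 on a little-endian target: lane i of the
// result holds bits [8*i, 8*i+7] of the 64-bit input.
static void create8_ref(uint64_t a, uint8_t lanes[8]) {
  memcpy(lanes, &a, sizeof a);  // same bit pattern, vector view
}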
-// CHECK-LABEL: @test_vcvt_f16_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[VCVT_F16_F321_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float> %a)
-// CHECK: [[VCVT_F16_F322_I:%.*]] = bitcast <4 x i16> [[VCVT_F16_F321_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VCVT_F16_F322_I]] to <4 x half>
-// CHECK: ret <4 x half> [[TMP1]]
+// CHECK-LABEL: define <4 x half> @test_vcvt_f16_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VCVT_F16_F321_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float> [[A]])
+// CHECK-NEXT: [[VCVT_F16_F322_I:%.*]] = bitcast <4 x i16> [[VCVT_F16_F321_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[VCVT_F16_F322_I]] to <4 x half>
+// CHECK-NEXT: ret <4 x half> [[TMP1]]
+//
float16x4_t test_vcvt_f16_f32(float32x4_t a) {
return vcvt_f16_f32(a);
}
-// CHECK-LABEL: @test_vcvt_f32_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VCVT_I:%.*]] = sitofp <2 x i32> %a to <2 x float>
-// CHECK: ret <2 x float> [[VCVT_I]]
+// CHECK-LABEL: define <2 x float> @test_vcvt_f32_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VCVT_I:%.*]] = sitofp <2 x i32> [[A]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[VCVT_I]]
+//
float32x2_t test_vcvt_f32_s32(int32x2_t a) {
return vcvt_f32_s32(a);
}
-// CHECK-LABEL: @test_vcvt_f32_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VCVT_I:%.*]] = uitofp <2 x i32> %a to <2 x float>
-// CHECK: ret <2 x float> [[VCVT_I]]
+// CHECK-LABEL: define <2 x float> @test_vcvt_f32_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VCVT_I:%.*]] = uitofp <2 x i32> [[A]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[VCVT_I]]
+//
float32x2_t test_vcvt_f32_u32(uint32x2_t a) {
return vcvt_f32_u32(a);
}
-// CHECK-LABEL: @test_vcvtq_f32_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VCVT_I:%.*]] = sitofp <4 x i32> %a to <4 x float>
-// CHECK: ret <4 x float> [[VCVT_I]]
+// CHECK-LABEL: define <4 x float> @test_vcvtq_f32_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VCVT_I:%.*]] = sitofp <4 x i32> [[A]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[VCVT_I]]
+//
float32x4_t test_vcvtq_f32_s32(int32x4_t a) {
return vcvtq_f32_s32(a);
}
-// CHECK-LABEL: @test_vcvtq_f32_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VCVT_I:%.*]] = uitofp <4 x i32> %a to <4 x float>
-// CHECK: ret <4 x float> [[VCVT_I]]
+// CHECK-LABEL: define <4 x float> @test_vcvtq_f32_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VCVT_I:%.*]] = uitofp <4 x i32> [[A]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[VCVT_I]]
+//
float32x4_t test_vcvtq_f32_u32(uint32x4_t a) {
return vcvtq_f32_u32(a);
}
-// CHECK-LABEL: @test_vcvt_f32_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
-// CHECK: [[VCVT_F32_F16_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VCVT_F32_F161_I:%.*]] = call <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16> [[VCVT_F32_F16_I]])
-// CHECK: [[VCVT_F32_F162_I:%.*]] = bitcast <4 x float> [[VCVT_F32_F161_I]] to <16 x i8>
-// CHECK: ret <4 x float> [[VCVT_F32_F161_I]]
+// CHECK-LABEL: define <4 x float> @test_vcvt_f32_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VCVT_F32_F16_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VCVT_F32_F161_I:%.*]] = call <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16> [[VCVT_F32_F16_I]])
+// CHECK-NEXT: [[VCVT_F32_F162_I:%.*]] = bitcast <4 x float> [[VCVT_F32_F161_I]] to <16 x i8>
+// CHECK-NEXT: ret <4 x float> [[VCVT_F32_F161_I]]
+//
float32x4_t test_vcvt_f32_f16(float16x4_t a) {
return vcvt_f32_f16(a);
}
-// CHECK-LABEL: @test_vcvt_n_f32_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VCVT_N1:%.*]] = call <2 x float> @llvm.arm.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32> [[VCVT_N]], i32 1)
-// CHECK: ret <2 x float> [[VCVT_N1]]
+// CHECK-LABEL: define <2 x float> @test_vcvt_n_f32_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VCVT_N1:%.*]] = call <2 x float> @llvm.arm.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32> [[VCVT_N]], i32 1)
+// CHECK-NEXT: ret <2 x float> [[VCVT_N1]]
+//
float32x2_t test_vcvt_n_f32_s32(int32x2_t a) {
return vcvt_n_f32_s32(a, 1);
}
-// CHECK-LABEL: @test_vcvt_n_f32_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VCVT_N1:%.*]] = call <2 x float> @llvm.arm.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32> [[VCVT_N]], i32 1)
-// CHECK: ret <2 x float> [[VCVT_N1]]
+// CHECK-LABEL: define <2 x float> @test_vcvt_n_f32_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VCVT_N1:%.*]] = call <2 x float> @llvm.arm.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32> [[VCVT_N]], i32 1)
+// CHECK-NEXT: ret <2 x float> [[VCVT_N1]]
+//
float32x2_t test_vcvt_n_f32_u32(uint32x2_t a) {
return vcvt_n_f32_u32(a, 1);
}
-// CHECK-LABEL: @test_vcvtq_n_f32_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VCVT_N1:%.*]] = call <4 x float> @llvm.arm.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32> [[VCVT_N]], i32 3)
-// CHECK: ret <4 x float> [[VCVT_N1]]
+// CHECK-LABEL: define <4 x float> @test_vcvtq_n_f32_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VCVT_N1:%.*]] = call <4 x float> @llvm.arm.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32> [[VCVT_N]], i32 3)
+// CHECK-NEXT: ret <4 x float> [[VCVT_N1]]
+//
float32x4_t test_vcvtq_n_f32_s32(int32x4_t a) {
return vcvtq_n_f32_s32(a, 3);
}
-// CHECK-LABEL: @test_vcvtq_n_f32_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VCVT_N1:%.*]] = call <4 x float> @llvm.arm.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32> [[VCVT_N]], i32 3)
-// CHECK: ret <4 x float> [[VCVT_N1]]
+// CHECK-LABEL: define <4 x float> @test_vcvtq_n_f32_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VCVT_N1:%.*]] = call <4 x float> @llvm.arm.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32> [[VCVT_N]], i32 3)
+// CHECK-NEXT: ret <4 x float> [[VCVT_N1]]
+//
float32x4_t test_vcvtq_n_f32_u32(uint32x4_t a) {
return vcvtq_n_f32_u32(a, 3);
}
-// CHECK-LABEL: @test_vcvt_n_s32_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK: [[VCVT_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float> [[VCVT_N]], i32 1)
-// CHECK: ret <2 x i32> [[VCVT_N1]]
+// CHECK-LABEL: define <2 x i32> @test_vcvt_n_s32_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK-NEXT: [[VCVT_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float> [[VCVT_N]], i32 1)
+// CHECK-NEXT: ret <2 x i32> [[VCVT_N1]]
+//
int32x2_t test_vcvt_n_s32_f32(float32x2_t a) {
return vcvt_n_s32_f32(a, 1);
}
-// CHECK-LABEL: @test_vcvtq_n_s32_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK: [[VCVT_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float> [[VCVT_N]], i32 3)
-// CHECK: ret <4 x i32> [[VCVT_N1]]
+// CHECK-LABEL: define <4 x i32> @test_vcvtq_n_s32_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK-NEXT: [[VCVT_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float> [[VCVT_N]], i32 3)
+// CHECK-NEXT: ret <4 x i32> [[VCVT_N1]]
+//
int32x4_t test_vcvtq_n_s32_f32(float32x4_t a) {
return vcvtq_n_s32_f32(a, 3);
}
-// CHECK-LABEL: @test_vcvt_n_u32_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK: [[VCVT_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtfp2fxu.v2i32.v2f32(<2 x float> [[VCVT_N]], i32 1)
-// CHECK: ret <2 x i32> [[VCVT_N1]]
+// CHECK-LABEL: define <2 x i32> @test_vcvt_n_u32_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK-NEXT: [[VCVT_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtfp2fxu.v2i32.v2f32(<2 x float> [[VCVT_N]], i32 1)
+// CHECK-NEXT: ret <2 x i32> [[VCVT_N1]]
+//
uint32x2_t test_vcvt_n_u32_f32(float32x2_t a) {
return vcvt_n_u32_f32(a, 1);
}
-// CHECK-LABEL: @test_vcvtq_n_u32_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK: [[VCVT_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float> [[VCVT_N]], i32 3)
-// CHECK: ret <4 x i32> [[VCVT_N1]]
+// CHECK-LABEL: define <4 x i32> @test_vcvtq_n_u32_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK-NEXT: [[VCVT_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float> [[VCVT_N]], i32 3)
+// CHECK-NEXT: ret <4 x i32> [[VCVT_N1]]
+//
uint32x4_t test_vcvtq_n_u32_f32(float32x4_t a) {
return vcvtq_n_u32_f32(a, 3);
}
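
The _n_ conversions above carry a fractional-bit count as the trailing immediate operand of the intrinsic call (the `i32 1` / `i32 3`). As a scalar model, assuming 1 <= fbits <= 31: fixed-to-float divides by 2^fbits, float-to-fixed multiplies by 2^fbits and truncates toward zero. The hardware operation additionally saturates out-of-range results, which this hypothetical sketch (not part of the patch) does not attempt:

#include <stdint.h>
// Scalar model of vcvt_n_f32_s32: interpret `a` as fixed point with
// `fbits` fractional bits.
static inline float fixed_to_float_ref(int32_t a, int fbits) {
  return (float)a / (float)(1u << fbits);
}
// Scalar model of vcvt_n_s32_f32: scale up and truncate toward zero
// (the real intrinsic saturates instead of overflowing).
static inline int32_t float_to_fixed_ref(float a, int fbits) {
  return (int32_t)(a * (float)(1u << fbits));
}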
-// CHECK-LABEL: @test_vcvt_s32_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[VCVT_I:%.*]] = fptosi <2 x float> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[VCVT_I]]
+// CHECK-LABEL: define <2 x i32> @test_vcvt_s32_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VCVT_I:%.*]] = fptosi <2 x float> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[VCVT_I]]
+//
int32x2_t test_vcvt_s32_f32(float32x2_t a) {
return vcvt_s32_f32(a);
}
-// CHECK-LABEL: @test_vcvtq_s32_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[VCVT_I:%.*]] = fptosi <4 x float> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[VCVT_I]]
+// CHECK-LABEL: define <4 x i32> @test_vcvtq_s32_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VCVT_I:%.*]] = fptosi <4 x float> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[VCVT_I]]
+//
int32x4_t test_vcvtq_s32_f32(float32x4_t a) {
return vcvtq_s32_f32(a);
}
-// CHECK-LABEL: @test_vcvt_u32_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[VCVT_I:%.*]] = fptoui <2 x float> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[VCVT_I]]
+// CHECK-LABEL: define <2 x i32> @test_vcvt_u32_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VCVT_I:%.*]] = fptoui <2 x float> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[VCVT_I]]
+//
uint32x2_t test_vcvt_u32_f32(float32x2_t a) {
return vcvt_u32_f32(a);
}
-// CHECK-LABEL: @test_vcvtq_u32_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[VCVT_I:%.*]] = fptoui <4 x float> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[VCVT_I]]
+// CHECK-LABEL: define <4 x i32> @test_vcvtq_u32_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VCVT_I:%.*]] = fptoui <4 x float> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[VCVT_I]]
+//
uint32x4_t test_vcvtq_u32_f32(float32x4_t a) {
return vcvtq_u32_f32(a);
}
-// CHECK-LABEL: @test_vdup_lane_u8(
-// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-// CHECK: ret <8 x i8> [[SHUFFLE]]
+// CHECK-LABEL: define <8 x i8> @test_vdup_lane_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[A]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT: ret <8 x i8> [[LANE]]
+//
uint8x8_t test_vdup_lane_u8(uint8x8_t a) {
return vdup_lane_u8(a, 7);
}
-// CHECK-LABEL: @test_vdup_lane_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK: ret <4 x i16> [[LANE]]
+// CHECK-LABEL: define <4 x i16> @test_vdup_lane_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: ret <4 x i16> [[LANE]]
+//
uint16x4_t test_vdup_lane_u16(uint16x4_t a) {
return vdup_lane_u16(a, 3);
}
-// CHECK-LABEL: @test_vdup_lane_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK: ret <2 x i32> [[LANE]]
+// CHECK-LABEL: define <2 x i32> @test_vdup_lane_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: ret <2 x i32> [[LANE]]
+//
uint32x2_t test_vdup_lane_u32(uint32x2_t a) {
return vdup_lane_u32(a, 1);
}
-// CHECK-LABEL: @test_vdup_lane_s8(
-// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-// CHECK: ret <8 x i8> [[SHUFFLE]]
+// CHECK-LABEL: define <8 x i8> @test_vdup_lane_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[A]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT: ret <8 x i8> [[LANE]]
+//
int8x8_t test_vdup_lane_s8(int8x8_t a) {
return vdup_lane_s8(a, 7);
}
-// CHECK-LABEL: @test_vdup_lane_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK: ret <4 x i16> [[LANE]]
+// CHECK-LABEL: define <4 x i16> @test_vdup_lane_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: ret <4 x i16> [[LANE]]
+//
int16x4_t test_vdup_lane_s16(int16x4_t a) {
return vdup_lane_s16(a, 3);
}
-// CHECK-LABEL: @test_vdup_lane_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK: ret <2 x i32> [[LANE]]
+// CHECK-LABEL: define <2 x i32> @test_vdup_lane_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: ret <2 x i32> [[LANE]]
+//
int32x2_t test_vdup_lane_s32(int32x2_t a) {
return vdup_lane_s32(a, 1);
}
-// CHECK-LABEL: @test_vdup_lane_p8(
-// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-// CHECK: ret <8 x i8> [[SHUFFLE]]
+// CHECK-LABEL: define <8 x i8> @test_vdup_lane_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[A]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT: ret <8 x i8> [[LANE]]
+//
poly8x8_t test_vdup_lane_p8(poly8x8_t a) {
return vdup_lane_p8(a, 7);
}
-// CHECK-LABEL: @test_vdup_lane_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK: ret <4 x i16> [[LANE]]
+// CHECK-LABEL: define <4 x i16> @test_vdup_lane_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: ret <4 x i16> [[LANE]]
+//
poly16x4_t test_vdup_lane_p16(poly16x4_t a) {
return vdup_lane_p16(a, 3);
}
-// CHECK-LABEL: @test_vdup_lane_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK: ret <2 x float> [[LANE]]
+// CHECK-LABEL: define <2 x float> @test_vdup_lane_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: ret <2 x float> [[LANE]]
+//
float32x2_t test_vdup_lane_f32(float32x2_t a) {
return vdup_lane_f32(a, 1);
}
-// CHECK-LABEL: @test_vdupq_lane_u8(
-// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <16 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-// CHECK: ret <16 x i8> [[SHUFFLE]]
+// CHECK-LABEL: define <16 x i8> @test_vdupq_lane_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[A]], <16 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT: ret <16 x i8> [[LANE]]
+//
uint8x16_t test_vdupq_lane_u8(uint8x8_t a) {
return vdupq_lane_u8(a, 7);
}
-// CHECK-LABEL: @test_vdupq_lane_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-// CHECK: ret <8 x i16> [[LANE]]
+// CHECK-LABEL: define <8 x i16> @test_vdupq_lane_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: ret <8 x i16> [[LANE]]
+//
uint16x8_t test_vdupq_lane_u16(uint16x4_t a) {
return vdupq_lane_u16(a, 3);
}
-// CHECK-LABEL: @test_vdupq_lane_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-// CHECK: ret <4 x i32> [[LANE]]
+// CHECK-LABEL: define <4 x i32> @test_vdupq_lane_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK-NEXT: ret <4 x i32> [[LANE]]
+//
uint32x4_t test_vdupq_lane_u32(uint32x2_t a) {
return vdupq_lane_u32(a, 1);
}
-// CHECK-LABEL: @test_vdupq_lane_s8(
-// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <16 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-// CHECK: ret <16 x i8> [[SHUFFLE]]
+// CHECK-LABEL: define <16 x i8> @test_vdupq_lane_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[A]], <16 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT: ret <16 x i8> [[LANE]]
+//
int8x16_t test_vdupq_lane_s8(int8x8_t a) {
return vdupq_lane_s8(a, 7);
}
-// CHECK-LABEL: @test_vdupq_lane_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-// CHECK: ret <8 x i16> [[LANE]]
+// CHECK-LABEL: define <8 x i16> @test_vdupq_lane_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: ret <8 x i16> [[LANE]]
+//
int16x8_t test_vdupq_lane_s16(int16x4_t a) {
return vdupq_lane_s16(a, 3);
}
-// CHECK-LABEL: @test_vdupq_lane_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-// CHECK: ret <4 x i32> [[LANE]]
+// CHECK-LABEL: define <4 x i32> @test_vdupq_lane_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK-NEXT: ret <4 x i32> [[LANE]]
+//
int32x4_t test_vdupq_lane_s32(int32x2_t a) {
return vdupq_lane_s32(a, 1);
}
-// CHECK-LABEL: @test_vdupq_lane_p8(
-// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <16 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-// CHECK: ret <16 x i8> [[SHUFFLE]]
+// CHECK-LABEL: define <16 x i8> @test_vdupq_lane_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[A]], <16 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT: ret <16 x i8> [[LANE]]
+//
poly8x16_t test_vdupq_lane_p8(poly8x8_t a) {
return vdupq_lane_p8(a, 7);
}
-// CHECK-LABEL: @test_vdupq_lane_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-// CHECK: ret <8 x i16> [[LANE]]
+// CHECK-LABEL: define <8 x i16> @test_vdupq_lane_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: ret <8 x i16> [[LANE]]
+//
poly16x8_t test_vdupq_lane_p16(poly16x4_t a) {
return vdupq_lane_p16(a, 3);
}
-// CHECK-LABEL: @test_vdupq_lane_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-// CHECK: ret <4 x float> [[LANE]]
+// CHECK-LABEL: define <4 x float> @test_vdupq_lane_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK-NEXT: ret <4 x float> [[LANE]]
+//
float32x4_t test_vdupq_lane_f32(float32x2_t a) {
return vdupq_lane_f32(a, 1);
}
-// CHECK-LABEL: @test_vdup_lane_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> [[A:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP1]], <1 x i64> [[TMP1]], <1 x i32> zeroinitializer
-// CHECK: ret <1 x i64> [[LANE]]
+// CHECK-LABEL: define <1 x i64> @test_vdup_lane_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP1]], <1 x i64> [[TMP1]], <1 x i32> zeroinitializer
+// CHECK-NEXT: ret <1 x i64> [[LANE]]
+//
int64x1_t test_vdup_lane_s64(int64x1_t a) {
return vdup_lane_s64(a, 0);
}
-// CHECK-LABEL: @test_vdup_lane_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> [[A:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP1]], <1 x i64> [[TMP1]], <1 x i32> zeroinitializer
-// CHECK: ret <1 x i64> [[LANE]]
+// CHECK-LABEL: define <1 x i64> @test_vdup_lane_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP1]], <1 x i64> [[TMP1]], <1 x i32> zeroinitializer
+// CHECK-NEXT: ret <1 x i64> [[LANE]]
+//
uint64x1_t test_vdup_lane_u64(uint64x1_t a) {
return vdup_lane_u64(a, 0);
}
-// CHECK-LABEL: @test_vdupq_lane_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> [[A:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP1]], <1 x i64> [[TMP1]], <2 x i32> zeroinitializer
-// CHECK: ret <2 x i64> [[LANE]]
+// CHECK-LABEL: define <2 x i64> @test_vdupq_lane_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP1]], <1 x i64> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT: ret <2 x i64> [[LANE]]
+//
int64x2_t test_vdupq_lane_s64(int64x1_t a) {
return vdupq_lane_s64(a, 0);
}
-// CHECK-LABEL: @test_vdupq_lane_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> [[A:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP1]], <1 x i64> [[TMP1]], <2 x i32> zeroinitializer
-// CHECK: ret <2 x i64> [[LANE]]
+// CHECK-LABEL: define <2 x i64> @test_vdupq_lane_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP1]], <1 x i64> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT: ret <2 x i64> [[LANE]]
+//
uint64x2_t test_vdupq_lane_u64(uint64x1_t a) {
return vdupq_lane_u64(a, 0);
}
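
The vdup_lane/vdupq_lane checks above all share one shape: the source vector is round-tripped through an <8 x i8> bitcast and then splatted by a shufflevector whose mask repeats the requested lane. Roughly, in intrinsic terms (a sketch of the semantics, not the generated IR; dupq_lane0 is a hypothetical helper name):

  int64x2_t dupq_lane0(int64x1_t a) {
    int64_t lane = vget_lane_s64(a, 0); /* select the lane */
    return vdupq_n_s64(lane);           /* splat it across both result lanes */
  }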
-// CHECK-LABEL: @test_vdup_n_u8(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i8> poison, i8 %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 %a, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 %a, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 %a, i32 3
-// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 %a, i32 4
-// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 %a, i32 5
-// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 %a, i32 6
-// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 %a, i32 7
-// CHECK: ret <8 x i8> [[VECINIT7_I]]
+// CHECK-LABEL: define <8 x i8> @test_vdup_n_u8(
+// CHECK-SAME: i8 noundef zeroext [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i8> poison, i8 [[A]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 [[A]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 [[A]], i32 2
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 [[A]], i32 3
+// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 [[A]], i32 4
+// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 [[A]], i32 5
+// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 [[A]], i32 6
+// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 [[A]], i32 7
+// CHECK-NEXT: ret <8 x i8> [[VECINIT7_I]]
+//
uint8x8_t test_vdup_n_u8(uint8_t a) {
return vdup_n_u8(a);
}
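
The scalar-argument vdup_n/vdupq_n tests that follow all check the same splat idiom: the scalar goes into lane 0 of a poison vector and is then copied into each remaining lane by a chain of insertelement instructions, one per lane. A scalar sketch of the semantics (splat_u8 is a hypothetical helper; each loop iteration corresponds to one insertelement in the checked IR):

  uint8x8_t splat_u8(uint8_t a) {
    uint8x8_t v;
    for (int i = 0; i < 8; ++i)
      v[i] = a;
    return v;
  }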
-// CHECK-LABEL: @test_vdup_n_u16(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %a, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %a, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %a, i32 3
-// CHECK: ret <4 x i16> [[VECINIT3_I]]
+// CHECK-LABEL: define <4 x i16> @test_vdup_n_u16(
+// CHECK-SAME: i16 noundef zeroext [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[A]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[A]], i32 2
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[A]], i32 3
+// CHECK-NEXT: ret <4 x i16> [[VECINIT3_I]]
+//
uint16x4_t test_vdup_n_u16(uint16_t a) {
return vdup_n_u16(a);
}
-// CHECK-LABEL: @test_vdup_n_u32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %a, i32 1
-// CHECK: ret <2 x i32> [[VECINIT1_I]]
+// CHECK-LABEL: define <2 x i32> @test_vdup_n_u32(
+// CHECK-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[A]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[A]], i32 1
+// CHECK-NEXT: ret <2 x i32> [[VECINIT1_I]]
+//
uint32x2_t test_vdup_n_u32(uint32_t a) {
return vdup_n_u32(a);
}
-// CHECK-LABEL: @test_vdup_n_s8(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i8> poison, i8 %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 %a, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 %a, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 %a, i32 3
-// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 %a, i32 4
-// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 %a, i32 5
-// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 %a, i32 6
-// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 %a, i32 7
-// CHECK: ret <8 x i8> [[VECINIT7_I]]
+// CHECK-LABEL: define <8 x i8> @test_vdup_n_s8(
+// CHECK-SAME: i8 noundef signext [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i8> poison, i8 [[A]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 [[A]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 [[A]], i32 2
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 [[A]], i32 3
+// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 [[A]], i32 4
+// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 [[A]], i32 5
+// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 [[A]], i32 6
+// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 [[A]], i32 7
+// CHECK-NEXT: ret <8 x i8> [[VECINIT7_I]]
+//
int8x8_t test_vdup_n_s8(int8_t a) {
return vdup_n_s8(a);
}
-// CHECK-LABEL: @test_vdup_n_s16(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %a, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %a, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %a, i32 3
-// CHECK: ret <4 x i16> [[VECINIT3_I]]
+// CHECK-LABEL: define <4 x i16> @test_vdup_n_s16(
+// CHECK-SAME: i16 noundef signext [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[A]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[A]], i32 2
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[A]], i32 3
+// CHECK-NEXT: ret <4 x i16> [[VECINIT3_I]]
+//
int16x4_t test_vdup_n_s16(int16_t a) {
return vdup_n_s16(a);
}
-// CHECK-LABEL: @test_vdup_n_s32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %a, i32 1
-// CHECK: ret <2 x i32> [[VECINIT1_I]]
+// CHECK-LABEL: define <2 x i32> @test_vdup_n_s32(
+// CHECK-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[A]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[A]], i32 1
+// CHECK-NEXT: ret <2 x i32> [[VECINIT1_I]]
+//
int32x2_t test_vdup_n_s32(int32_t a) {
return vdup_n_s32(a);
}
-// CHECK-LABEL: @test_vdup_n_p8(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i8> poison, i8 %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 %a, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 %a, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 %a, i32 3
-// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 %a, i32 4
-// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 %a, i32 5
-// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 %a, i32 6
-// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 %a, i32 7
-// CHECK: ret <8 x i8> [[VECINIT7_I]]
+// CHECK-LABEL: define <8 x i8> @test_vdup_n_p8(
+// CHECK-SAME: i8 noundef signext [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i8> poison, i8 [[A]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 [[A]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 [[A]], i32 2
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 [[A]], i32 3
+// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 [[A]], i32 4
+// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 [[A]], i32 5
+// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 [[A]], i32 6
+// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 [[A]], i32 7
+// CHECK-NEXT: ret <8 x i8> [[VECINIT7_I]]
+//
poly8x8_t test_vdup_n_p8(poly8_t a) {
return vdup_n_p8(a);
}
-// CHECK-LABEL: @test_vdup_n_p16(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %a, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %a, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %a, i32 3
-// CHECK: ret <4 x i16> [[VECINIT3_I]]
+// CHECK-LABEL: define <4 x i16> @test_vdup_n_p16(
+// CHECK-SAME: i16 noundef signext [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[A]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[A]], i32 2
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[A]], i32 3
+// CHECK-NEXT: ret <4 x i16> [[VECINIT3_I]]
+//
poly16x4_t test_vdup_n_p16(poly16_t a) {
return vdup_n_p16(a);
}
-// CHECK-LABEL: @test_vdup_n_f16(
-// CHECK: [[TMP0:%.*]] = load half, ptr %a, align 2
-// CHECK: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[TMP0]], i32 0
-// CHECK: [[VECINIT1:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP0]], i32 1
-// CHECK: [[VECINIT2:%.*]] = insertelement <4 x half> [[VECINIT1]], half [[TMP0]], i32 2
-// CHECK: [[VECINIT3:%.*]] = insertelement <4 x half> [[VECINIT2]], half [[TMP0]], i32 3
-// CHECK: ret <4 x half> [[VECINIT3]]
+// CHECK-LABEL: define <4 x half> @test_vdup_n_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load half, ptr [[A]], align 2
+// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[TMP0]], i32 0
+// CHECK-NEXT: [[VECINIT1:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP0]], i32 1
+// CHECK-NEXT: [[VECINIT2:%.*]] = insertelement <4 x half> [[VECINIT1]], half [[TMP0]], i32 2
+// CHECK-NEXT: [[VECINIT3:%.*]] = insertelement <4 x half> [[VECINIT2]], half [[TMP0]], i32 3
+// CHECK-NEXT: ret <4 x half> [[VECINIT3]]
+//
float16x4_t test_vdup_n_f16(float16_t *a) {
return vdup_n_f16(*a);
}
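
Note that the f16 variants take the scalar through a pointer and an explicit load rather than by value; presumably this keeps the test independent of how a bare __fp16 argument would be passed on this target.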
-// CHECK-LABEL: @test_vdup_n_f32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x float> poison, float %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %a, i32 1
-// CHECK: ret <2 x float> [[VECINIT1_I]]
+// CHECK-LABEL: define <2 x float> @test_vdup_n_f32(
+// CHECK-SAME: float noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x float> poison, float [[A]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float [[A]], i32 1
+// CHECK-NEXT: ret <2 x float> [[VECINIT1_I]]
+//
float32x2_t test_vdup_n_f32(float32_t a) {
return vdup_n_f32(a);
}
-// CHECK-LABEL: @test_vdupq_n_u8(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <16 x i8> poison, i8 %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 %a, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 %a, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 %a, i32 3
-// CHECK: [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 %a, i32 4
-// CHECK: [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 %a, i32 5
-// CHECK: [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 %a, i32 6
-// CHECK: [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 %a, i32 7
-// CHECK: [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 %a, i32 8
-// CHECK: [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 %a, i32 9
-// CHECK: [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 %a, i32 10
-// CHECK: [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 %a, i32 11
-// CHECK: [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 %a, i32 12
-// CHECK: [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 %a, i32 13
-// CHECK: [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 %a, i32 14
-// CHECK: [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 %a, i32 15
-// CHECK: ret <16 x i8> [[VECINIT15_I]]
+// CHECK-LABEL: define <16 x i8> @test_vdupq_n_u8(
+// CHECK-SAME: i8 noundef zeroext [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i8> poison, i8 [[A]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 [[A]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 [[A]], i32 2
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 [[A]], i32 3
+// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 [[A]], i32 4
+// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 [[A]], i32 5
+// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 [[A]], i32 6
+// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 [[A]], i32 7
+// CHECK-NEXT: [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 [[A]], i32 8
+// CHECK-NEXT: [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 [[A]], i32 9
+// CHECK-NEXT: [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 [[A]], i32 10
+// CHECK-NEXT: [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 [[A]], i32 11
+// CHECK-NEXT: [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 [[A]], i32 12
+// CHECK-NEXT: [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 [[A]], i32 13
+// CHECK-NEXT: [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 [[A]], i32 14
+// CHECK-NEXT: [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 [[A]], i32 15
+// CHECK-NEXT: ret <16 x i8> [[VECINIT15_I]]
+//
uint8x16_t test_vdupq_n_u8(uint8_t a) {
return vdupq_n_u8(a);
}
-// CHECK-LABEL: @test_vdupq_n_u16(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %a, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %a, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %a, i32 3
-// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %a, i32 4
-// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %a, i32 5
-// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %a, i32 6
-// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %a, i32 7
-// CHECK: ret <8 x i16> [[VECINIT7_I]]
+// CHECK-LABEL: define <8 x i16> @test_vdupq_n_u16(
+// CHECK-SAME: i16 noundef zeroext [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[A]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[A]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[A]], i32 2
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[A]], i32 3
+// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[A]], i32 4
+// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[A]], i32 5
+// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[A]], i32 6
+// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[A]], i32 7
+// CHECK-NEXT: ret <8 x i16> [[VECINIT7_I]]
+//
uint16x8_t test_vdupq_n_u16(uint16_t a) {
return vdupq_n_u16(a);
}
-// CHECK-LABEL: @test_vdupq_n_u32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %a, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %a, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %a, i32 3
-// CHECK: ret <4 x i32> [[VECINIT3_I]]
+// CHECK-LABEL: define <4 x i32> @test_vdupq_n_u32(
+// CHECK-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[A]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[A]], i32 2
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[A]], i32 3
+// CHECK-NEXT: ret <4 x i32> [[VECINIT3_I]]
+//
uint32x4_t test_vdupq_n_u32(uint32_t a) {
return vdupq_n_u32(a);
}
-// CHECK-LABEL: @test_vdupq_n_s8(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <16 x i8> poison, i8 %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 %a, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 %a, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 %a, i32 3
-// CHECK: [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 %a, i32 4
-// CHECK: [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 %a, i32 5
-// CHECK: [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 %a, i32 6
-// CHECK: [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 %a, i32 7
-// CHECK: [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 %a, i32 8
-// CHECK: [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 %a, i32 9
-// CHECK: [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 %a, i32 10
-// CHECK: [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 %a, i32 11
-// CHECK: [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 %a, i32 12
-// CHECK: [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 %a, i32 13
-// CHECK: [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 %a, i32 14
-// CHECK: [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 %a, i32 15
-// CHECK: ret <16 x i8> [[VECINIT15_I]]
+// CHECK-LABEL: define <16 x i8> @test_vdupq_n_s8(
+// CHECK-SAME: i8 noundef signext [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i8> poison, i8 [[A]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 [[A]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 [[A]], i32 2
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 [[A]], i32 3
+// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 [[A]], i32 4
+// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 [[A]], i32 5
+// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 [[A]], i32 6
+// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 [[A]], i32 7
+// CHECK-NEXT: [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 [[A]], i32 8
+// CHECK-NEXT: [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 [[A]], i32 9
+// CHECK-NEXT: [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 [[A]], i32 10
+// CHECK-NEXT: [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 [[A]], i32 11
+// CHECK-NEXT: [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 [[A]], i32 12
+// CHECK-NEXT: [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 [[A]], i32 13
+// CHECK-NEXT: [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 [[A]], i32 14
+// CHECK-NEXT: [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 [[A]], i32 15
+// CHECK-NEXT: ret <16 x i8> [[VECINIT15_I]]
+//
int8x16_t test_vdupq_n_s8(int8_t a) {
return vdupq_n_s8(a);
}
-// CHECK-LABEL: @test_vdupq_n_s16(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %a, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %a, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %a, i32 3
-// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %a, i32 4
-// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %a, i32 5
-// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %a, i32 6
-// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %a, i32 7
-// CHECK: ret <8 x i16> [[VECINIT7_I]]
+// CHECK-LABEL: define <8 x i16> @test_vdupq_n_s16(
+// CHECK-SAME: i16 noundef signext [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[A]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[A]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[A]], i32 2
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[A]], i32 3
+// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[A]], i32 4
+// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[A]], i32 5
+// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[A]], i32 6
+// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[A]], i32 7
+// CHECK-NEXT: ret <8 x i16> [[VECINIT7_I]]
+//
int16x8_t test_vdupq_n_s16(int16_t a) {
return vdupq_n_s16(a);
}
-// CHECK-LABEL: @test_vdupq_n_s32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %a, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %a, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %a, i32 3
-// CHECK: ret <4 x i32> [[VECINIT3_I]]
+// CHECK-LABEL: define <4 x i32> @test_vdupq_n_s32(
+// CHECK-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[A]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[A]], i32 2
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[A]], i32 3
+// CHECK-NEXT: ret <4 x i32> [[VECINIT3_I]]
+//
int32x4_t test_vdupq_n_s32(int32_t a) {
return vdupq_n_s32(a);
}
-// CHECK-LABEL: @test_vdupq_n_p8(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <16 x i8> poison, i8 %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 %a, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 %a, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 %a, i32 3
-// CHECK: [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 %a, i32 4
-// CHECK: [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 %a, i32 5
-// CHECK: [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 %a, i32 6
-// CHECK: [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 %a, i32 7
-// CHECK: [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 %a, i32 8
-// CHECK: [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 %a, i32 9
-// CHECK: [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 %a, i32 10
-// CHECK: [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 %a, i32 11
-// CHECK: [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 %a, i32 12
-// CHECK: [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 %a, i32 13
-// CHECK: [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 %a, i32 14
-// CHECK: [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 %a, i32 15
-// CHECK: ret <16 x i8> [[VECINIT15_I]]
+// CHECK-LABEL: define <16 x i8> @test_vdupq_n_p8(
+// CHECK-SAME: i8 noundef signext [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i8> poison, i8 [[A]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 [[A]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 [[A]], i32 2
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 [[A]], i32 3
+// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 [[A]], i32 4
+// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 [[A]], i32 5
+// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 [[A]], i32 6
+// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 [[A]], i32 7
+// CHECK-NEXT: [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 [[A]], i32 8
+// CHECK-NEXT: [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 [[A]], i32 9
+// CHECK-NEXT: [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 [[A]], i32 10
+// CHECK-NEXT: [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 [[A]], i32 11
+// CHECK-NEXT: [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 [[A]], i32 12
+// CHECK-NEXT: [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 [[A]], i32 13
+// CHECK-NEXT: [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 [[A]], i32 14
+// CHECK-NEXT: [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 [[A]], i32 15
+// CHECK-NEXT: ret <16 x i8> [[VECINIT15_I]]
+//
poly8x16_t test_vdupq_n_p8(poly8_t a) {
return vdupq_n_p8(a);
}
-// CHECK-LABEL: @test_vdupq_n_p16(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %a, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %a, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %a, i32 3
-// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %a, i32 4
-// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %a, i32 5
-// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %a, i32 6
-// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %a, i32 7
-// CHECK: ret <8 x i16> [[VECINIT7_I]]
+// CHECK-LABEL: define <8 x i16> @test_vdupq_n_p16(
+// CHECK-SAME: i16 noundef signext [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[A]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[A]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[A]], i32 2
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[A]], i32 3
+// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[A]], i32 4
+// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[A]], i32 5
+// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[A]], i32 6
+// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[A]], i32 7
+// CHECK-NEXT: ret <8 x i16> [[VECINIT7_I]]
+//
poly16x8_t test_vdupq_n_p16(poly16_t a) {
return vdupq_n_p16(a);
}
-// CHECK-LABEL: @test_vdupq_n_f16(
-// CHECK: [[TMP0:%.*]] = load half, ptr %a, align 2
-// CHECK: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[TMP0]], i32 0
-// CHECK: [[VECINIT1:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP0]], i32 1
-// CHECK: [[VECINIT2:%.*]] = insertelement <8 x half> [[VECINIT1]], half [[TMP0]], i32 2
-// CHECK: [[VECINIT3:%.*]] = insertelement <8 x half> [[VECINIT2]], half [[TMP0]], i32 3
-// CHECK: [[VECINIT4:%.*]] = insertelement <8 x half> [[VECINIT3]], half [[TMP0]], i32 4
-// CHECK: [[VECINIT5:%.*]] = insertelement <8 x half> [[VECINIT4]], half [[TMP0]], i32 5
-// CHECK: [[VECINIT6:%.*]] = insertelement <8 x half> [[VECINIT5]], half [[TMP0]], i32 6
-// CHECK: [[VECINIT7:%.*]] = insertelement <8 x half> [[VECINIT6]], half [[TMP0]], i32 7
-// CHECK: ret <8 x half> [[VECINIT7]]
+// CHECK-LABEL: define <8 x half> @test_vdupq_n_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load half, ptr [[A]], align 2
+// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[TMP0]], i32 0
+// CHECK-NEXT: [[VECINIT1:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP0]], i32 1
+// CHECK-NEXT: [[VECINIT2:%.*]] = insertelement <8 x half> [[VECINIT1]], half [[TMP0]], i32 2
+// CHECK-NEXT: [[VECINIT3:%.*]] = insertelement <8 x half> [[VECINIT2]], half [[TMP0]], i32 3
+// CHECK-NEXT: [[VECINIT4:%.*]] = insertelement <8 x half> [[VECINIT3]], half [[TMP0]], i32 4
+// CHECK-NEXT: [[VECINIT5:%.*]] = insertelement <8 x half> [[VECINIT4]], half [[TMP0]], i32 5
+// CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <8 x half> [[VECINIT5]], half [[TMP0]], i32 6
+// CHECK-NEXT: [[VECINIT7:%.*]] = insertelement <8 x half> [[VECINIT6]], half [[TMP0]], i32 7
+// CHECK-NEXT: ret <8 x half> [[VECINIT7]]
+//
float16x8_t test_vdupq_n_f16(float16_t *a) {
return vdupq_n_f16(*a);
}
-// CHECK-LABEL: @test_vdupq_n_f32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %a, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %a, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %a, i32 3
-// CHECK: ret <4 x float> [[VECINIT3_I]]
+// CHECK-LABEL: define <4 x float> @test_vdupq_n_f32(
+// CHECK-SAME: float noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float [[A]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float [[A]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float [[A]], i32 2
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float [[A]], i32 3
+// CHECK-NEXT: ret <4 x float> [[VECINIT3_I]]
+//
float32x4_t test_vdupq_n_f32(float32_t a) {
return vdupq_n_f32(a);
}
-// CHECK-LABEL: @test_vdup_n_s64(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <1 x i64> poison, i64 %a, i32 0
-// CHECK: [[ADD_I:%.*]] = add <1 x i64> [[VECINIT_I]], [[VECINIT_I]]
-// CHECK: ret <1 x i64> [[ADD_I]]
+// CHECK-LABEL: define <1 x i64> @test_vdup_n_s64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <1 x i64> poison, i64 [[A]], i32 0
+// CHECK-NEXT: [[ADD_I:%.*]] = add <1 x i64> [[VECINIT_I]], [[VECINIT_I]]
+// CHECK-NEXT: ret <1 x i64> [[ADD_I]]
+//
int64x1_t test_vdup_n_s64(int64_t a) {
int64x1_t tmp = vdup_n_s64(a);
return vadd_s64(tmp, tmp);
}
-// CHECK-LABEL: @test_vdup_n_u64(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <1 x i64> poison, i64 %a, i32 0
-// CHECK: [[ADD_I:%.*]] = add <1 x i64> [[VECINIT_I]], [[VECINIT_I]]
-// CHECK: ret <1 x i64> [[ADD_I]]
+// CHECK-LABEL: define <1 x i64> @test_vdup_n_u64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <1 x i64> poison, i64 [[A]], i32 0
+// CHECK-NEXT: [[ADD_I:%.*]] = add <1 x i64> [[VECINIT_I]], [[VECINIT_I]]
+// CHECK-NEXT: ret <1 x i64> [[ADD_I]]
+//
int64x1_t test_vdup_n_u64(uint64_t a) {
int64x1_t tmp = (int64x1_t)vdup_n_u64(a);
return vadd_s64(tmp, tmp);
}
-// CHECK-LABEL: @test_vdupq_n_s64(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i64> poison, i64 %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 %a, i32 1
-// CHECK: [[ADD_I:%.*]] = add <2 x i64> [[VECINIT1_I]], [[VECINIT1_I]]
-// CHECK: ret <2 x i64> [[ADD_I]]
+// CHECK-LABEL: define <2 x i64> @test_vdupq_n_s64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i64> poison, i64 [[A]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 [[A]], i32 1
+// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[VECINIT1_I]], [[VECINIT1_I]]
+// CHECK-NEXT: ret <2 x i64> [[ADD_I]]
+//
int64x2_t test_vdupq_n_s64(int64_t a) {
int64x2_t tmp = vdupq_n_s64(a);
return vaddq_s64(tmp, tmp);
}
-// CHECK-LABEL: @test_vdupq_n_u64(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i64> poison, i64 %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 %a, i32 1
-// CHECK: [[ADD_I:%.*]] = add <2 x i64> [[VECINIT1_I]], [[VECINIT1_I]]
-// CHECK: ret <2 x i64> [[ADD_I]]
+// CHECK-LABEL: define <2 x i64> @test_vdupq_n_u64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i64> poison, i64 [[A]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 [[A]], i32 1
+// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[VECINIT1_I]], [[VECINIT1_I]]
+// CHECK-NEXT: ret <2 x i64> [[ADD_I]]
+//
uint64x2_t test_vdupq_n_u64(uint64_t a) {
uint64x2_t tmp = vdupq_n_u64(a);
return vaddq_u64(tmp, tmp);
}
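
The four 64-bit dup tests above deviate from the earlier ones: they build the splat and immediately add it to itself, so each check covers the insertelement splat followed by a vector add that gives the splatted value a genuine vector use.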
-// CHECK-LABEL: @test_veor_s8(
-// CHECK: [[XOR_I:%.*]] = xor <8 x i8> %a, %b
-// CHECK: ret <8 x i8> [[XOR_I]]
+// CHECK-LABEL: define <8 x i8> @test_veor_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[XOR_I:%.*]] = xor <8 x i8> [[A]], [[B]]
+// CHECK-NEXT: ret <8 x i8> [[XOR_I]]
+//
int8x8_t test_veor_s8(int8x8_t a, int8x8_t b) {
return veor_s8(a, b);
}
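
Every veor/veorq variant in this stretch lowers to a single xor on the operands, so the checks differ only in vector type. With clang's vector extensions the same operation can be written directly; a minimal sketch (xor_u32 is a hypothetical helper):

  uint32x2_t xor_u32(uint32x2_t a, uint32x2_t b) {
    return a ^ b; /* the same `xor <2 x i32>` the checks match */
  }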
-// CHECK-LABEL: @test_veor_s16(
-// CHECK: [[XOR_I:%.*]] = xor <4 x i16> %a, %b
-// CHECK: ret <4 x i16> [[XOR_I]]
+// CHECK-LABEL: define <4 x i16> @test_veor_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[XOR_I:%.*]] = xor <4 x i16> [[A]], [[B]]
+// CHECK-NEXT: ret <4 x i16> [[XOR_I]]
+//
int16x4_t test_veor_s16(int16x4_t a, int16x4_t b) {
return veor_s16(a, b);
}
-// CHECK-LABEL: @test_veor_s32(
-// CHECK: [[XOR_I:%.*]] = xor <2 x i32> %a, %b
-// CHECK: ret <2 x i32> [[XOR_I]]
+// CHECK-LABEL: define <2 x i32> @test_veor_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[XOR_I:%.*]] = xor <2 x i32> [[A]], [[B]]
+// CHECK-NEXT: ret <2 x i32> [[XOR_I]]
+//
int32x2_t test_veor_s32(int32x2_t a, int32x2_t b) {
return veor_s32(a, b);
}
-// CHECK-LABEL: @test_veor_s64(
-// CHECK: [[XOR_I:%.*]] = xor <1 x i64> %a, %b
-// CHECK: ret <1 x i64> [[XOR_I]]
+// CHECK-LABEL: define <1 x i64> @test_veor_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[XOR_I:%.*]] = xor <1 x i64> [[A]], [[B]]
+// CHECK-NEXT: ret <1 x i64> [[XOR_I]]
+//
int64x1_t test_veor_s64(int64x1_t a, int64x1_t b) {
return veor_s64(a, b);
}
-// CHECK-LABEL: @test_veor_u8(
-// CHECK: [[XOR_I:%.*]] = xor <8 x i8> %a, %b
-// CHECK: ret <8 x i8> [[XOR_I]]
+// CHECK-LABEL: define <8 x i8> @test_veor_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[XOR_I:%.*]] = xor <8 x i8> [[A]], [[B]]
+// CHECK-NEXT: ret <8 x i8> [[XOR_I]]
+//
uint8x8_t test_veor_u8(uint8x8_t a, uint8x8_t b) {
return veor_u8(a, b);
}
-// CHECK-LABEL: @test_veor_u16(
-// CHECK: [[XOR_I:%.*]] = xor <4 x i16> %a, %b
-// CHECK: ret <4 x i16> [[XOR_I]]
+// CHECK-LABEL: define <4 x i16> @test_veor_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[XOR_I:%.*]] = xor <4 x i16> [[A]], [[B]]
+// CHECK-NEXT: ret <4 x i16> [[XOR_I]]
+//
uint16x4_t test_veor_u16(uint16x4_t a, uint16x4_t b) {
return veor_u16(a, b);
}
-// CHECK-LABEL: @test_veor_u32(
-// CHECK: [[XOR_I:%.*]] = xor <2 x i32> %a, %b
-// CHECK: ret <2 x i32> [[XOR_I]]
+// CHECK-LABEL: define <2 x i32> @test_veor_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[XOR_I:%.*]] = xor <2 x i32> [[A]], [[B]]
+// CHECK-NEXT: ret <2 x i32> [[XOR_I]]
+//
uint32x2_t test_veor_u32(uint32x2_t a, uint32x2_t b) {
return veor_u32(a, b);
}
-// CHECK-LABEL: @test_veor_u64(
-// CHECK: [[XOR_I:%.*]] = xor <1 x i64> %a, %b
-// CHECK: ret <1 x i64> [[XOR_I]]
+// CHECK-LABEL: define <1 x i64> @test_veor_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[XOR_I:%.*]] = xor <1 x i64> [[A]], [[B]]
+// CHECK-NEXT: ret <1 x i64> [[XOR_I]]
+//
uint64x1_t test_veor_u64(uint64x1_t a, uint64x1_t b) {
return veor_u64(a, b);
}
-// CHECK-LABEL: @test_veorq_s8(
-// CHECK: [[XOR_I:%.*]] = xor <16 x i8> %a, %b
-// CHECK: ret <16 x i8> [[XOR_I]]
+// CHECK-LABEL: define <16 x i8> @test_veorq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[XOR_I:%.*]] = xor <16 x i8> [[A]], [[B]]
+// CHECK-NEXT: ret <16 x i8> [[XOR_I]]
+//
int8x16_t test_veorq_s8(int8x16_t a, int8x16_t b) {
return veorq_s8(a, b);
}
-// CHECK-LABEL: @test_veorq_s16(
-// CHECK: [[XOR_I:%.*]] = xor <8 x i16> %a, %b
-// CHECK: ret <8 x i16> [[XOR_I]]
+// CHECK-LABEL: define <8 x i16> @test_veorq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[XOR_I:%.*]] = xor <8 x i16> [[A]], [[B]]
+// CHECK-NEXT: ret <8 x i16> [[XOR_I]]
+//
int16x8_t test_veorq_s16(int16x8_t a, int16x8_t b) {
return veorq_s16(a, b);
}
-// CHECK-LABEL: @test_veorq_s32(
-// CHECK: [[XOR_I:%.*]] = xor <4 x i32> %a, %b
-// CHECK: ret <4 x i32> [[XOR_I]]
+// CHECK-LABEL: define <4 x i32> @test_veorq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[XOR_I:%.*]] = xor <4 x i32> [[A]], [[B]]
+// CHECK-NEXT: ret <4 x i32> [[XOR_I]]
+//
int32x4_t test_veorq_s32(int32x4_t a, int32x4_t b) {
return veorq_s32(a, b);
}
-// CHECK-LABEL: @test_veorq_s64(
-// CHECK: [[XOR_I:%.*]] = xor <2 x i64> %a, %b
-// CHECK: ret <2 x i64> [[XOR_I]]
+// CHECK-LABEL: define <2 x i64> @test_veorq_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[XOR_I:%.*]] = xor <2 x i64> [[A]], [[B]]
+// CHECK-NEXT: ret <2 x i64> [[XOR_I]]
+//
int64x2_t test_veorq_s64(int64x2_t a, int64x2_t b) {
return veorq_s64(a, b);
}
-// CHECK-LABEL: @test_veorq_u8(
-// CHECK: [[XOR_I:%.*]] = xor <16 x i8> %a, %b
-// CHECK: ret <16 x i8> [[XOR_I]]
+// CHECK-LABEL: define <16 x i8> @test_veorq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[XOR_I:%.*]] = xor <16 x i8> [[A]], [[B]]
+// CHECK-NEXT: ret <16 x i8> [[XOR_I]]
+//
uint8x16_t test_veorq_u8(uint8x16_t a, uint8x16_t b) {
return veorq_u8(a, b);
}
-// CHECK-LABEL: @test_veorq_u16(
-// CHECK: [[XOR_I:%.*]] = xor <8 x i16> %a, %b
-// CHECK: ret <8 x i16> [[XOR_I]]
+// CHECK-LABEL: define <8 x i16> @test_veorq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[XOR_I:%.*]] = xor <8 x i16> [[A]], [[B]]
+// CHECK-NEXT: ret <8 x i16> [[XOR_I]]
+//
uint16x8_t test_veorq_u16(uint16x8_t a, uint16x8_t b) {
return veorq_u16(a, b);
}
-// CHECK-LABEL: @test_veorq_u32(
-// CHECK: [[XOR_I:%.*]] = xor <4 x i32> %a, %b
-// CHECK: ret <4 x i32> [[XOR_I]]
+// CHECK-LABEL: define <4 x i32> @test_veorq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[XOR_I:%.*]] = xor <4 x i32> [[A]], [[B]]
+// CHECK-NEXT: ret <4 x i32> [[XOR_I]]
+//
uint32x4_t test_veorq_u32(uint32x4_t a, uint32x4_t b) {
return veorq_u32(a, b);
}
-// CHECK-LABEL: @test_veorq_u64(
-// CHECK: [[XOR_I:%.*]] = xor <2 x i64> %a, %b
-// CHECK: ret <2 x i64> [[XOR_I]]
+// CHECK-LABEL: define <2 x i64> @test_veorq_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[XOR_I:%.*]] = xor <2 x i64> [[A]], [[B]]
+// CHECK-NEXT: ret <2 x i64> [[XOR_I]]
+//
uint64x2_t test_veorq_u64(uint64x2_t a, uint64x2_t b) {
return veorq_u64(a, b);
}
-// CHECK-LABEL: @test_vext_s8(
-// CHECK: [[VEXT:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
-// CHECK: ret <8 x i8> [[VEXT]]
+// CHECK-LABEL: define <8 x i8> @test_vext_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
+// CHECK-NEXT: ret <8 x i8> [[VEXT]]
+//
int8x8_t test_vext_s8(int8x8_t a, int8x8_t b) {
return vext_s8(a, b, 7);
}
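
The vext tests check an extraction idiom: vext(a, b, n) concatenates a and b and takes a window starting at lane n, which is exactly a shufflevector with indices n, n+1, .... A scalar sketch of the semantics (ext_u8 is a hypothetical helper; the real intrinsic requires a constant n):

  uint8x8_t ext_u8(uint8x8_t a, uint8x8_t b, int n) {
    uint8x8_t r;
    for (int i = 0; i < 8; ++i)
      r[i] = (i + n < 8) ? a[i + n] : b[i + n - 8];
    return r;
  }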
-// CHECK-LABEL: @test_vext_u8(
-// CHECK: [[VEXT:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
-// CHECK: ret <8 x i8> [[VEXT]]
+// CHECK-LABEL: define <8 x i8> @test_vext_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
+// CHECK-NEXT: ret <8 x i8> [[VEXT]]
+//
uint8x8_t test_vext_u8(uint8x8_t a, uint8x8_t b) {
return vext_u8(a, b, 7);
}
-// CHECK-LABEL: @test_vext_p8(
-// CHECK: [[VEXT:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
-// CHECK: ret <8 x i8> [[VEXT]]
+// CHECK-LABEL: define <8 x i8> @test_vext_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
+// CHECK-NEXT: ret <8 x i8> [[VEXT]]
+//
poly8x8_t test_vext_p8(poly8x8_t a, poly8x8_t b) {
return vext_p8(a, b, 7);
}
-// CHECK-LABEL: @test_vext_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VEXT:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-// CHECK: ret <4 x i16> [[VEXT]]
+// CHECK-LABEL: define <4 x i16> @test_vext_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+// CHECK-NEXT: ret <4 x i16> [[VEXT]]
+//
int16x4_t test_vext_s16(int16x4_t a, int16x4_t b) {
return vext_s16(a, b, 3);
}
-// CHECK-LABEL: @test_vext_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VEXT:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-// CHECK: ret <4 x i16> [[VEXT]]
+// CHECK-LABEL: define <4 x i16> @test_vext_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+// CHECK-NEXT: ret <4 x i16> [[VEXT]]
+//
uint16x4_t test_vext_u16(uint16x4_t a, uint16x4_t b) {
return vext_u16(a, b, 3);
}
-// CHECK-LABEL: @test_vext_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VEXT:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-// CHECK: ret <4 x i16> [[VEXT]]
+// CHECK-LABEL: define <4 x i16> @test_vext_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+// CHECK-NEXT: ret <4 x i16> [[VEXT]]
+//
poly16x4_t test_vext_p16(poly16x4_t a, poly16x4_t b) {
return vext_p16(a, b, 3);
}
-// CHECK-LABEL: @test_vext_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VEXT:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> <i32 1, i32 2>
-// CHECK: ret <2 x i32> [[VEXT]]
+// CHECK-LABEL: define <2 x i32> @test_vext_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> <i32 1, i32 2>
+// CHECK-NEXT: ret <2 x i32> [[VEXT]]
+//
int32x2_t test_vext_s32(int32x2_t a, int32x2_t b) {
return vext_s32(a, b, 1);
}
-// CHECK-LABEL: @test_vext_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VEXT:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> <i32 1, i32 2>
-// CHECK: ret <2 x i32> [[VEXT]]
+// CHECK-LABEL: define <2 x i32> @test_vext_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> <i32 1, i32 2>
+// CHECK-NEXT: ret <2 x i32> [[VEXT]]
+//
uint32x2_t test_vext_u32(uint32x2_t a, uint32x2_t b) {
return vext_u32(a, b, 1);
}
-// CHECK-LABEL: @test_vext_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[VEXT:%.*]] = shufflevector <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
-// CHECK: ret <1 x i64> [[VEXT]]
+// CHECK-LABEL: define <1 x i64> @test_vext_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
+// CHECK-NEXT: ret <1 x i64> [[VEXT]]
+//
int64x1_t test_vext_s64(int64x1_t a, int64x1_t b) {
return vext_s64(a, b, 0);
}
-// CHECK-LABEL: @test_vext_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[VEXT:%.*]] = shufflevector <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
-// CHECK: ret <1 x i64> [[VEXT]]
+// CHECK-LABEL: define <1 x i64> @test_vext_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
+// CHECK-NEXT: ret <1 x i64> [[VEXT]]
+//
uint64x1_t test_vext_u64(uint64x1_t a, uint64x1_t b) {
return vext_u64(a, b, 0);
}
-// CHECK-LABEL: @test_vext_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
-// CHECK: [[VEXT:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP3]], <2 x i32> <i32 1, i32 2>
-// CHECK: ret <2 x float> [[VEXT]]
+// CHECK-LABEL: define <2 x float> @test_vext_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP3]], <2 x i32> <i32 1, i32 2>
+// CHECK-NEXT: ret <2 x float> [[VEXT]]
+//
float32x2_t test_vext_f32(float32x2_t a, float32x2_t b) {
return vext_f32(a, b, 1);
}
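
For element types wider than i8 the vext operands are first round-tripped through <8 x i8> (or, for the q forms below, <16 x i8>) bitcasts before the shufflevector; the round-trip comes from the generic operand handling in the Neon intrinsic codegen and would fold away under optimization, but the autogenerated checks record it as emitted.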
-// CHECK-LABEL: @test_vextq_s8(
-// CHECK: [[VEXT:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
-// CHECK: ret <16 x i8> [[VEXT]]
+// CHECK-LABEL: define <16 x i8> @test_vextq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
+// CHECK-NEXT: ret <16 x i8> [[VEXT]]
+//
int8x16_t test_vextq_s8(int8x16_t a, int8x16_t b) {
return vextq_s8(a, b, 15);
}
-// CHECK-LABEL: @test_vextq_u8(
-// CHECK: [[VEXT:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
-// CHECK: ret <16 x i8> [[VEXT]]
+// CHECK-LABEL: define <16 x i8> @test_vextq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
+// CHECK-NEXT: ret <16 x i8> [[VEXT]]
+//
uint8x16_t test_vextq_u8(uint8x16_t a, uint8x16_t b) {
return vextq_u8(a, b, 15);
}
-// CHECK-LABEL: @test_vextq_p8(
-// CHECK: [[VEXT:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
-// CHECK: ret <16 x i8> [[VEXT]]
+// CHECK-LABEL: define <16 x i8> @test_vextq_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
+// CHECK-NEXT: ret <16 x i8> [[VEXT]]
+//
poly8x16_t test_vextq_p8(poly8x16_t a, poly8x16_t b) {
return vextq_p8(a, b, 15);
}
-// CHECK-LABEL: @test_vextq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VEXT:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
-// CHECK: ret <8 x i16> [[VEXT]]
+// CHECK-LABEL: define <8 x i16> @test_vextq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
+// CHECK-NEXT: ret <8 x i16> [[VEXT]]
+//
int16x8_t test_vextq_s16(int16x8_t a, int16x8_t b) {
return vextq_s16(a, b, 7);
}
-// CHECK-LABEL: @test_vextq_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VEXT:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
-// CHECK: ret <8 x i16> [[VEXT]]
+// CHECK-LABEL: define <8 x i16> @test_vextq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
+// CHECK-NEXT: ret <8 x i16> [[VEXT]]
+//
uint16x8_t test_vextq_u16(uint16x8_t a, uint16x8_t b) {
return vextq_u16(a, b, 7);
}
-// CHECK-LABEL: @test_vextq_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VEXT:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
-// CHECK: ret <8 x i16> [[VEXT]]
+// CHECK-LABEL: define <8 x i16> @test_vextq_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
+// CHECK-NEXT: ret <8 x i16> [[VEXT]]
+//
poly16x8_t test_vextq_p16(poly16x8_t a, poly16x8_t b) {
return vextq_p16(a, b, 7);
}
-// CHECK-LABEL: @test_vextq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VEXT:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-// CHECK: ret <4 x i32> [[VEXT]]
+// CHECK-LABEL: define <4 x i32> @test_vextq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+// CHECK-NEXT: ret <4 x i32> [[VEXT]]
+//
int32x4_t test_vextq_s32(int32x4_t a, int32x4_t b) {
return vextq_s32(a, b, 3);
}
-// CHECK-LABEL: @test_vextq_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VEXT:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-// CHECK: ret <4 x i32> [[VEXT]]
+// CHECK-LABEL: define <4 x i32> @test_vextq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+// CHECK-NEXT: ret <4 x i32> [[VEXT]]
+//
uint32x4_t test_vextq_u32(uint32x4_t a, uint32x4_t b) {
return vextq_u32(a, b, 3);
}
-// CHECK-LABEL: @test_vextq_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[VEXT:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i32> <i32 1, i32 2>
-// CHECK: ret <2 x i64> [[VEXT]]
+// CHECK-LABEL: define <2 x i64> @test_vextq_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i32> <i32 1, i32 2>
+// CHECK-NEXT: ret <2 x i64> [[VEXT]]
+//
int64x2_t test_vextq_s64(int64x2_t a, int64x2_t b) {
return vextq_s64(a, b, 1);
}
-// CHECK-LABEL: @test_vextq_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[VEXT:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i32> <i32 1, i32 2>
-// CHECK: ret <2 x i64> [[VEXT]]
+// CHECK-LABEL: define <2 x i64> @test_vextq_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i32> <i32 1, i32 2>
+// CHECK-NEXT: ret <2 x i64> [[VEXT]]
+//
uint64x2_t test_vextq_u64(uint64x2_t a, uint64x2_t b) {
return vextq_u64(a, b, 1);
}
-// CHECK-LABEL: @test_vextq_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
-// CHECK: [[VEXT:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-// CHECK: ret <4 x float> [[VEXT]]
+// CHECK-LABEL: define <4 x float> @test_vextq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+// CHECK-NEXT: ret <4 x float> [[VEXT]]
+//
float32x4_t test_vextq_f32(float32x4_t a, float32x4_t b) {
return vextq_f32(a, b, 3);
}
-// CHECK-LABEL: @test_vfma_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %c to <8 x i8>
-// CHECK: [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> %b, <2 x float> %c, <2 x float> %a)
-// CHECK: ret <2 x float> [[TMP3]]
+// CHECK-LABEL: define <2 x float> @test_vfma_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]], <2 x float> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[C]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[B]], <2 x float> [[C]], <2 x float> [[A]])
+// CHECK-NEXT: ret <2 x float> [[TMP3]]
+//
float32x2_t test_vfma_f32(float32x2_t a, float32x2_t b, float32x2_t c) {
return vfma_f32(a, b, c);
}
-// CHECK-LABEL: @test_vfmaq_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %c to <16 x i8>
-// CHECK: [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> %b, <4 x float> %c, <4 x float> %a)
-// CHECK: ret <4 x float> [[TMP3]]
+// CHECK-LABEL: define <4 x float> @test_vfmaq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]], <4 x float> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[C]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[B]], <4 x float> [[C]], <4 x float> [[A]])
+// CHECK-NEXT: ret <4 x float> [[TMP3]]
+//
float32x4_t test_vfmaq_f32(float32x4_t a, float32x4_t b, float32x4_t c) {
return vfmaq_f32(a, b, c);
}
-// CHECK-LABEL: @test_vfms_f32(
-// CHECK: [[SUB_I:%.*]] = fneg <2 x float> %b
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x float> [[SUB_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %c to <8 x i8>
-// CHECK: [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[SUB_I]], <2 x float> %c, <2 x float> %a)
-// CHECK: ret <2 x float> [[TMP3]]
+// CHECK-LABEL: define <2 x float> @test_vfms_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]], <2 x float> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[FNEG_I:%.*]] = fneg <2 x float> [[B]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[FNEG_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[C]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FNEG_I]], <2 x float> [[C]], <2 x float> [[A]])
+// CHECK-NEXT: ret <2 x float> [[TMP3]]
+//
float32x2_t test_vfms_f32(float32x2_t a, float32x2_t b, float32x2_t c) {
return vfms_f32(a, b, c);
}
-// CHECK-LABEL: @test_vfmsq_f32(
-// CHECK: [[SUB_I:%.*]] = fneg <4 x float> %b
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x float> [[SUB_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %c to <16 x i8>
-// CHECK: [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[SUB_I]], <4 x float> %c, <4 x float> %a)
-// CHECK: ret <4 x float> [[TMP3]]
+// CHECK-LABEL: define <4 x float> @test_vfmsq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]], <4 x float> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[FNEG_I:%.*]] = fneg <4 x float> [[B]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[FNEG_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[C]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FNEG_I]], <4 x float> [[C]], <4 x float> [[A]])
+// CHECK-NEXT: ret <4 x float> [[TMP3]]
+//
float32x4_t test_vfmsq_f32(float32x4_t a, float32x4_t b, float32x4_t c) {
return vfmsq_f32(a, b, c);
}
-// CHECK-LABEL: @test_vget_high_s8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define <8 x i8> @test_vget_high_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[A]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
int8x8_t test_vget_high_s8(int8x16_t a) {
return vget_high_s8(a);
}
-// CHECK-LABEL: @test_vget_high_s16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK: ret <4 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define <4 x i16> @test_vget_high_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
+//
int16x4_t test_vget_high_s16(int16x8_t a) {
return vget_high_s16(a);
}
-// CHECK-LABEL: @test_vget_high_s32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
-// CHECK: ret <2 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define <2 x i32> @test_vget_high_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: ret <2 x i32> [[SHUFFLE_I]]
+//
int32x2_t test_vget_high_s32(int32x4_t a) {
return vget_high_s32(a);
}
-// CHECK-LABEL: @test_vget_high_s64(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> <i32 1>
-// CHECK: ret <1 x i64> [[SHUFFLE_I]]
+// CHECK-LABEL: define <1 x i64> @test_vget_high_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[A]], <1 x i32> <i32 1>
+// CHECK-NEXT: ret <1 x i64> [[SHUFFLE_I]]
+//
int64x1_t test_vget_high_s64(int64x2_t a) {
return vget_high_s64(a);
}
-// CHECK-LABEL: @test_vget_high_f16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x half> %a, <8 x half> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK: ret <4 x half> [[SHUFFLE_I]]
+// CHECK-LABEL: define <4 x half> @test_vget_high_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: ret <4 x half> [[SHUFFLE_I]]
+//
float16x4_t test_vget_high_f16(float16x8_t a) {
return vget_high_f16(a);
}
-// CHECK-LABEL: @test_vget_high_f32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %a, <2 x i32> <i32 2, i32 3>
-// CHECK: ret <2 x float> [[SHUFFLE_I]]
+// CHECK-LABEL: define <2 x float> @test_vget_high_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[A]], <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: ret <2 x float> [[SHUFFLE_I]]
+//
float32x2_t test_vget_high_f32(float32x4_t a) {
return vget_high_f32(a);
}
-// CHECK-LABEL: @test_vget_high_u8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define <8 x i8> @test_vget_high_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[A]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
uint8x8_t test_vget_high_u8(uint8x16_t a) {
return vget_high_u8(a);
}
-// CHECK-LABEL: @test_vget_high_u16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK: ret <4 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define <4 x i16> @test_vget_high_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
+//
uint16x4_t test_vget_high_u16(uint16x8_t a) {
return vget_high_u16(a);
}
-// CHECK-LABEL: @test_vget_high_u32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
-// CHECK: ret <2 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define <2 x i32> @test_vget_high_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: ret <2 x i32> [[SHUFFLE_I]]
+//
uint32x2_t test_vget_high_u32(uint32x4_t a) {
return vget_high_u32(a);
}
-// CHECK-LABEL: @test_vget_high_u64(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> <i32 1>
-// CHECK: ret <1 x i64> [[SHUFFLE_I]]
+// CHECK-LABEL: define <1 x i64> @test_vget_high_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[A]], <1 x i32> <i32 1>
+// CHECK-NEXT: ret <1 x i64> [[SHUFFLE_I]]
+//
uint64x1_t test_vget_high_u64(uint64x2_t a) {
return vget_high_u64(a);
}
-// CHECK-LABEL: @test_vget_high_p8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define <8 x i8> @test_vget_high_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[A]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
poly8x8_t test_vget_high_p8(poly8x16_t a) {
return vget_high_p8(a);
}
-// CHECK-LABEL: @test_vget_high_p16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK: ret <4 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define <4 x i16> @test_vget_high_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
+//
poly16x4_t test_vget_high_p16(poly16x8_t a) {
return vget_high_p16(a);
}
-// CHECK-LABEL: @test_vget_lane_u8(
-// CHECK: [[VGET_LANE:%.*]] = extractelement <8 x i8> %a, i32 7
-// CHECK: ret i8 [[VGET_LANE]]
+// CHECK-LABEL: define zeroext i8 @test_vget_lane_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <8 x i8> [[A]], i32 7
+// CHECK-NEXT: ret i8 [[VGET_LANE]]
+//
uint8_t test_vget_lane_u8(uint8x8_t a) {
return vget_lane_u8(a, 7);
}
-// CHECK-LABEL: @test_vget_lane_u16(
-// CHECK: [[VGET_LANE:%.*]] = extractelement <4 x i16> %a, i32 3
-// CHECK: ret i16 [[VGET_LANE]]
+// CHECK-LABEL: define zeroext i16 @test_vget_lane_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[A]], i32 3
+// CHECK-NEXT: ret i16 [[VGET_LANE]]
+//
uint16_t test_vget_lane_u16(uint16x4_t a) {
return vget_lane_u16(a, 3);
}
-// CHECK-LABEL: @test_vget_lane_u32(
-// CHECK: [[VGET_LANE:%.*]] = extractelement <2 x i32> %a, i32 1
-// CHECK: ret i32 [[VGET_LANE]]
+// CHECK-LABEL: define i32 @test_vget_lane_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[A]], i32 1
+// CHECK-NEXT: ret i32 [[VGET_LANE]]
+//
uint32_t test_vget_lane_u32(uint32x2_t a) {
return vget_lane_u32(a, 1);
}
-// CHECK-LABEL: @test_vget_lane_s8(
-// CHECK: [[VGET_LANE:%.*]] = extractelement <8 x i8> %a, i32 7
-// CHECK: ret i8 [[VGET_LANE]]
+// CHECK-LABEL: define signext i8 @test_vget_lane_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <8 x i8> [[A]], i32 7
+// CHECK-NEXT: ret i8 [[VGET_LANE]]
+//
int8_t test_vget_lane_s8(int8x8_t a) {
return vget_lane_s8(a, 7);
}
-// CHECK-LABEL: @test_vget_lane_s16(
-// CHECK: [[VGET_LANE:%.*]] = extractelement <4 x i16> %a, i32 3
-// CHECK: ret i16 [[VGET_LANE]]
+// CHECK-LABEL: define signext i16 @test_vget_lane_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[A]], i32 3
+// CHECK-NEXT: ret i16 [[VGET_LANE]]
+//
int16_t test_vget_lane_s16(int16x4_t a) {
return vget_lane_s16(a, 3);
}
-// CHECK-LABEL: @test_vget_lane_s32(
-// CHECK: [[VGET_LANE:%.*]] = extractelement <2 x i32> %a, i32 1
-// CHECK: ret i32 [[VGET_LANE]]
+// CHECK-LABEL: define i32 @test_vget_lane_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[A]], i32 1
+// CHECK-NEXT: ret i32 [[VGET_LANE]]
+//
int32_t test_vget_lane_s32(int32x2_t a) {
return vget_lane_s32(a, 1);
}
-// CHECK-LABEL: @test_vget_lane_p8(
-// CHECK: [[VGET_LANE:%.*]] = extractelement <8 x i8> %a, i32 7
-// CHECK: ret i8 [[VGET_LANE]]
+// CHECK-LABEL: define signext i8 @test_vget_lane_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <8 x i8> [[A]], i32 7
+// CHECK-NEXT: ret i8 [[VGET_LANE]]
+//
poly8_t test_vget_lane_p8(poly8x8_t a) {
return vget_lane_p8(a, 7);
}
-// CHECK-LABEL: @test_vget_lane_p16(
-// CHECK: [[VGET_LANE:%.*]] = extractelement <4 x i16> %a, i32 3
-// CHECK: ret i16 [[VGET_LANE]]
+// CHECK-LABEL: define signext i16 @test_vget_lane_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[A]], i32 3
+// CHECK-NEXT: ret i16 [[VGET_LANE]]
+//
poly16_t test_vget_lane_p16(poly16x4_t a) {
return vget_lane_p16(a, 3);
}
-// CHECK-LABEL: @test_vget_lane_f32(
-// CHECK: [[VGET_LANE:%.*]] = extractelement <2 x float> %a, i32 1
-// CHECK: ret float [[VGET_LANE]]
+// CHECK-LABEL: define float @test_vget_lane_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x float> [[A]], i32 1
+// CHECK-NEXT: ret float [[VGET_LANE]]
+//
float32_t test_vget_lane_f32(float32x2_t a) {
return vget_lane_f32(a, 1);
}
-// CHECK-LABEL: @test_vget_lane_f16(
-// CHECK: [[__REINT_242:%.*]] = alloca <4 x half>, align 8
-// CHECK: [[__REINT1_242:%.*]] = alloca i16, align 2
-// CHECK: store <4 x half> %a, ptr [[__REINT_242]], align 8
-// CHECK: [[TMP1:%.*]] = load <4 x i16>, ptr [[__REINT_242]], align 8
-// CHECK: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 1
-// CHECK: store i16 [[VGET_LANE]], ptr [[__REINT1_242]], align 2
-// CHECK: [[TMP5:%.*]] = load half, ptr [[__REINT1_242]], align 2
-// CHECK: [[CONV:%.*]] = fpext half [[TMP5]] to float
-// CHECK: ret float [[CONV]]
+// CHECK-LABEL: define float @test_vget_lane_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__REINT_831:%.*]] = alloca <4 x half>, align 8
+// CHECK-NEXT: [[__REINT1_831:%.*]] = alloca i16, align 2
+// CHECK-NEXT: store <4 x half> [[A]], ptr [[__REINT_831]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__REINT_831]], align 8
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP0]], i32 1
+// CHECK-NEXT: store i16 [[VGET_LANE]], ptr [[__REINT1_831]], align 2
+// CHECK-NEXT: [[TMP1:%.*]] = load half, ptr [[__REINT1_831]], align 2
+// CHECK-NEXT: [[CONV:%.*]] = fpext half [[TMP1]] to float
+// CHECK-NEXT: ret float [[CONV]]
+//
float32_t test_vget_lane_f16(float16x4_t a) {
return vget_lane_f16(a, 1);
}
-// CHECK-LABEL: @test_vgetq_lane_u8(
-// CHECK: [[VGET_LANE:%.*]] = extractelement <16 x i8> %a, i32 15
-// CHECK: ret i8 [[VGET_LANE]]
+// CHECK-LABEL: define zeroext i8 @test_vgetq_lane_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <16 x i8> [[A]], i32 15
+// CHECK-NEXT: ret i8 [[VGET_LANE]]
+//
uint8_t test_vgetq_lane_u8(uint8x16_t a) {
return vgetq_lane_u8(a, 15);
}
-// CHECK-LABEL: @test_vgetq_lane_u16(
-// CHECK: [[VGET_LANE:%.*]] = extractelement <8 x i16> %a, i32 7
-// CHECK: ret i16 [[VGET_LANE]]
+// CHECK-LABEL: define zeroext i16 @test_vgetq_lane_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <8 x i16> [[A]], i32 7
+// CHECK-NEXT: ret i16 [[VGET_LANE]]
+//
uint16_t test_vgetq_lane_u16(uint16x8_t a) {
return vgetq_lane_u16(a, 7);
}
-// CHECK-LABEL: @test_vgetq_lane_u32(
-// CHECK: [[VGET_LANE:%.*]] = extractelement <4 x i32> %a, i32 3
-// CHECK: ret i32 [[VGET_LANE]]
+// CHECK-LABEL: define i32 @test_vgetq_lane_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i32> [[A]], i32 3
+// CHECK-NEXT: ret i32 [[VGET_LANE]]
+//
uint32_t test_vgetq_lane_u32(uint32x4_t a) {
return vgetq_lane_u32(a, 3);
}
-// CHECK-LABEL: @test_vgetq_lane_s8(
-// CHECK: [[VGET_LANE:%.*]] = extractelement <16 x i8> %a, i32 15
-// CHECK: ret i8 [[VGET_LANE]]
+// CHECK-LABEL: define signext i8 @test_vgetq_lane_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <16 x i8> [[A]], i32 15
+// CHECK-NEXT: ret i8 [[VGET_LANE]]
+//
int8_t test_vgetq_lane_s8(int8x16_t a) {
return vgetq_lane_s8(a, 15);
}
-// CHECK-LABEL: @test_vgetq_lane_s16(
-// CHECK: [[VGET_LANE:%.*]] = extractelement <8 x i16> %a, i32 7
-// CHECK: ret i16 [[VGET_LANE]]
+// CHECK-LABEL: define signext i16 @test_vgetq_lane_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <8 x i16> [[A]], i32 7
+// CHECK-NEXT: ret i16 [[VGET_LANE]]
+//
int16_t test_vgetq_lane_s16(int16x8_t a) {
return vgetq_lane_s16(a, 7);
}
-// CHECK-LABEL: @test_vgetq_lane_s32(
-// CHECK: [[VGET_LANE:%.*]] = extractelement <4 x i32> %a, i32 3
-// CHECK: ret i32 [[VGET_LANE]]
+// CHECK-LABEL: define i32 @test_vgetq_lane_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i32> [[A]], i32 3
+// CHECK-NEXT: ret i32 [[VGET_LANE]]
+//
int32_t test_vgetq_lane_s32(int32x4_t a) {
return vgetq_lane_s32(a, 3);
}
-// CHECK-LABEL: @test_vgetq_lane_p8(
-// CHECK: [[VGET_LANE:%.*]] = extractelement <16 x i8> %a, i32 15
-// CHECK: ret i8 [[VGET_LANE]]
+// CHECK-LABEL: define signext i8 @test_vgetq_lane_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <16 x i8> [[A]], i32 15
+// CHECK-NEXT: ret i8 [[VGET_LANE]]
+//
poly8_t test_vgetq_lane_p8(poly8x16_t a) {
return vgetq_lane_p8(a, 15);
}
-// CHECK-LABEL: @test_vgetq_lane_p16(
-// CHECK: [[VGET_LANE:%.*]] = extractelement <8 x i16> %a, i32 7
-// CHECK: ret i16 [[VGET_LANE]]
+// CHECK-LABEL: define signext i16 @test_vgetq_lane_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <8 x i16> [[A]], i32 7
+// CHECK-NEXT: ret i16 [[VGET_LANE]]
+//
poly16_t test_vgetq_lane_p16(poly16x8_t a) {
return vgetq_lane_p16(a, 7);
}
-// CHECK-LABEL: @test_vgetq_lane_f32(
-// CHECK: [[VGET_LANE:%.*]] = extractelement <4 x float> %a, i32 3
-// CHECK: ret float [[VGET_LANE]]
+// CHECK-LABEL: define float @test_vgetq_lane_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x float> [[A]], i32 3
+// CHECK-NEXT: ret float [[VGET_LANE]]
+//
float32_t test_vgetq_lane_f32(float32x4_t a) {
return vgetq_lane_f32(a, 3);
}
-// CHECK-LABEL: @test_vgetq_lane_f16(
-// CHECK: [[__REINT_244:%.*]] = alloca <8 x half>, align 16
-// CHECK: [[__REINT1_244:%.*]] = alloca i16, align 2
-// CHECK: store <8 x half> %a, ptr [[__REINT_244]], align 16
-// CHECK: [[TMP1:%.*]] = load <8 x i16>, ptr [[__REINT_244]], align 16
-// CHECK: [[VGET_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 3
-// CHECK: store i16 [[VGET_LANE]], ptr [[__REINT1_244]], align 2
-// CHECK: [[TMP5:%.*]] = load half, ptr [[__REINT1_244]], align 2
-// CHECK: [[CONV:%.*]] = fpext half [[TMP5]] to float
-// CHECK: ret float [[CONV]]
+// CHECK-LABEL: define float @test_vgetq_lane_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__REINT_834:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT: [[__REINT1_834:%.*]] = alloca i16, align 2
+// CHECK-NEXT: store <8 x half> [[A]], ptr [[__REINT_834]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[__REINT_834]], align 16
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <8 x i16> [[TMP0]], i32 3
+// CHECK-NEXT: store i16 [[VGET_LANE]], ptr [[__REINT1_834]], align 2
+// CHECK-NEXT: [[TMP1:%.*]] = load half, ptr [[__REINT1_834]], align 2
+// CHECK-NEXT: [[CONV:%.*]] = fpext half [[TMP1]] to float
+// CHECK-NEXT: ret float [[CONV]]
+//
float32_t test_vgetq_lane_f16(float16x8_t a) {
return vgetq_lane_f16(a, 3);
}
-// CHECK-LABEL: @test_vget_lane_s64(
-// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x i64> %a, i32 0
-// CHECK: ret i64 [[VGET_LANE]]
+// CHECK-LABEL: define i64 @test_vget_lane_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <1 x i64> [[A]], i32 0
+// CHECK-NEXT: ret i64 [[VGET_LANE]]
+//
int64_t test_vget_lane_s64(int64x1_t a) {
return vget_lane_s64(a, 0);
}
-// CHECK-LABEL: @test_vget_lane_u64(
-// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x i64> %a, i32 0
-// CHECK: ret i64 [[VGET_LANE]]
+// CHECK-LABEL: define i64 @test_vget_lane_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <1 x i64> [[A]], i32 0
+// CHECK-NEXT: ret i64 [[VGET_LANE]]
+//
uint64_t test_vget_lane_u64(uint64x1_t a) {
return vget_lane_u64(a, 0);
}
-// CHECK-LABEL: @test_vgetq_lane_s64(
-// CHECK: [[VGET_LANE:%.*]] = extractelement <2 x i64> %a, i32 1
-// CHECK: ret i64 [[VGET_LANE]]
+// CHECK-LABEL: define i64 @test_vgetq_lane_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i64> [[A]], i32 1
+// CHECK-NEXT: ret i64 [[VGET_LANE]]
+//
int64_t test_vgetq_lane_s64(int64x2_t a) {
return vgetq_lane_s64(a, 1);
}
-// CHECK-LABEL: @test_vgetq_lane_u64(
-// CHECK: [[VGET_LANE:%.*]] = extractelement <2 x i64> %a, i32 1
-// CHECK: ret i64 [[VGET_LANE]]
+// CHECK-LABEL: define i64 @test_vgetq_lane_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i64> [[A]], i32 1
+// CHECK-NEXT: ret i64 [[VGET_LANE]]
+//
uint64_t test_vgetq_lane_u64(uint64x2_t a) {
return vgetq_lane_u64(a, 1);
}
-// CHECK-LABEL: @test_vget_low_s8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define <8 x i8> @test_vget_low_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[A]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
int8x8_t test_vget_low_s8(int8x16_t a) {
return vget_low_s8(a);
}
-// CHECK-LABEL: @test_vget_low_s16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK: ret <4 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define <4 x i16> @test_vget_low_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[A]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
+//
int16x4_t test_vget_low_s16(int16x8_t a) {
return vget_low_s16(a);
}
-// CHECK-LABEL: @test_vget_low_s32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 0, i32 1>
-// CHECK: ret <2 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define <2 x i32> @test_vget_low_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[A]], <2 x i32> <i32 0, i32 1>
+// CHECK-NEXT: ret <2 x i32> [[SHUFFLE_I]]
+//
int32x2_t test_vget_low_s32(int32x4_t a) {
return vget_low_s32(a);
}
-// CHECK-LABEL: @test_vget_low_s64(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> zeroinitializer
-// CHECK: ret <1 x i64> [[SHUFFLE_I]]
+// CHECK-LABEL: define <1 x i64> @test_vget_low_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[A]], <1 x i32> zeroinitializer
+// CHECK-NEXT: ret <1 x i64> [[SHUFFLE_I]]
+//
int64x1_t test_vget_low_s64(int64x2_t a) {
return vget_low_s64(a);
}
-// CHECK-LABEL: @test_vget_low_f16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x half> %a, <8 x half> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK: ret <4 x half> [[SHUFFLE_I]]
+// CHECK-LABEL: define <4 x half> @test_vget_low_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[A]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-NEXT: ret <4 x half> [[SHUFFLE_I]]
+//
float16x4_t test_vget_low_f16(float16x8_t a) {
return vget_low_f16(a);
}
-// CHECK-LABEL: @test_vget_low_f32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %a, <2 x i32> <i32 0, i32 1>
-// CHECK: ret <2 x float> [[SHUFFLE_I]]
+// CHECK-LABEL: define <2 x float> @test_vget_low_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[A]], <2 x i32> <i32 0, i32 1>
+// CHECK-NEXT: ret <2 x float> [[SHUFFLE_I]]
+//
float32x2_t test_vget_low_f32(float32x4_t a) {
return vget_low_f32(a);
}
-// CHECK-LABEL: @test_vget_low_u8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define <8 x i8> @test_vget_low_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[A]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
uint8x8_t test_vget_low_u8(uint8x16_t a) {
return vget_low_u8(a);
}
-// CHECK-LABEL: @test_vget_low_u16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK: ret <4 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define <4 x i16> @test_vget_low_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[A]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
+//
uint16x4_t test_vget_low_u16(uint16x8_t a) {
return vget_low_u16(a);
}
-// CHECK-LABEL: @test_vget_low_u32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 0, i32 1>
-// CHECK: ret <2 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define <2 x i32> @test_vget_low_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[A]], <2 x i32> <i32 0, i32 1>
+// CHECK-NEXT: ret <2 x i32> [[SHUFFLE_I]]
+//
uint32x2_t test_vget_low_u32(uint32x4_t a) {
return vget_low_u32(a);
}
-// CHECK-LABEL: @test_vget_low_u64(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> zeroinitializer
-// CHECK: ret <1 x i64> [[SHUFFLE_I]]
+// CHECK-LABEL: define <1 x i64> @test_vget_low_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[A]], <1 x i32> zeroinitializer
+// CHECK-NEXT: ret <1 x i64> [[SHUFFLE_I]]
+//
uint64x1_t test_vget_low_u64(uint64x2_t a) {
return vget_low_u64(a);
}
-// CHECK-LABEL: @test_vget_low_p8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define <8 x i8> @test_vget_low_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[A]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
poly8x8_t test_vget_low_p8(poly8x16_t a) {
return vget_low_p8(a);
}
-// CHECK-LABEL: @test_vget_low_p16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK: ret <4 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define <4 x i16> @test_vget_low_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[A]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
+//
poly16x4_t test_vget_low_p16(poly16x8_t a) {
return vget_low_p16(a);
}
-// CHECK-LABEL: @test_vhadd_s8(
-// CHECK: [[VHADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vhadds.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VHADD_V_I]]
+// CHECK-LABEL: define <8 x i8> @test_vhadd_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VHADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vhadds.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VHADD_V_I]]
+//
int8x8_t test_vhadd_s8(int8x8_t a, int8x8_t b) {
return vhadd_s8(a, b);
}
-// CHECK-LABEL: @test_vhadd_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VHADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vhadds.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VHADD_V3_I:%.*]] = bitcast <4 x i16> [[VHADD_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VHADD_V2_I]]
+// CHECK-LABEL: define <4 x i16> @test_vhadd_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VHADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vhadds.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: [[VHADD_V3_I:%.*]] = bitcast <4 x i16> [[VHADD_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <4 x i16> [[VHADD_V2_I]]
+//
int16x4_t test_vhadd_s16(int16x4_t a, int16x4_t b) {
return vhadd_s16(a, b);
}
-// CHECK-LABEL: @test_vhadd_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VHADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vhadds.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VHADD_V3_I:%.*]] = bitcast <2 x i32> [[VHADD_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VHADD_V2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vhadd_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VHADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vhadds.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: [[VHADD_V3_I:%.*]] = bitcast <2 x i32> [[VHADD_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <2 x i32> [[VHADD_V2_I]]
+//
int32x2_t test_vhadd_s32(int32x2_t a, int32x2_t b) {
return vhadd_s32(a, b);
}
-// CHECK-LABEL: @test_vhadd_u8(
-// CHECK: [[VHADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VHADD_V_I]]
+// CHECK-LABEL: define <8 x i8> @test_vhadd_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VHADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VHADD_V_I]]
+//
uint8x8_t test_vhadd_u8(uint8x8_t a, uint8x8_t b) {
return vhadd_u8(a, b);
}
-// CHECK-LABEL: @test_vhadd_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VHADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vhaddu.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VHADD_V3_I:%.*]] = bitcast <4 x i16> [[VHADD_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VHADD_V2_I]]
+// CHECK-LABEL: define <4 x i16> @test_vhadd_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VHADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vhaddu.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: [[VHADD_V3_I:%.*]] = bitcast <4 x i16> [[VHADD_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <4 x i16> [[VHADD_V2_I]]
+//
uint16x4_t test_vhadd_u16(uint16x4_t a, uint16x4_t b) {
return vhadd_u16(a, b);
}
-// CHECK-LABEL: @test_vhadd_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VHADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vhaddu.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VHADD_V3_I:%.*]] = bitcast <2 x i32> [[VHADD_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VHADD_V2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vhadd_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VHADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vhaddu.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: [[VHADD_V3_I:%.*]] = bitcast <2 x i32> [[VHADD_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <2 x i32> [[VHADD_V2_I]]
+//
uint32x2_t test_vhadd_u32(uint32x2_t a, uint32x2_t b) {
return vhadd_u32(a, b);
}
-// CHECK-LABEL: @test_vhaddq_s8(
-// CHECK: [[VHADDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vhadds.v16i8(<16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <16 x i8> [[VHADDQ_V_I]]
+// CHECK-LABEL: define <16 x i8> @test_vhaddq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VHADDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vhadds.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <16 x i8> [[VHADDQ_V_I]]
+//
int8x16_t test_vhaddq_s8(int8x16_t a, int8x16_t b) {
return vhaddq_s8(a, b);
}
-// CHECK-LABEL: @test_vhaddq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: [[VHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VHADDQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VHADDQ_V2_I]]
+// CHECK-LABEL: define <8 x i16> @test_vhaddq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: [[VHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VHADDQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: ret <8 x i16> [[VHADDQ_V2_I]]
+//
int16x8_t test_vhaddq_s16(int16x8_t a, int16x8_t b) {
return vhaddq_s16(a, b);
}
-// CHECK-LABEL: @test_vhaddq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vhadds.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VHADDQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VHADDQ_V2_I]]
+// CHECK-LABEL: define <4 x i32> @test_vhaddq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vhadds.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: [[VHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VHADDQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: ret <4 x i32> [[VHADDQ_V2_I]]
+//
int32x4_t test_vhaddq_s32(int32x4_t a, int32x4_t b) {
return vhaddq_s32(a, b);
}
-// CHECK-LABEL: @test_vhaddq_u8(
-// CHECK: [[VHADDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vhaddu.v16i8(<16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <16 x i8> [[VHADDQ_V_I]]
+// CHECK-LABEL: define <16 x i8> @test_vhaddq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VHADDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vhaddu.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <16 x i8> [[VHADDQ_V_I]]
+//
uint8x16_t test_vhaddq_u8(uint8x16_t a, uint8x16_t b) {
return vhaddq_u8(a, b);
}
-// CHECK-LABEL: @test_vhaddq_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: [[VHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VHADDQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VHADDQ_V2_I]]
+// CHECK-LABEL: define <8 x i16> @test_vhaddq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: [[VHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VHADDQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: ret <8 x i16> [[VHADDQ_V2_I]]
+//
uint16x8_t test_vhaddq_u16(uint16x8_t a, uint16x8_t b) {
return vhaddq_u16(a, b);
}
-// CHECK-LABEL: @test_vhaddq_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vhaddu.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VHADDQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VHADDQ_V2_I]]
+// CHECK-LABEL: define <4 x i32> @test_vhaddq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vhaddu.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: [[VHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VHADDQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: ret <4 x i32> [[VHADDQ_V2_I]]
+//
uint32x4_t test_vhaddq_u32(uint32x4_t a, uint32x4_t b) {
return vhaddq_u32(a, b);
}
-// CHECK-LABEL: @test_vhsub_s8(
-// CHECK: [[VHSUB_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vhsubs.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VHSUB_V_I]]
+// CHECK-LABEL: define <8 x i8> @test_vhsub_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VHSUB_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vhsubs.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VHSUB_V_I]]
+//
int8x8_t test_vhsub_s8(int8x8_t a, int8x8_t b) {
return vhsub_s8(a, b);
}
-// CHECK-LABEL: @test_vhsub_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VHSUB_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vhsubs.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VHSUB_V3_I:%.*]] = bitcast <4 x i16> [[VHSUB_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VHSUB_V2_I]]
+// CHECK-LABEL: define <4 x i16> @test_vhsub_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VHSUB_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vhsubs.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: [[VHSUB_V3_I:%.*]] = bitcast <4 x i16> [[VHSUB_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <4 x i16> [[VHSUB_V2_I]]
+//
int16x4_t test_vhsub_s16(int16x4_t a, int16x4_t b) {
return vhsub_s16(a, b);
}
-// CHECK-LABEL: @test_vhsub_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VHSUB_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vhsubs.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VHSUB_V3_I:%.*]] = bitcast <2 x i32> [[VHSUB_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VHSUB_V2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vhsub_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VHSUB_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vhsubs.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: [[VHSUB_V3_I:%.*]] = bitcast <2 x i32> [[VHSUB_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <2 x i32> [[VHSUB_V2_I]]
+//
int32x2_t test_vhsub_s32(int32x2_t a, int32x2_t b) {
return vhsub_s32(a, b);
}
-// CHECK-LABEL: @test_vhsub_u8(
-// CHECK: [[VHSUB_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vhsubu.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VHSUB_V_I]]
+// CHECK-LABEL: define <8 x i8> @test_vhsub_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VHSUB_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vhsubu.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VHSUB_V_I]]
+//
uint8x8_t test_vhsub_u8(uint8x8_t a, uint8x8_t b) {
return vhsub_u8(a, b);
}
-// CHECK-LABEL: @test_vhsub_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VHSUB_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vhsubu.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VHSUB_V3_I:%.*]] = bitcast <4 x i16> [[VHSUB_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VHSUB_V2_I]]
+// CHECK-LABEL: define <4 x i16> @test_vhsub_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VHSUB_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vhsubu.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: [[VHSUB_V3_I:%.*]] = bitcast <4 x i16> [[VHSUB_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <4 x i16> [[VHSUB_V2_I]]
+//
uint16x4_t test_vhsub_u16(uint16x4_t a, uint16x4_t b) {
return vhsub_u16(a, b);
}
-// CHECK-LABEL: @test_vhsub_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VHSUB_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vhsubu.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VHSUB_V3_I:%.*]] = bitcast <2 x i32> [[VHSUB_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VHSUB_V2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vhsub_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VHSUB_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vhsubu.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: [[VHSUB_V3_I:%.*]] = bitcast <2 x i32> [[VHSUB_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <2 x i32> [[VHSUB_V2_I]]
+//
uint32x2_t test_vhsub_u32(uint32x2_t a, uint32x2_t b) {
return vhsub_u32(a, b);
}
-// CHECK-LABEL: @test_vhsubq_s8(
-// CHECK: [[VHSUBQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vhsubs.v16i8(<16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <16 x i8> [[VHSUBQ_V_I]]
+// CHECK-LABEL: define <16 x i8> @test_vhsubq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VHSUBQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vhsubs.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <16 x i8> [[VHSUBQ_V_I]]
+//
int8x16_t test_vhsubq_s8(int8x16_t a, int8x16_t b) {
return vhsubq_s8(a, b);
}
-// CHECK-LABEL: @test_vhsubq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VHSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vhsubs.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: [[VHSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VHSUBQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VHSUBQ_V2_I]]
+// CHECK-LABEL: define <8 x i16> @test_vhsubq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VHSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vhsubs.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: [[VHSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VHSUBQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: ret <8 x i16> [[VHSUBQ_V2_I]]
+//
int16x8_t test_vhsubq_s16(int16x8_t a, int16x8_t b) {
return vhsubq_s16(a, b);
}
-// CHECK-LABEL: @test_vhsubq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VHSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vhsubs.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VHSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VHSUBQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VHSUBQ_V2_I]]
+// CHECK-LABEL: define <4 x i32> @test_vhsubq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VHSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vhsubs.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: [[VHSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VHSUBQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: ret <4 x i32> [[VHSUBQ_V2_I]]
+//
int32x4_t test_vhsubq_s32(int32x4_t a, int32x4_t b) {
return vhsubq_s32(a, b);
}
-// CHECK-LABEL: @test_vhsubq_u8(
-// CHECK: [[VHSUBQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vhsubu.v16i8(<16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <16 x i8> [[VHSUBQ_V_I]]
+// CHECK-LABEL: define <16 x i8> @test_vhsubq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VHSUBQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vhsubu.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <16 x i8> [[VHSUBQ_V_I]]
+//
uint8x16_t test_vhsubq_u8(uint8x16_t a, uint8x16_t b) {
return vhsubq_u8(a, b);
}
-// CHECK-LABEL: @test_vhsubq_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VHSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vhsubu.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: [[VHSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VHSUBQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VHSUBQ_V2_I]]
+// CHECK-LABEL: define <8 x i16> @test_vhsubq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VHSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vhsubu.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: [[VHSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VHSUBQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: ret <8 x i16> [[VHSUBQ_V2_I]]
+//
uint16x8_t test_vhsubq_u16(uint16x8_t a, uint16x8_t b) {
return vhsubq_u16(a, b);
}
-// CHECK-LABEL: @test_vhsubq_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VHSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vhsubu.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VHSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VHSUBQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VHSUBQ_V2_I]]
+// CHECK-LABEL: define <4 x i32> @test_vhsubq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VHSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vhsubu.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: [[VHSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VHSUBQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: ret <4 x i32> [[VHSUBQ_V2_I]]
+//
uint32x4_t test_vhsubq_u32(uint32x4_t a, uint32x4_t b) {
return vhsubq_u32(a, b);
}
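
Not part of the patch, just a reference point for review: the vhsub checks above all reduce to the same per-lane operation. A minimal usage sketch, with a hypothetical helper name:

#include <arm_neon.h>
#include <stdint.h>

/* Per lane: (a - b) >> 1, with the subtraction done at full precision so
   it cannot wrap, matching the @llvm.arm.neon.vhsubu.v4i32 call the
   updated CHECK lines expect. Helper name is illustrative only. */
uint32x4_t halved_difference_u32(uint32x4_t a, uint32x4_t b) {
  return vhsubq_u32(a, b);
}
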
-// CHECK-LABEL: @test_vld1q_u8(
-// CHECK: [[VLD1:%.*]] = call <16 x i8> @llvm.arm.neon.vld1.v16i8.p0(ptr %a, i32 1)
-// CHECK: ret <16 x i8> [[VLD1]]
+// CHECK-LABEL: define <16 x i8> @test_vld1q_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD1:%.*]] = call <16 x i8> @llvm.arm.neon.vld1.v16i8.p0(ptr [[A]], i32 1)
+// CHECK-NEXT: ret <16 x i8> [[VLD1]]
+//
uint8x16_t test_vld1q_u8(uint8_t const * a) {
return vld1q_u8(a);
}
-// CHECK-LABEL: @test_vld1q_u16(
-// CHECK: [[VLD1:%.*]] = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0(ptr %a, i32 2)
-// CHECK: ret <8 x i16> [[VLD1]]
+// CHECK-LABEL: define <8 x i16> @test_vld1q_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD1:%.*]] = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0(ptr [[A]], i32 2)
+// CHECK-NEXT: ret <8 x i16> [[VLD1]]
+//
uint16x8_t test_vld1q_u16(uint16_t const * a) {
return vld1q_u16(a);
}
-// CHECK-LABEL: @test_vld1q_u32(
-// CHECK: [[VLD1:%.*]] = call <4 x i32> @llvm.arm.neon.vld1.v4i32.p0(ptr %a, i32 4)
-// CHECK: ret <4 x i32> [[VLD1]]
+// CHECK-LABEL: define <4 x i32> @test_vld1q_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD1:%.*]] = call <4 x i32> @llvm.arm.neon.vld1.v4i32.p0(ptr [[A]], i32 4)
+// CHECK-NEXT: ret <4 x i32> [[VLD1]]
+//
uint32x4_t test_vld1q_u32(uint32_t const * a) {
return vld1q_u32(a);
}
-// CHECK-LABEL: @test_vld1q_u64(
-// CHECK: [[VLD1:%.*]] = call <2 x i64> @llvm.arm.neon.vld1.v2i64.p0(ptr %a, i32 4)
-// CHECK: ret <2 x i64> [[VLD1]]
+// CHECK-LABEL: define <2 x i64> @test_vld1q_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD1:%.*]] = call <2 x i64> @llvm.arm.neon.vld1.v2i64.p0(ptr [[A]], i32 4)
+// CHECK-NEXT: ret <2 x i64> [[VLD1]]
+//
uint64x2_t test_vld1q_u64(uint64_t const * a) {
return vld1q_u64(a);
}
-// CHECK-LABEL: @test_vld1q_s8(
-// CHECK: [[VLD1:%.*]] = call <16 x i8> @llvm.arm.neon.vld1.v16i8.p0(ptr %a, i32 1)
-// CHECK: ret <16 x i8> [[VLD1]]
+// CHECK-LABEL: define <16 x i8> @test_vld1q_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD1:%.*]] = call <16 x i8> @llvm.arm.neon.vld1.v16i8.p0(ptr [[A]], i32 1)
+// CHECK-NEXT: ret <16 x i8> [[VLD1]]
+//
int8x16_t test_vld1q_s8(int8_t const * a) {
return vld1q_s8(a);
}
-// CHECK-LABEL: @test_vld1q_s16(
-// CHECK: [[VLD1:%.*]] = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0(ptr %a, i32 2)
-// CHECK: ret <8 x i16> [[VLD1]]
+// CHECK-LABEL: define <8 x i16> @test_vld1q_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD1:%.*]] = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0(ptr [[A]], i32 2)
+// CHECK-NEXT: ret <8 x i16> [[VLD1]]
+//
int16x8_t test_vld1q_s16(int16_t const * a) {
return vld1q_s16(a);
}
-// CHECK-LABEL: @test_vld1q_s32(
-// CHECK: [[VLD1:%.*]] = call <4 x i32> @llvm.arm.neon.vld1.v4i32.p0(ptr %a, i32 4)
-// CHECK: ret <4 x i32> [[VLD1]]
+// CHECK-LABEL: define <4 x i32> @test_vld1q_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD1:%.*]] = call <4 x i32> @llvm.arm.neon.vld1.v4i32.p0(ptr [[A]], i32 4)
+// CHECK-NEXT: ret <4 x i32> [[VLD1]]
+//
int32x4_t test_vld1q_s32(int32_t const * a) {
return vld1q_s32(a);
}
-// CHECK-LABEL: @test_vld1q_s64(
-// CHECK: [[VLD1:%.*]] = call <2 x i64> @llvm.arm.neon.vld1.v2i64.p0(ptr %a, i32 4)
-// CHECK: ret <2 x i64> [[VLD1]]
+// CHECK-LABEL: define <2 x i64> @test_vld1q_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD1:%.*]] = call <2 x i64> @llvm.arm.neon.vld1.v2i64.p0(ptr [[A]], i32 4)
+// CHECK-NEXT: ret <2 x i64> [[VLD1]]
+//
int64x2_t test_vld1q_s64(int64_t const * a) {
return vld1q_s64(a);
}
-// CHECK-LABEL: @test_vld1q_f16(
-// CHECK: [[VLD1:%.*]] = call <8 x half> @llvm.arm.neon.vld1.v8f16.p0(ptr %a, i32 2)
-// CHECK: ret <8 x half> [[VLD1]]
+// CHECK-LABEL: define <8 x half> @test_vld1q_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD1:%.*]] = call <8 x half> @llvm.arm.neon.vld1.v8f16.p0(ptr [[A]], i32 2)
+// CHECK-NEXT: ret <8 x half> [[VLD1]]
+//
float16x8_t test_vld1q_f16(float16_t const * a) {
return vld1q_f16(a);
}
-// CHECK-LABEL: @test_vld1q_f32(
-// CHECK: [[VLD1:%.*]] = call <4 x float> @llvm.arm.neon.vld1.v4f32.p0(ptr %a, i32 4)
-// CHECK: ret <4 x float> [[VLD1]]
+// CHECK-LABEL: define <4 x float> @test_vld1q_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD1:%.*]] = call <4 x float> @llvm.arm.neon.vld1.v4f32.p0(ptr [[A]], i32 4)
+// CHECK-NEXT: ret <4 x float> [[VLD1]]
+//
float32x4_t test_vld1q_f32(float32_t const * a) {
return vld1q_f32(a);
}
-// CHECK-LABEL: @test_vld1q_p8(
-// CHECK: [[VLD1:%.*]] = call <16 x i8> @llvm.arm.neon.vld1.v16i8.p0(ptr %a, i32 1)
-// CHECK: ret <16 x i8> [[VLD1]]
+// CHECK-LABEL: define <16 x i8> @test_vld1q_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD1:%.*]] = call <16 x i8> @llvm.arm.neon.vld1.v16i8.p0(ptr [[A]], i32 1)
+// CHECK-NEXT: ret <16 x i8> [[VLD1]]
+//
poly8x16_t test_vld1q_p8(poly8_t const * a) {
return vld1q_p8(a);
}
-// CHECK-LABEL: @test_vld1q_p16(
-// CHECK: [[VLD1:%.*]] = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0(ptr %a, i32 2)
-// CHECK: ret <8 x i16> [[VLD1]]
+// CHECK-LABEL: define <8 x i16> @test_vld1q_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD1:%.*]] = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0(ptr [[A]], i32 2)
+// CHECK-NEXT: ret <8 x i16> [[VLD1]]
+//
poly16x8_t test_vld1q_p16(poly16_t const * a) {
return vld1q_p16(a);
}
-// CHECK-LABEL: @test_vld1_u8(
-// CHECK: [[VLD1:%.*]] = call <8 x i8> @llvm.arm.neon.vld1.v8i8.p0(ptr %a, i32 1)
-// CHECK: ret <8 x i8> [[VLD1]]
+// CHECK-LABEL: define <8 x i8> @test_vld1_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD1:%.*]] = call <8 x i8> @llvm.arm.neon.vld1.v8i8.p0(ptr [[A]], i32 1)
+// CHECK-NEXT: ret <8 x i8> [[VLD1]]
+//
uint8x8_t test_vld1_u8(uint8_t const * a) {
return vld1_u8(a);
}
-// CHECK-LABEL: @test_vld1_u16(
-// CHECK: [[VLD1:%.*]] = call <4 x i16> @llvm.arm.neon.vld1.v4i16.p0(ptr %a, i32 2)
-// CHECK: ret <4 x i16> [[VLD1]]
+// CHECK-LABEL: define <4 x i16> @test_vld1_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD1:%.*]] = call <4 x i16> @llvm.arm.neon.vld1.v4i16.p0(ptr [[A]], i32 2)
+// CHECK-NEXT: ret <4 x i16> [[VLD1]]
+//
uint16x4_t test_vld1_u16(uint16_t const * a) {
return vld1_u16(a);
}
-// CHECK-LABEL: @test_vld1_u32(
-// CHECK: [[VLD1:%.*]] = call <2 x i32> @llvm.arm.neon.vld1.v2i32.p0(ptr %a, i32 4)
-// CHECK: ret <2 x i32> [[VLD1]]
+// CHECK-LABEL: define <2 x i32> @test_vld1_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD1:%.*]] = call <2 x i32> @llvm.arm.neon.vld1.v2i32.p0(ptr [[A]], i32 4)
+// CHECK-NEXT: ret <2 x i32> [[VLD1]]
+//
uint32x2_t test_vld1_u32(uint32_t const * a) {
return vld1_u32(a);
}
-// CHECK-LABEL: @test_vld1_u64(
-// CHECK: [[VLD1:%.*]] = call <1 x i64> @llvm.arm.neon.vld1.v1i64.p0(ptr %a, i32 4)
-// CHECK: ret <1 x i64> [[VLD1]]
+// CHECK-LABEL: define <1 x i64> @test_vld1_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD1:%.*]] = call <1 x i64> @llvm.arm.neon.vld1.v1i64.p0(ptr [[A]], i32 4)
+// CHECK-NEXT: ret <1 x i64> [[VLD1]]
+//
uint64x1_t test_vld1_u64(uint64_t const * a) {
return vld1_u64(a);
}
-// CHECK-LABEL: @test_vld1_s8(
-// CHECK: [[VLD1:%.*]] = call <8 x i8> @llvm.arm.neon.vld1.v8i8.p0(ptr %a, i32 1)
-// CHECK: ret <8 x i8> [[VLD1]]
+// CHECK-LABEL: define <8 x i8> @test_vld1_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD1:%.*]] = call <8 x i8> @llvm.arm.neon.vld1.v8i8.p0(ptr [[A]], i32 1)
+// CHECK-NEXT: ret <8 x i8> [[VLD1]]
+//
int8x8_t test_vld1_s8(int8_t const * a) {
return vld1_s8(a);
}
-// CHECK-LABEL: @test_vld1_s16(
-// CHECK: [[VLD1:%.*]] = call <4 x i16> @llvm.arm.neon.vld1.v4i16.p0(ptr %a, i32 2)
-// CHECK: ret <4 x i16> [[VLD1]]
+// CHECK-LABEL: define <4 x i16> @test_vld1_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD1:%.*]] = call <4 x i16> @llvm.arm.neon.vld1.v4i16.p0(ptr [[A]], i32 2)
+// CHECK-NEXT: ret <4 x i16> [[VLD1]]
+//
int16x4_t test_vld1_s16(int16_t const * a) {
return vld1_s16(a);
}
-// CHECK-LABEL: @test_vld1_s32(
-// CHECK: [[VLD1:%.*]] = call <2 x i32> @llvm.arm.neon.vld1.v2i32.p0(ptr %a, i32 4)
-// CHECK: ret <2 x i32> [[VLD1]]
+// CHECK-LABEL: define <2 x i32> @test_vld1_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD1:%.*]] = call <2 x i32> @llvm.arm.neon.vld1.v2i32.p0(ptr [[A]], i32 4)
+// CHECK-NEXT: ret <2 x i32> [[VLD1]]
+//
int32x2_t test_vld1_s32(int32_t const * a) {
return vld1_s32(a);
}
-// CHECK-LABEL: @test_vld1_s64(
-// CHECK: [[VLD1:%.*]] = call <1 x i64> @llvm.arm.neon.vld1.v1i64.p0(ptr %a, i32 4)
-// CHECK: ret <1 x i64> [[VLD1]]
+// CHECK-LABEL: define <1 x i64> @test_vld1_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD1:%.*]] = call <1 x i64> @llvm.arm.neon.vld1.v1i64.p0(ptr [[A]], i32 4)
+// CHECK-NEXT: ret <1 x i64> [[VLD1]]
+//
int64x1_t test_vld1_s64(int64_t const * a) {
return vld1_s64(a);
}
-// CHECK-LABEL: @test_vld1_f16(
-// CHECK: [[VLD1:%.*]] = call <4 x half> @llvm.arm.neon.vld1.v4f16.p0(ptr %a, i32 2)
-// CHECK: ret <4 x half> [[VLD1]]
+// CHECK-LABEL: define <4 x half> @test_vld1_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD1:%.*]] = call <4 x half> @llvm.arm.neon.vld1.v4f16.p0(ptr [[A]], i32 2)
+// CHECK-NEXT: ret <4 x half> [[VLD1]]
+//
float16x4_t test_vld1_f16(float16_t const * a) {
return vld1_f16(a);
}
-// CHECK-LABEL: @test_vld1_f32(
-// CHECK: [[VLD1:%.*]] = call <2 x float> @llvm.arm.neon.vld1.v2f32.p0(ptr %a, i32 4)
-// CHECK: ret <2 x float> [[VLD1]]
+// CHECK-LABEL: define <2 x float> @test_vld1_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD1:%.*]] = call <2 x float> @llvm.arm.neon.vld1.v2f32.p0(ptr [[A]], i32 4)
+// CHECK-NEXT: ret <2 x float> [[VLD1]]
+//
float32x2_t test_vld1_f32(float32_t const * a) {
return vld1_f32(a);
}
-// CHECK-LABEL: @test_vld1_p8(
-// CHECK: [[VLD1:%.*]] = call <8 x i8> @llvm.arm.neon.vld1.v8i8.p0(ptr %a, i32 1)
-// CHECK: ret <8 x i8> [[VLD1]]
+// CHECK-LABEL: define <8 x i8> @test_vld1_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD1:%.*]] = call <8 x i8> @llvm.arm.neon.vld1.v8i8.p0(ptr [[A]], i32 1)
+// CHECK-NEXT: ret <8 x i8> [[VLD1]]
+//
poly8x8_t test_vld1_p8(poly8_t const * a) {
return vld1_p8(a);
}
-// CHECK-LABEL: @test_vld1_p16(
-// CHECK: [[VLD1:%.*]] = call <4 x i16> @llvm.arm.neon.vld1.v4i16.p0(ptr %a, i32 2)
-// CHECK: ret <4 x i16> [[VLD1]]
+// CHECK-LABEL: define <4 x i16> @test_vld1_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD1:%.*]] = call <4 x i16> @llvm.arm.neon.vld1.v4i16.p0(ptr [[A]], i32 2)
+// CHECK-NEXT: ret <4 x i16> [[VLD1]]
+//
poly16x4_t test_vld1_p16(poly16_t const * a) {
return vld1_p16(a);
}
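
Not part of the patch: the plain vld1 loads above differ only in element type; the i32 operand in the IR is the element alignment (i32 1 for the byte vectors, i32 4 for the v2i64/v1i64 loads on this target, as the checks show). A minimal sketch with a hypothetical helper name:

#include <arm_neon.h>
#include <stdint.h>

/* Loads 16 consecutive bytes starting at p into one q register; lowers to
   @llvm.arm.neon.vld1.v16i8.p0(ptr, i32 1) per the checks above. */
uint8x16_t load_16_bytes(const uint8_t *p) {
  return vld1q_u8(p);
}
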
-// CHECK-LABEL: @test_vld1q_dup_u8(
-// CHECK: [[TMP0:%.*]] = load i8, ptr %a, align 1
-// CHECK: [[TMP1:%.*]] = insertelement <16 x i8> poison, i8 [[TMP0]], i32 0
-// CHECK: [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer
-// CHECK: ret <16 x i8> [[LANE]]
+// CHECK-LABEL: define <16 x i8> @test_vld1q_dup_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x i8> poison, i8 [[TMP0]], i32 0
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer
+// CHECK-NEXT: ret <16 x i8> [[LANE]]
+//
uint8x16_t test_vld1q_dup_u8(uint8_t const * a) {
return vld1q_dup_u8(a);
}
-// CHECK-LABEL: @test_vld1q_dup_u16(
-// CHECK: [[TMP2:%.*]] = load i16, ptr %a, align 2
-// CHECK: [[TMP3:%.*]] = insertelement <8 x i16> poison, i16 [[TMP2]], i32 0
-// CHECK: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer
-// CHECK: ret <8 x i16> [[LANE]]
+// CHECK-LABEL: define <8 x i16> @test_vld1q_dup_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A]], align 2
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i16> poison, i16 [[TMP0]], i32 0
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK-NEXT: ret <8 x i16> [[LANE]]
+//
uint16x8_t test_vld1q_dup_u16(uint16_t const * a) {
return vld1q_dup_u16(a);
}
-// CHECK-LABEL: @test_vld1q_dup_u32(
-// CHECK: [[TMP2:%.*]] = load i32, ptr %a, align 4
-// CHECK: [[TMP3:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i32 0
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP3]], <4 x i32> zeroinitializer
-// CHECK: ret <4 x i32> [[LANE]]
+// CHECK-LABEL: define <4 x i32> @test_vld1q_dup_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i32 0
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: ret <4 x i32> [[LANE]]
+//
uint32x4_t test_vld1q_dup_u32(uint32_t const * a) {
return vld1q_dup_u32(a);
}
-// CHECK-LABEL: @test_vld1q_dup_u64(
-// CHECK: [[TMP2:%.*]] = load i64, ptr %a, align 4
-// CHECK: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[TMP2]], i32 0
-// CHECK: [[LANE:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP3]], <2 x i32> zeroinitializer
-// CHECK: ret <2 x i64> [[LANE]]
+// CHECK-LABEL: define <2 x i64> @test_vld1q_dup_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> poison, i64 [[TMP0]], i32 0
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT: ret <2 x i64> [[LANE]]
+//
uint64x2_t test_vld1q_dup_u64(uint64_t const * a) {
return vld1q_dup_u64(a);
}
-// CHECK-LABEL: @test_vld1q_dup_s8(
-// CHECK: [[TMP0:%.*]] = load i8, ptr %a, align 1
-// CHECK: [[TMP1:%.*]] = insertelement <16 x i8> poison, i8 [[TMP0]], i32 0
-// CHECK: [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer
-// CHECK: ret <16 x i8> [[LANE]]
+// CHECK-LABEL: define <16 x i8> @test_vld1q_dup_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x i8> poison, i8 [[TMP0]], i32 0
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer
+// CHECK-NEXT: ret <16 x i8> [[LANE]]
+//
int8x16_t test_vld1q_dup_s8(int8_t const * a) {
return vld1q_dup_s8(a);
}
-// CHECK-LABEL: @test_vld1q_dup_s16(
-// CHECK: [[TMP2:%.*]] = load i16, ptr %a, align 2
-// CHECK: [[TMP3:%.*]] = insertelement <8 x i16> poison, i16 [[TMP2]], i32 0
-// CHECK: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer
-// CHECK: ret <8 x i16> [[LANE]]
+// CHECK-LABEL: define <8 x i16> @test_vld1q_dup_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A]], align 2
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i16> poison, i16 [[TMP0]], i32 0
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK-NEXT: ret <8 x i16> [[LANE]]
+//
int16x8_t test_vld1q_dup_s16(int16_t const * a) {
return vld1q_dup_s16(a);
}
-// CHECK-LABEL: @test_vld1q_dup_s32(
-// CHECK: [[TMP2:%.*]] = load i32, ptr %a, align 4
-// CHECK: [[TMP3:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i32 0
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP3]], <4 x i32> zeroinitializer
-// CHECK: ret <4 x i32> [[LANE]]
+// CHECK-LABEL: define <4 x i32> @test_vld1q_dup_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i32 0
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: ret <4 x i32> [[LANE]]
+//
int32x4_t test_vld1q_dup_s32(int32_t const * a) {
return vld1q_dup_s32(a);
}
-// CHECK-LABEL: @test_vld1q_dup_s64(
-// CHECK: [[TMP2:%.*]] = load i64, ptr %a, align 4
-// CHECK: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[TMP2]], i32 0
-// CHECK: [[LANE:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP3]], <2 x i32> zeroinitializer
-// CHECK: ret <2 x i64> [[LANE]]
+// CHECK-LABEL: define <2 x i64> @test_vld1q_dup_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> poison, i64 [[TMP0]], i32 0
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT: ret <2 x i64> [[LANE]]
+//
int64x2_t test_vld1q_dup_s64(int64_t const * a) {
return vld1q_dup_s64(a);
}
-// CHECK-LABEL: @test_vld1q_dup_f16(
-// CHECK: [[TMP2:%.*]] = load half, ptr %a, align 2
-// CHECK: [[TMP3:%.*]] = insertelement <8 x half> poison, half [[TMP2]], i32 0
-// CHECK: [[LANE:%.*]] = shufflevector <8 x half> [[TMP3]], <8 x half> [[TMP3]], <8 x i32> zeroinitializer
-// CHECK: ret <8 x half> [[LANE]]
+// CHECK-LABEL: define <8 x half> @test_vld1q_dup_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load half, ptr [[A]], align 2
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x half> poison, half [[TMP0]], i32 0
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK-NEXT: ret <8 x half> [[LANE]]
+//
float16x8_t test_vld1q_dup_f16(float16_t const * a) {
return vld1q_dup_f16(a);
}
-// CHECK-LABEL: @test_vld1q_dup_f32(
-// CHECK: [[TMP2:%.*]] = load float, ptr %a, align 4
-// CHECK: [[TMP3:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0
-// CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP3]], <4 x i32> zeroinitializer
-// CHECK: ret <4 x float> [[LANE]]
+// CHECK-LABEL: define <4 x float> @test_vld1q_dup_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[A]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i32 0
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: ret <4 x float> [[LANE]]
+//
float32x4_t test_vld1q_dup_f32(float32_t const * a) {
return vld1q_dup_f32(a);
}
-// CHECK-LABEL: @test_vld1q_dup_p8(
-// CHECK: [[TMP0:%.*]] = load i8, ptr %a, align 1
-// CHECK: [[TMP1:%.*]] = insertelement <16 x i8> poison, i8 [[TMP0]], i32 0
-// CHECK: [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer
-// CHECK: ret <16 x i8> [[LANE]]
+// CHECK-LABEL: define <16 x i8> @test_vld1q_dup_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x i8> poison, i8 [[TMP0]], i32 0
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer
+// CHECK-NEXT: ret <16 x i8> [[LANE]]
+//
poly8x16_t test_vld1q_dup_p8(poly8_t const * a) {
return vld1q_dup_p8(a);
}
-// CHECK-LABEL: @test_vld1q_dup_p16(
-// CHECK: [[TMP2:%.*]] = load i16, ptr %a, align 2
-// CHECK: [[TMP3:%.*]] = insertelement <8 x i16> poison, i16 [[TMP2]], i32 0
-// CHECK: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer
-// CHECK: ret <8 x i16> [[LANE]]
+// CHECK-LABEL: define <8 x i16> @test_vld1q_dup_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A]], align 2
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i16> poison, i16 [[TMP0]], i32 0
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK-NEXT: ret <8 x i16> [[LANE]]
+//
poly16x8_t test_vld1q_dup_p16(poly16_t const * a) {
return vld1q_dup_p16(a);
}
-// CHECK-LABEL: @test_vld1_dup_u8(
-// CHECK: [[TMP0:%.*]] = load i8, ptr %a, align 1
-// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[TMP0]], i32 0
-// CHECK: [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
-// CHECK: ret <8 x i8> [[LANE]]
+// CHECK-LABEL: define <8 x i8> @test_vld1_dup_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[TMP0]], i32 0
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK-NEXT: ret <8 x i8> [[LANE]]
+//
uint8x8_t test_vld1_dup_u8(uint8_t const * a) {
return vld1_dup_u8(a);
}
-// CHECK-LABEL: @test_vld1_dup_u16(
-// CHECK: [[TMP2:%.*]] = load i16, ptr %a, align 2
-// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i32 0
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer
-// CHECK: ret <4 x i16> [[LANE]]
+// CHECK-LABEL: define <4 x i16> @test_vld1_dup_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A]], align 2
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[TMP0]], i32 0
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: ret <4 x i16> [[LANE]]
+//
uint16x4_t test_vld1_dup_u16(uint16_t const * a) {
return vld1_dup_u16(a);
}
-// CHECK-LABEL: @test_vld1_dup_u32(
-// CHECK: [[TMP2:%.*]] = load i32, ptr %a, align 4
-// CHECK: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[TMP2]], i32 0
-// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP3]], <2 x i32> zeroinitializer
-// CHECK: ret <2 x i32> [[LANE]]
+// CHECK-LABEL: define <2 x i32> @test_vld1_dup_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i32 0
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT: ret <2 x i32> [[LANE]]
+//
uint32x2_t test_vld1_dup_u32(uint32_t const * a) {
return vld1_dup_u32(a);
}
-// CHECK-LABEL: @test_vld1_dup_u64(
-// CHECK: [[TMP2:%.*]] = load i64, ptr %a, align 4
-// CHECK: [[TMP3:%.*]] = insertelement <1 x i64> poison, i64 [[TMP2]], i32 0
-// CHECK: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP3]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
-// CHECK: ret <1 x i64> [[LANE]]
+// CHECK-LABEL: define <1 x i64> @test_vld1_dup_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <1 x i64> poison, i64 [[TMP0]], i32 0
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP1]], <1 x i64> [[TMP1]], <1 x i32> zeroinitializer
+// CHECK-NEXT: ret <1 x i64> [[LANE]]
+//
uint64x1_t test_vld1_dup_u64(uint64_t const * a) {
return vld1_dup_u64(a);
}
-// CHECK-LABEL: @test_vld1_dup_s8(
-// CHECK: [[TMP0:%.*]] = load i8, ptr %a, align 1
-// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[TMP0]], i32 0
-// CHECK: [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
-// CHECK: ret <8 x i8> [[LANE]]
+// CHECK-LABEL: define <8 x i8> @test_vld1_dup_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[TMP0]], i32 0
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK-NEXT: ret <8 x i8> [[LANE]]
+//
int8x8_t test_vld1_dup_s8(int8_t const * a) {
return vld1_dup_s8(a);
}
-// CHECK-LABEL: @test_vld1_dup_s16(
-// CHECK: [[TMP2:%.*]] = load i16, ptr %a, align 2
-// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i32 0
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer
-// CHECK: ret <4 x i16> [[LANE]]
+// CHECK-LABEL: define <4 x i16> @test_vld1_dup_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A]], align 2
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[TMP0]], i32 0
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: ret <4 x i16> [[LANE]]
+//
int16x4_t test_vld1_dup_s16(int16_t const * a) {
return vld1_dup_s16(a);
}
-// CHECK-LABEL: @test_vld1_dup_s32(
-// CHECK: [[TMP2:%.*]] = load i32, ptr %a, align 4
-// CHECK: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[TMP2]], i32 0
-// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP3]], <2 x i32> zeroinitializer
-// CHECK: ret <2 x i32> [[LANE]]
+// CHECK-LABEL: define <2 x i32> @test_vld1_dup_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i32 0
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT: ret <2 x i32> [[LANE]]
+//
int32x2_t test_vld1_dup_s32(int32_t const * a) {
return vld1_dup_s32(a);
}
-// CHECK-LABEL: @test_vld1_dup_s64(
-// CHECK: [[TMP2:%.*]] = load i64, ptr %a, align 4
-// CHECK: [[TMP3:%.*]] = insertelement <1 x i64> poison, i64 [[TMP2]], i32 0
-// CHECK: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP3]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
-// CHECK: ret <1 x i64> [[LANE]]
+// CHECK-LABEL: define <1 x i64> @test_vld1_dup_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <1 x i64> poison, i64 [[TMP0]], i32 0
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP1]], <1 x i64> [[TMP1]], <1 x i32> zeroinitializer
+// CHECK-NEXT: ret <1 x i64> [[LANE]]
+//
int64x1_t test_vld1_dup_s64(int64_t const * a) {
return vld1_dup_s64(a);
}
-// CHECK-LABEL: @test_vld1_dup_f16(
-// CHECK: [[TMP2:%.*]] = load half, ptr %a, align 2
-// CHECK: [[TMP3:%.*]] = insertelement <4 x half> poison, half [[TMP2]], i32 0
-// CHECK: [[LANE:%.*]] = shufflevector <4 x half> [[TMP3]], <4 x half> [[TMP3]], <4 x i32> zeroinitializer
-// CHECK: ret <4 x half> [[LANE]]
+// CHECK-LABEL: define <4 x half> @test_vld1_dup_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load half, ptr [[A]], align 2
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x half> poison, half [[TMP0]], i32 0
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: ret <4 x half> [[LANE]]
+//
float16x4_t test_vld1_dup_f16(float16_t const * a) {
return vld1_dup_f16(a);
}
-// CHECK-LABEL: @test_vld1_dup_f32(
-// CHECK: [[TMP2:%.*]] = load float, ptr %a, align 4
-// CHECK: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[TMP2]], i32 0
-// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> zeroinitializer
-// CHECK: ret <2 x float> [[LANE]]
+// CHECK-LABEL: define <2 x float> @test_vld1_dup_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[A]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT: ret <2 x float> [[LANE]]
+//
float32x2_t test_vld1_dup_f32(float32_t const * a) {
return vld1_dup_f32(a);
}
-// CHECK-LABEL: @test_vld1_dup_p8(
-// CHECK: [[TMP0:%.*]] = load i8, ptr %a, align 1
-// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[TMP0]], i32 0
-// CHECK: [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
-// CHECK: ret <8 x i8> [[LANE]]
+// CHECK-LABEL: define <8 x i8> @test_vld1_dup_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[TMP0]], i32 0
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK-NEXT: ret <8 x i8> [[LANE]]
+//
poly8x8_t test_vld1_dup_p8(poly8_t const * a) {
return vld1_dup_p8(a);
}
-// CHECK-LABEL: @test_vld1_dup_p16(
-// CHECK: [[TMP2:%.*]] = load i16, ptr %a, align 2
-// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i32 0
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer
-// CHECK: ret <4 x i16> [[LANE]]
+// CHECK-LABEL: define <4 x i16> @test_vld1_dup_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A]], align 2
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[TMP0]], i32 0
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: ret <4 x i16> [[LANE]]
+//
poly16x4_t test_vld1_dup_p16(poly16_t const * a) {
return vld1_dup_p16(a);
}
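
Not part of the patch: every dup test above checks the same splat pattern, a scalar load, an insertelement into lane 0 of a poison vector, and a zero-mask shufflevector that broadcasts lane 0 to all lanes. A minimal usage sketch, hypothetical helper name:

#include <arm_neon.h>

/* All four lanes of the result equal *p. */
float32x4_t splat_from_memory(const float *p) {
  return vld1q_dup_f32(p);
}
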
-// CHECK-LABEL: @test_vld1q_lane_u8(
-// CHECK: [[TMP0:%.*]] = load i8, ptr %a, align 1
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <16 x i8> %b, i8 [[TMP0]], i32 15
-// CHECK: ret <16 x i8> [[VLD1_LANE]]
+// CHECK-LABEL: define <16 x i8> @test_vld1q_lane_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <16 x i8> [[B]], i8 [[TMP0]], i32 15
+// CHECK-NEXT: ret <16 x i8> [[VLD1_LANE]]
+//
uint8x16_t test_vld1q_lane_u8(uint8_t const * a, uint8x16_t b) {
return vld1q_lane_u8(a, b, 15);
}
-// CHECK-LABEL: @test_vld1q_lane_u16(
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[TMP4:%.*]] = load i16, ptr %a, align 2
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7
-// CHECK: ret <8 x i16> [[VLD1_LANE]]
+// CHECK-LABEL: define <8 x i16> @test_vld1q_lane_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr [[A]], align 2
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP1]], i16 [[TMP2]], i32 7
+// CHECK-NEXT: ret <8 x i16> [[VLD1_LANE]]
+//
uint16x8_t test_vld1q_lane_u16(uint16_t const * a, uint16x8_t b) {
return vld1q_lane_u16(a, b, 7);
}
-// CHECK-LABEL: @test_vld1q_lane_u32(
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[TMP4:%.*]] = load i32, ptr %a, align 4
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[TMP4]], i32 3
-// CHECK: ret <4 x i32> [[VLD1_LANE]]
+// CHECK-LABEL: define <4 x i32> @test_vld1q_lane_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[TMP2]], i32 3
+// CHECK-NEXT: ret <4 x i32> [[VLD1_LANE]]
+//
uint32x4_t test_vld1q_lane_u32(uint32_t const * a, uint32x4_t b) {
return vld1q_lane_u32(a, b, 3);
}
-// CHECK-LABEL: @test_vld1q_lane_u64(
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP2]], <1 x i32> zeroinitializer
-// CHECK: [[TMP4:%.*]] = call <1 x i64> @llvm.arm.neon.vld1.v1i64.p0(ptr %a, i32 4)
-// CHECK: [[VLD1Q_LANE:%.*]] = shufflevector <1 x i64> [[TMP3]], <1 x i64> [[TMP4]], <2 x i32> <i32 0, i32 1>
-// CHECK: ret <2 x i64> [[VLD1Q_LANE]]
+// CHECK-LABEL: define <2 x i64> @test_vld1q_lane_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP1]], <1 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP3:%.*]] = call <1 x i64> @llvm.arm.neon.vld1.v1i64.p0(ptr [[A]], i32 4)
+// CHECK-NEXT: [[VLD1Q_LANE:%.*]] = shufflevector <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <2 x i32> <i32 0, i32 1>
+// CHECK-NEXT: ret <2 x i64> [[VLD1Q_LANE]]
+//
uint64x2_t test_vld1q_lane_u64(uint64_t const * a, uint64x2_t b) {
return vld1q_lane_u64(a, b, 1);
}
-// CHECK-LABEL: @test_vld1q_lane_s8(
-// CHECK: [[TMP0:%.*]] = load i8, ptr %a, align 1
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <16 x i8> %b, i8 [[TMP0]], i32 15
-// CHECK: ret <16 x i8> [[VLD1_LANE]]
+// CHECK-LABEL: define <16 x i8> @test_vld1q_lane_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <16 x i8> [[B]], i8 [[TMP0]], i32 15
+// CHECK-NEXT: ret <16 x i8> [[VLD1_LANE]]
+//
int8x16_t test_vld1q_lane_s8(int8_t const * a, int8x16_t b) {
return vld1q_lane_s8(a, b, 15);
}
-// CHECK-LABEL: @test_vld1q_lane_s16(
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[TMP4:%.*]] = load i16, ptr %a, align 2
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7
-// CHECK: ret <8 x i16> [[VLD1_LANE]]
+// CHECK-LABEL: define <8 x i16> @test_vld1q_lane_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr [[A]], align 2
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP1]], i16 [[TMP2]], i32 7
+// CHECK-NEXT: ret <8 x i16> [[VLD1_LANE]]
+//
int16x8_t test_vld1q_lane_s16(int16_t const * a, int16x8_t b) {
return vld1q_lane_s16(a, b, 7);
}
-// CHECK-LABEL: @test_vld1q_lane_s32(
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[TMP4:%.*]] = load i32, ptr %a, align 4
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[TMP4]], i32 3
-// CHECK: ret <4 x i32> [[VLD1_LANE]]
+// CHECK-LABEL: define <4 x i32> @test_vld1q_lane_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[TMP2]], i32 3
+// CHECK-NEXT: ret <4 x i32> [[VLD1_LANE]]
+//
int32x4_t test_vld1q_lane_s32(int32_t const * a, int32x4_t b) {
return vld1q_lane_s32(a, b, 3);
}
-// CHECK-LABEL: @test_vld1q_lane_s64(
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP2]], <1 x i32> zeroinitializer
-// CHECK: [[TMP4:%.*]] = call <1 x i64> @llvm.arm.neon.vld1.v1i64.p0(ptr %a, i32 4)
-// CHECK: [[VLD1Q_LANE:%.*]] = shufflevector <1 x i64> [[TMP3]], <1 x i64> [[TMP4]], <2 x i32> <i32 0, i32 1>
-// CHECK: ret <2 x i64> [[VLD1Q_LANE]]
+// CHECK-LABEL: define <2 x i64> @test_vld1q_lane_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP1]], <1 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP3:%.*]] = call <1 x i64> @llvm.arm.neon.vld1.v1i64.p0(ptr [[A]], i32 4)
+// CHECK-NEXT: [[VLD1Q_LANE:%.*]] = shufflevector <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <2 x i32> <i32 0, i32 1>
+// CHECK-NEXT: ret <2 x i64> [[VLD1Q_LANE]]
+//
int64x2_t test_vld1q_lane_s64(int64_t const * a, int64x2_t b) {
return vld1q_lane_s64(a, b, 1);
}
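
Not part of the patch: note that the 64-bit lane loads above are checked differently from the narrower ones. Instead of a plain insertelement, the kept lane is extracted with a shufflevector, the new element arrives via a <1 x i64> vld1, and a second shufflevector concatenates the two halves. Source-level usage is unchanged; a sketch with a hypothetical helper name:

#include <arm_neon.h>
#include <stdint.h>

/* Replaces lane 1 of v with *p and keeps lane 0. */
uint64x2_t replace_high_u64(const uint64_t *p, uint64x2_t v) {
  return vld1q_lane_u64(p, v, 1);
}
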
-// CHECK-LABEL: @test_vld1q_lane_f16(
-// CHECK: [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
-// CHECK: [[TMP4:%.*]] = load half, ptr %a, align 2
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x half> [[TMP2]], half [[TMP4]], i32 7
-// CHECK: ret <8 x half> [[VLD1_LANE]]
+// CHECK-LABEL: define <8 x half> @test_vld1q_lane_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
+// CHECK-NEXT: [[TMP2:%.*]] = load half, ptr [[A]], align 2
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <8 x half> [[TMP1]], half [[TMP2]], i32 7
+// CHECK-NEXT: ret <8 x half> [[VLD1_LANE]]
+//
float16x8_t test_vld1q_lane_f16(float16_t const * a, float16x8_t b) {
return vld1q_lane_f16(a, b, 7);
}
-// CHECK-LABEL: @test_vld1q_lane_f32(
-// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
-// CHECK: [[TMP4:%.*]] = load float, ptr %a, align 4
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x float> [[TMP2]], float [[TMP4]], i32 3
-// CHECK: ret <4 x float> [[VLD1_LANE]]
+// CHECK-LABEL: define <4 x float> @test_vld1q_lane_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[A]], align 4
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <4 x float> [[TMP1]], float [[TMP2]], i32 3
+// CHECK-NEXT: ret <4 x float> [[VLD1_LANE]]
+//
float32x4_t test_vld1q_lane_f32(float32_t const * a, float32x4_t b) {
return vld1q_lane_f32(a, b, 3);
}
-// CHECK-LABEL: @test_vld1q_lane_p8(
-// CHECK: [[TMP0:%.*]] = load i8, ptr %a, align 1
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <16 x i8> %b, i8 [[TMP0]], i32 15
-// CHECK: ret <16 x i8> [[VLD1_LANE]]
+// CHECK-LABEL: define <16 x i8> @test_vld1q_lane_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <16 x i8> [[B]], i8 [[TMP0]], i32 15
+// CHECK-NEXT: ret <16 x i8> [[VLD1_LANE]]
+//
poly8x16_t test_vld1q_lane_p8(poly8_t const * a, poly8x16_t b) {
return vld1q_lane_p8(a, b, 15);
}
-// CHECK-LABEL: @test_vld1q_lane_p16(
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[TMP4:%.*]] = load i16, ptr %a, align 2
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7
-// CHECK: ret <8 x i16> [[VLD1_LANE]]
+// CHECK-LABEL: define <8 x i16> @test_vld1q_lane_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr [[A]], align 2
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP1]], i16 [[TMP2]], i32 7
+// CHECK-NEXT: ret <8 x i16> [[VLD1_LANE]]
+//
poly16x8_t test_vld1q_lane_p16(poly16_t const * a, poly16x8_t b) {
return vld1q_lane_p16(a, b, 7);
}
-// CHECK-LABEL: @test_vld1_lane_u8(
-// CHECK: [[TMP0:%.*]] = load i8, ptr %a, align 1
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i8> %b, i8 [[TMP0]], i32 7
-// CHECK: ret <8 x i8> [[VLD1_LANE]]
+// CHECK-LABEL: define <8 x i8> @test_vld1_lane_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <8 x i8> [[B]], i8 [[TMP0]], i32 7
+// CHECK-NEXT: ret <8 x i8> [[VLD1_LANE]]
+//
uint8x8_t test_vld1_lane_u8(uint8_t const * a, uint8x8_t b) {
return vld1_lane_u8(a, b, 7);
}
-// CHECK-LABEL: @test_vld1_lane_u16(
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[TMP4:%.*]] = load i16, ptr %a, align 2
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3
-// CHECK: ret <4 x i16> [[VLD1_LANE]]
+// CHECK-LABEL: define <4 x i16> @test_vld1_lane_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr [[A]], align 2
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP1]], i16 [[TMP2]], i32 3
+// CHECK-NEXT: ret <4 x i16> [[VLD1_LANE]]
+//
uint16x4_t test_vld1_lane_u16(uint16_t const * a, uint16x4_t b) {
return vld1_lane_u16(a, b, 3);
}
-// CHECK-LABEL: @test_vld1_lane_u32(
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[TMP4:%.*]] = load i32, ptr %a, align 4
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[TMP4]], i32 1
-// CHECK: ret <2 x i32> [[VLD1_LANE]]
+// CHECK-LABEL: define <2 x i32> @test_vld1_lane_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[TMP2]], i32 1
+// CHECK-NEXT: ret <2 x i32> [[VLD1_LANE]]
+//
uint32x2_t test_vld1_lane_u32(uint32_t const * a, uint32x2_t b) {
return vld1_lane_u32(a, b, 1);
}
-// CHECK-LABEL: @test_vld1_lane_u64(
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[TMP4:%.*]] = load i64, ptr %a, align 4
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <1 x i64> [[TMP2]], i64 [[TMP4]], i32 0
-// CHECK: ret <1 x i64> [[VLD1_LANE]]
+// CHECK-LABEL: define <1 x i64> @test_vld1_lane_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr [[A]], align 4
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <1 x i64> [[TMP1]], i64 [[TMP2]], i32 0
+// CHECK-NEXT: ret <1 x i64> [[VLD1_LANE]]
+//
uint64x1_t test_vld1_lane_u64(uint64_t const * a, uint64x1_t b) {
return vld1_lane_u64(a, b, 0);
}
-// CHECK-LABEL: @test_vld1_lane_s8(
-// CHECK: [[TMP0:%.*]] = load i8, ptr %a, align 1
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i8> %b, i8 [[TMP0]], i32 7
-// CHECK: ret <8 x i8> [[VLD1_LANE]]
+// CHECK-LABEL: define <8 x i8> @test_vld1_lane_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <8 x i8> [[B]], i8 [[TMP0]], i32 7
+// CHECK-NEXT: ret <8 x i8> [[VLD1_LANE]]
+//
int8x8_t test_vld1_lane_s8(int8_t const * a, int8x8_t b) {
return vld1_lane_s8(a, b, 7);
}
-// CHECK-LABEL: @test_vld1_lane_s16(
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[TMP4:%.*]] = load i16, ptr %a, align 2
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3
-// CHECK: ret <4 x i16> [[VLD1_LANE]]
+// CHECK-LABEL: define <4 x i16> @test_vld1_lane_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr [[A]], align 2
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP1]], i16 [[TMP2]], i32 3
+// CHECK-NEXT: ret <4 x i16> [[VLD1_LANE]]
+//
int16x4_t test_vld1_lane_s16(int16_t const * a, int16x4_t b) {
return vld1_lane_s16(a, b, 3);
}
-// CHECK-LABEL: @test_vld1_lane_s32(
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[TMP4:%.*]] = load i32, ptr %a, align 4
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[TMP4]], i32 1
-// CHECK: ret <2 x i32> [[VLD1_LANE]]
+// CHECK-LABEL: define <2 x i32> @test_vld1_lane_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[TMP2]], i32 1
+// CHECK-NEXT: ret <2 x i32> [[VLD1_LANE]]
+//
int32x2_t test_vld1_lane_s32(int32_t const * a, int32x2_t b) {
return vld1_lane_s32(a, b, 1);
}
-// CHECK-LABEL: @test_vld1_lane_s64(
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[TMP4:%.*]] = load i64, ptr %a, align 4
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <1 x i64> [[TMP2]], i64 [[TMP4]], i32 0
-// CHECK: ret <1 x i64> [[VLD1_LANE]]
+// CHECK-LABEL: define <1 x i64> @test_vld1_lane_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr [[A]], align 4
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <1 x i64> [[TMP1]], i64 [[TMP2]], i32 0
+// CHECK-NEXT: ret <1 x i64> [[VLD1_LANE]]
+//
int64x1_t test_vld1_lane_s64(int64_t const * a, int64x1_t b) {
return vld1_lane_s64(a, b, 0);
}
-// CHECK-LABEL: @test_vld1_lane_f16(
-// CHECK: [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
-// CHECK: [[TMP4:%.*]] = load half, ptr %a, align 2
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x half> [[TMP2]], half [[TMP4]], i32 3
-// CHECK: ret <4 x half> [[VLD1_LANE]]
+// CHECK-LABEL: define <4 x half> @test_vld1_lane_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
+// CHECK-NEXT: [[TMP2:%.*]] = load half, ptr [[A]], align 2
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <4 x half> [[TMP1]], half [[TMP2]], i32 3
+// CHECK-NEXT: ret <4 x half> [[VLD1_LANE]]
+//
float16x4_t test_vld1_lane_f16(float16_t const * a, float16x4_t b) {
return vld1_lane_f16(a, b, 3);
}
-// CHECK-LABEL: @test_vld1_lane_f32(
-// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
-// CHECK: [[TMP4:%.*]] = load float, ptr %a, align 4
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP4]], i32 1
-// CHECK: ret <2 x float> [[VLD1_LANE]]
+// CHECK-LABEL: define <2 x float> @test_vld1_lane_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[A]], align 4
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <2 x float> [[TMP1]], float [[TMP2]], i32 1
+// CHECK-NEXT: ret <2 x float> [[VLD1_LANE]]
+//
float32x2_t test_vld1_lane_f32(float32_t const * a, float32x2_t b) {
return vld1_lane_f32(a, b, 1);
}
-// CHECK-LABEL: @test_vld1_lane_p8(
-// CHECK: [[TMP0:%.*]] = load i8, ptr %a, align 1
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i8> %b, i8 [[TMP0]], i32 7
-// CHECK: ret <8 x i8> [[VLD1_LANE]]
+// CHECK-LABEL: define <8 x i8> @test_vld1_lane_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <8 x i8> [[B]], i8 [[TMP0]], i32 7
+// CHECK-NEXT: ret <8 x i8> [[VLD1_LANE]]
+//
poly8x8_t test_vld1_lane_p8(poly8_t const * a, poly8x8_t b) {
return vld1_lane_p8(a, b, 7);
}
-// CHECK-LABEL: @test_vld1_lane_p16(
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[TMP4:%.*]] = load i16, ptr %a, align 2
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3
-// CHECK: ret <4 x i16> [[VLD1_LANE]]
+// CHECK-LABEL: define <4 x i16> @test_vld1_lane_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr [[A]], align 2
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP1]], i16 [[TMP2]], i32 3
+// CHECK-NEXT: ret <4 x i16> [[VLD1_LANE]]
+//
poly16x4_t test_vld1_lane_p16(poly16_t const * a, poly16x4_t b) {
return vld1_lane_p16(a, b, 3);
}
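
Not part of the patch: the lane arguments in all of the vld1_lane tests above are literals because the intrinsics require a compile-time-constant lane index. A minimal sketch, hypothetical helper name:

#include <arm_neon.h>
#include <stdint.h>

/* Loads *p into lane 7, the last valid index for an 8 x 16-bit vector,
   leaving lanes 0..6 of v untouched. */
uint16x8_t replace_last_lane_u16(const uint16_t *p, uint16x8_t v) {
  return vld1q_lane_u16(p, v, 7);
}
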
-// CHECK-LABEL: @test_vld2q_u8(
-// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x2_t, align 16
-// CHECK: [[VLD2Q_V:%.*]] = call { <16 x i8>, <16 x i8>
+// CHECK-LABEL: define void @test_vld2q_u8(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT8X16X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT8X16X2_T]], align 16
+// CHECK-NEXT: [[VLD2Q_V:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2.v16i8.p0(ptr [[A]], i32 1)
+// CHECK-NEXT: store { <16 x i8>, <16 x i8> } [[VLD2Q_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 32, i1 false)
+// CHECK-NEXT: ret void
+//
uint8x16x2_t test_vld2q_u8(uint8_t const * a) {
return vld2q_u8(a);
}
-// CHECK-LABEL: @test_vld2q_u16(
-// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK: [[VLD2Q_V:%.*]] = call { <8 x i16>, <8 x i16>
+// CHECK-LABEL: define void @test_vld2q_u16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT16X8X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT16X8X2_T]], align 16
+// CHECK-NEXT: [[VLD2Q_V:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2.v8i16.p0(ptr [[A]], i32 2)
+// CHECK-NEXT: store { <8 x i16>, <8 x i16> } [[VLD2Q_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 32, i1 false)
+// CHECK-NEXT: ret void
+//
uint16x8x2_t test_vld2q_u16(uint16_t const * a) {
return vld2q_u16(a);
}
-// CHECK-LABEL: @test_vld2q_u32(
-// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK: [[VLD2Q_V:%.*]] = call { <4 x i32>, <4 x i32>
+// CHECK-LABEL: define void @test_vld2q_u32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT32X4X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT32X4X2_T]], align 16
+// CHECK-NEXT: [[VLD2Q_V:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0(ptr [[A]], i32 4)
+// CHECK-NEXT: store { <4 x i32>, <4 x i32> } [[VLD2Q_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 32, i1 false)
+// CHECK-NEXT: ret void
+//
uint32x4x2_t test_vld2q_u32(uint32_t const * a) {
return vld2q_u32(a);
}
-// CHECK-LABEL: @test_vld2q_s8(
-// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x2_t, align 16
-// CHECK: [[VLD2Q_V:%.*]] = call { <16 x i8>, <16 x i8>
+// CHECK-LABEL: define void @test_vld2q_s8(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT8X16X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT8X16X2_T]], align 16
+// CHECK-NEXT: [[VLD2Q_V:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2.v16i8.p0(ptr [[A]], i32 1)
+// CHECK-NEXT: store { <16 x i8>, <16 x i8> } [[VLD2Q_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 32, i1 false)
+// CHECK-NEXT: ret void
+//
int8x16x2_t test_vld2q_s8(int8_t const * a) {
return vld2q_s8(a);
}
-// CHECK-LABEL: @test_vld2q_s16(
-// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x2_t, align 16
-// CHECK: [[VLD2Q_V:%.*]] = call { <8 x i16>, <8 x i16>
+// CHECK-LABEL: define void @test_vld2q_s16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT16X8X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT16X8X2_T]], align 16
+// CHECK-NEXT: [[VLD2Q_V:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2.v8i16.p0(ptr [[A]], i32 2)
+// CHECK-NEXT: store { <8 x i16>, <8 x i16> } [[VLD2Q_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 32, i1 false)
+// CHECK-NEXT: ret void
+//
int16x8x2_t test_vld2q_s16(int16_t const * a) {
return vld2q_s16(a);
}
-// CHECK-LABEL: @test_vld2q_s32(
-// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x2_t, align 16
-// CHECK: [[VLD2Q_V:%.*]] = call { <4 x i32>, <4 x i32>
+// CHECK-LABEL: define void @test_vld2q_s32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT32X4X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT32X4X2_T]], align 16
+// CHECK-NEXT: [[VLD2Q_V:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0(ptr [[A]], i32 4)
+// CHECK-NEXT: store { <4 x i32>, <4 x i32> } [[VLD2Q_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 32, i1 false)
+// CHECK-NEXT: ret void
+//
int32x4x2_t test_vld2q_s32(int32_t const * a) {
return vld2q_s32(a);
}
-// CHECK-LABEL: @test_vld2q_f16(
-// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x2_t, align 16
-// CHECK: [[VLD2Q_V:%.*]] = call { <8 x half>, <8 x half>
+// CHECK-LABEL: define void @test_vld2q_f16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT16X8X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT16X8X2_T]], align 16
+// CHECK-NEXT: [[VLD2Q_V:%.*]] = call { <8 x half>, <8 x half> } @llvm.arm.neon.vld2.v8f16.p0(ptr [[A]], i32 2)
+// CHECK-NEXT: store { <8 x half>, <8 x half> } [[VLD2Q_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 32, i1 false)
+// CHECK-NEXT: ret void
+//
float16x8x2_t test_vld2q_f16(float16_t const * a) {
return vld2q_f16(a);
}
-// CHECK-LABEL: @test_vld2q_f32(
-// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x2_t, align 16
-// CHECK: [[VLD2Q_V:%.*]] = call { <4 x float>, <4 x float>
+// CHECK-LABEL: define void @test_vld2q_f32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT32X4X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT32X4X2_T]], align 16
+// CHECK-NEXT: [[VLD2Q_V:%.*]] = call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2.v4f32.p0(ptr [[A]], i32 4)
+// CHECK-NEXT: store { <4 x float>, <4 x float> } [[VLD2Q_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 32, i1 false)
+// CHECK-NEXT: ret void
+//
float32x4x2_t test_vld2q_f32(float32_t const * a) {
return vld2q_f32(a);
}
-// CHECK-LABEL: @test_vld2q_p8(
-// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x2_t, align 16
-// CHECK: [[VLD2Q_V:%.*]] = call { <16 x i8>, <16 x i8>
+// CHECK-LABEL: define void @test_vld2q_p8(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY8X16X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY8X16X2_T]], align 16
+// CHECK-NEXT: [[VLD2Q_V:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2.v16i8.p0(ptr [[A]], i32 1)
+// CHECK-NEXT: store { <16 x i8>, <16 x i8> } [[VLD2Q_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 32, i1 false)
+// CHECK-NEXT: ret void
+//
poly8x16x2_t test_vld2q_p8(poly8_t const * a) {
return vld2q_p8(a);
}
-// CHECK-LABEL: @test_vld2q_p16(
-// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x2_t, align 16
-// CHECK: [[VLD2Q_V:%.*]] = call { <8 x i16>, <8 x i16>
+// CHECK-LABEL: define void @test_vld2q_p16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY16X8X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY16X8X2_T]], align 16
+// CHECK-NEXT: [[VLD2Q_V:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2.v8i16.p0(ptr [[A]], i32 2)
+// CHECK-NEXT: store { <8 x i16>, <8 x i16> } [[VLD2Q_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 32, i1 false)
+// CHECK-NEXT: ret void
+//
poly16x8x2_t test_vld2q_p16(poly16_t const * a) {
return vld2q_p16(a);
}
-// CHECK-LABEL: @test_vld2_u8(
-// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK: [[VLD2_V:%.*]] = call { <8 x i8>, <8 x i8>
+// CHECK-LABEL: define void @test_vld2_u8(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT8X8X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT8X8X2_T]], align 8
+// CHECK-NEXT: [[VLD2_V:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2.v8i8.p0(ptr [[A]], i32 1)
+// CHECK-NEXT: store { <8 x i8>, <8 x i8> } [[VLD2_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 16, i1 false)
+// CHECK-NEXT: ret void
+//
uint8x8x2_t test_vld2_u8(uint8_t const * a) {
return vld2_u8(a);
}
-// CHECK-LABEL: @test_vld2_u16(
-// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x2_t, align 8
-// CHECK: [[VLD2_V:%.*]] = call { <4 x i16>, <4 x i16>
+// CHECK-LABEL: define void @test_vld2_u16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT16X4X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT16X4X2_T]], align 8
+// CHECK-NEXT: [[VLD2_V:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2.v4i16.p0(ptr [[A]], i32 2)
+// CHECK-NEXT: store { <4 x i16>, <4 x i16> } [[VLD2_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 16, i1 false)
+// CHECK-NEXT: ret void
+//
uint16x4x2_t test_vld2_u16(uint16_t const * a) {
return vld2_u16(a);
}
-// CHECK-LABEL: @test_vld2_u32(
-// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x2_t, align 8
-// CHECK: [[VLD2_V:%.*]] = call { <2 x i32>, <2 x i32>
+// CHECK-LABEL: define void @test_vld2_u32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT32X2X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT32X2X2_T]], align 8
+// CHECK-NEXT: [[VLD2_V:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2.v2i32.p0(ptr [[A]], i32 4)
+// CHECK-NEXT: store { <2 x i32>, <2 x i32> } [[VLD2_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 16, i1 false)
+// CHECK-NEXT: ret void
+//
uint32x2x2_t test_vld2_u32(uint32_t const * a) {
return vld2_u32(a);
}
-// CHECK-LABEL: @test_vld2_u64(
-// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x2_t, align 8
-// CHECK: [[VLD2_V:%.*]] = call { <1 x i64>, <1 x i64>
+// CHECK-LABEL: define void @test_vld2_u64(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT64X1X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT64X1X2_T]], align 8
+// CHECK-NEXT: [[VLD2_V:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2.v1i64.p0(ptr [[A]], i32 4)
+// CHECK-NEXT: store { <1 x i64>, <1 x i64> } [[VLD2_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 16, i1 false)
+// CHECK-NEXT: ret void
+//
uint64x1x2_t test_vld2_u64(uint64_t const * a) {
return vld2_u64(a);
}
-// CHECK-LABEL: @test_vld2_s8(
-// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK: [[VLD2_V:%.*]] = call { <8 x i8>, <8 x i8>
+// CHECK-LABEL: define void @test_vld2_s8(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT8X8X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT8X8X2_T]], align 8
+// CHECK-NEXT: [[VLD2_V:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2.v8i8.p0(ptr [[A]], i32 1)
+// CHECK-NEXT: store { <8 x i8>, <8 x i8> } [[VLD2_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 16, i1 false)
+// CHECK-NEXT: ret void
+//
int8x8x2_t test_vld2_s8(int8_t const * a) {
return vld2_s8(a);
}
-// CHECK-LABEL: @test_vld2_s16(
-// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x2_t, align 8
-// CHECK: [[VLD2_V:%.*]] = call { <4 x i16>, <4 x i16>
+// CHECK-LABEL: define void @test_vld2_s16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT16X4X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT16X4X2_T]], align 8
+// CHECK-NEXT: [[VLD2_V:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2.v4i16.p0(ptr [[A]], i32 2)
+// CHECK-NEXT: store { <4 x i16>, <4 x i16> } [[VLD2_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 16, i1 false)
+// CHECK-NEXT: ret void
+//
int16x4x2_t test_vld2_s16(int16_t const * a) {
return vld2_s16(a);
}
-// CHECK-LABEL: @test_vld2_s32(
-// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x2_t, align 8
-// CHECK: [[VLD2_V:%.*]] = call { <2 x i32>, <2 x i32>
+// CHECK-LABEL: define void @test_vld2_s32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT32X2X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT32X2X2_T]], align 8
+// CHECK-NEXT: [[VLD2_V:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2.v2i32.p0(ptr [[A]], i32 4)
+// CHECK-NEXT: store { <2 x i32>, <2 x i32> } [[VLD2_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 16, i1 false)
+// CHECK-NEXT: ret void
+//
int32x2x2_t test_vld2_s32(int32_t const * a) {
return vld2_s32(a);
}
-// CHECK-LABEL: @test_vld2_s64(
-// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x2_t, align 8
-// CHECK: [[VLD2_V:%.*]] = call { <1 x i64>, <1 x i64>
+// CHECK-LABEL: define void @test_vld2_s64(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT64X1X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT64X1X2_T]], align 8
+// CHECK-NEXT: [[VLD2_V:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2.v1i64.p0(ptr [[A]], i32 4)
+// CHECK-NEXT: store { <1 x i64>, <1 x i64> } [[VLD2_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 16, i1 false)
+// CHECK-NEXT: ret void
+//
int64x1x2_t test_vld2_s64(int64_t const * a) {
return vld2_s64(a);
}
-// CHECK-LABEL: @test_vld2_f16(
-// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x2_t, align 8
-// CHECK: [[VLD2_V:%.*]] = call { <4 x half>, <4 x half>
+// CHECK-LABEL: define void @test_vld2_f16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT16X4X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT16X4X2_T]], align 8
+// CHECK-NEXT: [[VLD2_V:%.*]] = call { <4 x half>, <4 x half> } @llvm.arm.neon.vld2.v4f16.p0(ptr [[A]], i32 2)
+// CHECK-NEXT: store { <4 x half>, <4 x half> } [[VLD2_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 16, i1 false)
+// CHECK-NEXT: ret void
+//
float16x4x2_t test_vld2_f16(float16_t const * a) {
return vld2_f16(a);
}
-// CHECK-LABEL: @test_vld2_f32(
-// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x2_t, align 8
-// CHECK: [[VLD2_V:%.*]] = call { <2 x float>, <2 x float>
+// CHECK-LABEL: define void @test_vld2_f32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT32X2X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT32X2X2_T]], align 8
+// CHECK-NEXT: [[VLD2_V:%.*]] = call { <2 x float>, <2 x float> } @llvm.arm.neon.vld2.v2f32.p0(ptr [[A]], i32 4)
+// CHECK-NEXT: store { <2 x float>, <2 x float> } [[VLD2_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 16, i1 false)
+// CHECK-NEXT: ret void
+//
float32x2x2_t test_vld2_f32(float32_t const * a) {
return vld2_f32(a);
}
-// CHECK-LABEL: @test_vld2_p8(
-// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK: [[VLD2_V:%.*]] = call { <8 x i8>, <8 x i8>
+// CHECK-LABEL: define void @test_vld2_p8(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY8X8X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY8X8X2_T]], align 8
+// CHECK-NEXT: [[VLD2_V:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2.v8i8.p0(ptr [[A]], i32 1)
+// CHECK-NEXT: store { <8 x i8>, <8 x i8> } [[VLD2_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 16, i1 false)
+// CHECK-NEXT: ret void
+//
poly8x8x2_t test_vld2_p8(poly8_t const * a) {
return vld2_p8(a);
}
-// CHECK-LABEL: @test_vld2_p16(
-// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x2_t, align 8
-// CHECK: [[VLD2_V:%.*]] = call { <4 x i16>, <4 x i16>
+// CHECK-LABEL: define void @test_vld2_p16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY16X4X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY16X4X2_T]], align 8
+// CHECK-NEXT: [[VLD2_V:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2.v4i16.p0(ptr [[A]], i32 2)
+// CHECK-NEXT: store { <4 x i16>, <4 x i16> } [[VLD2_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 16, i1 false)
+// CHECK-NEXT: ret void
+//
poly16x4x2_t test_vld2_p16(poly16_t const * a) {
return vld2_p16(a);
}
-// CHECK-LABEL: @test_vld2q_lane_u16(
-// CHECK: [[B:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
-// CHECK: [[VLD2Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>
+// CHECK-LABEL: define void @test_vld2q_lane_u16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT16X8X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT16X8X2_T]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT16X8X2_T]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT16X8X2_T]], align 16
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X8X2_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X8X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X8X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x i16>
+// CHECK-NEXT: [[VLD2Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2lane.v8i16.p0(ptr [[A]], <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], i32 7, i32 2)
+// CHECK-NEXT: store { <8 x i16>, <8 x i16> } [[VLD2Q_LANE_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 32, i1 false)
+// CHECK-NEXT: ret void
+//
uint16x8x2_t test_vld2q_lane_u16(uint16_t const * a, uint16x8x2_t b) {
return vld2q_lane_u16(a, b, 7);
}
-// CHECK-LABEL: @test_vld2q_lane_u32(
-// CHECK: [[B:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x4x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP7:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
-// CHECK: [[VLD2Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32>
+// CHECK-LABEL: define void @test_vld2q_lane_u32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT32X4X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT32X4X2_T]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT32X4X2_T]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT32X4X2_T]], align 16
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X4X2_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X4X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X4X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x i32>
+// CHECK-NEXT: [[VLD2Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32.p0(ptr [[A]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], i32 3, i32 4)
+// CHECK-NEXT: store { <4 x i32>, <4 x i32> } [[VLD2Q_LANE_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 32, i1 false)
+// CHECK-NEXT: ret void
+//
uint32x4x2_t test_vld2q_lane_u32(uint32_t const * a, uint32x4x2_t b) {
return vld2q_lane_u32(a, b, 3);
}
-// CHECK-LABEL: @test_vld2q_lane_s16(
-// CHECK: [[B:%.*]] = alloca %struct.int16x8x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
-// CHECK: [[VLD2Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>
+// CHECK-LABEL: define void @test_vld2q_lane_s16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT16X8X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT16X8X2_T]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT16X8X2_T]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT16X8X2_T]], align 16
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X8X2_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X8X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X8X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x i16>
+// CHECK-NEXT: [[VLD2Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2lane.v8i16.p0(ptr [[A]], <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], i32 7, i32 2)
+// CHECK-NEXT: store { <8 x i16>, <8 x i16> } [[VLD2Q_LANE_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 32, i1 false)
+// CHECK-NEXT: ret void
+//
int16x8x2_t test_vld2q_lane_s16(int16_t const * a, int16x8x2_t b) {
return vld2q_lane_s16(a, b, 7);
}
-// CHECK-LABEL: @test_vld2q_lane_s32(
-// CHECK: [[B:%.*]] = alloca %struct.int32x4x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x4x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP7:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
-// CHECK: [[VLD2Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32>
+// CHECK-LABEL: define void @test_vld2q_lane_s32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT32X4X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT32X4X2_T]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT32X4X2_T]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT32X4X2_T]], align 16
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X4X2_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X4X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X4X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x i32>
+// CHECK-NEXT: [[VLD2Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32.p0(ptr [[A]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], i32 3, i32 4)
+// CHECK-NEXT: store { <4 x i32>, <4 x i32> } [[VLD2Q_LANE_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 32, i1 false)
+// CHECK-NEXT: ret void
+//
int32x4x2_t test_vld2q_lane_s32(int32_t const * a, int32x4x2_t b) {
return vld2q_lane_s32(a, b, 3);
}
-// CHECK-LABEL: @test_vld2q_lane_f16(
-// CHECK: [[B:%.*]] = alloca %struct.float16x8x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP5:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP7:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x half>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x half>
-// CHECK: [[VLD2Q_LANE_V:%.*]] = call { <8 x half>, <8 x half>
+// CHECK-LABEL: define void @test_vld2q_lane_f16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT16X8X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT16X8X2_T]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT16X8X2_T]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X8X2_T]], align 16
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X8X2_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X8X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X8X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x half> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half>
+// CHECK-NEXT: [[VLD2Q_LANE_V:%.*]] = call { <8 x half>, <8 x half> } @llvm.arm.neon.vld2lane.v8f16.p0(ptr [[A]], <8 x half> [[TMP4]], <8 x half> [[TMP5]], i32 7, i32 2)
+// CHECK-NEXT: store { <8 x half>, <8 x half> } [[VLD2Q_LANE_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 32, i1 false)
+// CHECK-NEXT: ret void
+//
float16x8x2_t test_vld2q_lane_f16(float16_t const * a, float16x8x2_t b) {
return vld2q_lane_f16(a, b, 7);
}
-// CHECK-LABEL: @test_vld2q_lane_f32(
-// CHECK: [[B:%.*]] = alloca %struct.float32x4x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x4x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP5:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP7:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
-// CHECK: [[VLD2Q_LANE_V:%.*]] = call { <4 x float>, <4 x float>
+// CHECK-LABEL: define void @test_vld2q_lane_f32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT32X4X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT32X4X2_T]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT32X4X2_T]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X4X2_T]], align 16
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X4X2_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X4X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X4X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float>
+// CHECK-NEXT: [[VLD2Q_LANE_V:%.*]] = call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2lane.v4f32.p0(ptr [[A]], <4 x float> [[TMP4]], <4 x float> [[TMP5]], i32 3, i32 4)
+// CHECK-NEXT: store { <4 x float>, <4 x float> } [[VLD2Q_LANE_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 32, i1 false)
+// CHECK-NEXT: ret void
+//
float32x4x2_t test_vld2q_lane_f32(float32_t const * a, float32x4x2_t b) {
return vld2q_lane_f32(a, b, 3);
}
-// CHECK-LABEL: @test_vld2q_lane_p16(
-// CHECK: [[B:%.*]] = alloca %struct.poly16x8x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
-// CHECK: [[VLD2Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>
+// CHECK-LABEL: define void @test_vld2q_lane_p16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY16X8X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY16X8X2_T]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY16X8X2_T]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY16X8X2_T]], align 16
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X8X2_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X8X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X8X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x i16>
+// CHECK-NEXT: [[VLD2Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2lane.v8i16.p0(ptr [[A]], <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], i32 7, i32 2)
+// CHECK-NEXT: store { <8 x i16>, <8 x i16> } [[VLD2Q_LANE_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 32, i1 false)
+// CHECK-NEXT: ret void
+//
poly16x8x2_t test_vld2q_lane_p16(poly16_t const * a, poly16x8x2_t b) {
return vld2q_lane_p16(a, b, 7);
}
-// CHECK-LABEL: @test_vld2_lane_u8(
-// CHECK: [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[VLD2_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>
+// CHECK-LABEL: define void @test_vld2_lane_u8(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT8X8X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT8X8X2_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT8X8X2_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT8X8X2_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X2_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [2 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[VLD2_LANE_V:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8.p0(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], i32 7, i32 1)
+// CHECK-NEXT: store { <8 x i8>, <8 x i8> } [[VLD2_LANE_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 16, i1 false)
+// CHECK-NEXT: ret void
+//
uint8x8x2_t test_vld2_lane_u8(uint8_t const * a, uint8x8x2_t b) {
return vld2_lane_u8(a, b, 7);
}
-// CHECK-LABEL: @test_vld2_lane_u16(
-// CHECK: [[B:%.*]] = alloca %struct.uint16x4x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x4x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
-// CHECK: [[VLD2_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>
+// CHECK-LABEL: define void @test_vld2_lane_u16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT16X4X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT16X4X2_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT16X4X2_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT16X4X2_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X4X2_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [2 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X4X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X4X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-NEXT: [[VLD2_LANE_V:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16.p0(ptr [[A]], <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], i32 3, i32 2)
+// CHECK-NEXT: store { <4 x i16>, <4 x i16> } [[VLD2_LANE_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 16, i1 false)
+// CHECK-NEXT: ret void
+//
uint16x4x2_t test_vld2_lane_u16(uint16_t const * a, uint16x4x2_t b) {
return vld2_lane_u16(a, b, 3);
}
-// CHECK-LABEL: @test_vld2_lane_u32(
-// CHECK: [[B:%.*]] = alloca %struct.uint32x2x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x2x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP5:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP7:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
-// CHECK: [[VLD2_LANE_V:%.*]] = call { <2 x i32>, <2 x i32>
+// CHECK-LABEL: define void @test_vld2_lane_u32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT32X2X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT32X2X2_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT32X2X2_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT32X2X2_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X2X2_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [2 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X2X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X2X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
+// CHECK-NEXT: [[VLD2_LANE_V:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2lane.v2i32.p0(ptr [[A]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], i32 1, i32 4)
+// CHECK-NEXT: store { <2 x i32>, <2 x i32> } [[VLD2_LANE_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 16, i1 false)
+// CHECK-NEXT: ret void
+//
uint32x2x2_t test_vld2_lane_u32(uint32_t const * a, uint32x2x2_t b) {
return vld2_lane_u32(a, b, 1);
}
-// CHECK-LABEL: @test_vld2_lane_s8(
-// CHECK: [[B:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[VLD2_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>
+// CHECK-LABEL: define void @test_vld2_lane_s8(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT8X8X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT8X8X2_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT8X8X2_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT8X8X2_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X2_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [2 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[VLD2_LANE_V:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8.p0(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], i32 7, i32 1)
+// CHECK-NEXT: store { <8 x i8>, <8 x i8> } [[VLD2_LANE_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 16, i1 false)
+// CHECK-NEXT: ret void
+//
int8x8x2_t test_vld2_lane_s8(int8_t const * a, int8x8x2_t b) {
return vld2_lane_s8(a, b, 7);
}
-// CHECK-LABEL: @test_vld2_lane_s16(
-// CHECK: [[B:%.*]] = alloca %struct.int16x4x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x4x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
-// CHECK: [[VLD2_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>
+// CHECK-LABEL: define void @test_vld2_lane_s16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT16X4X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT16X4X2_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT16X4X2_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT16X4X2_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X4X2_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [2 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X4X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X4X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-NEXT: [[VLD2_LANE_V:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16.p0(ptr [[A]], <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], i32 3, i32 2)
+// CHECK-NEXT: store { <4 x i16>, <4 x i16> } [[VLD2_LANE_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 16, i1 false)
+// CHECK-NEXT: ret void
+//
int16x4x2_t test_vld2_lane_s16(int16_t const * a, int16x4x2_t b) {
return vld2_lane_s16(a, b, 3);
}
-// CHECK-LABEL: @test_vld2_lane_s32(
-// CHECK: [[B:%.*]] = alloca %struct.int32x2x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x2x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP5:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP7:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
-// CHECK: [[VLD2_LANE_V:%.*]] = call { <2 x i32>, <2 x i32>
+// CHECK-LABEL: define void @test_vld2_lane_s32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT32X2X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT32X2X2_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT32X2X2_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT32X2X2_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X2X2_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [2 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X2X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X2X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
+// CHECK-NEXT: [[VLD2_LANE_V:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2lane.v2i32.p0(ptr [[A]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], i32 1, i32 4)
+// CHECK-NEXT: store { <2 x i32>, <2 x i32> } [[VLD2_LANE_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 16, i1 false)
+// CHECK-NEXT: ret void
+//
int32x2x2_t test_vld2_lane_s32(int32_t const * a, int32x2x2_t b) {
return vld2_lane_s32(a, b, 1);
}
-// CHECK-LABEL: @test_vld2_lane_f16(
-// CHECK: [[B:%.*]] = alloca %struct.float16x4x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x4x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP5:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP7:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x half>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x half>
-// CHECK: [[VLD2_LANE_V:%.*]] = call { <4 x half>, <4 x half>
+// CHECK-LABEL: define void @test_vld2_lane_f16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT16X4X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT16X4X2_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT16X4X2_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X4X2_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X4X2_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [2 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X4X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X4X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x half> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half>
+// CHECK-NEXT: [[VLD2_LANE_V:%.*]] = call { <4 x half>, <4 x half> } @llvm.arm.neon.vld2lane.v4f16.p0(ptr [[A]], <4 x half> [[TMP4]], <4 x half> [[TMP5]], i32 3, i32 2)
+// CHECK-NEXT: store { <4 x half>, <4 x half> } [[VLD2_LANE_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 16, i1 false)
+// CHECK-NEXT: ret void
+//
float16x4x2_t test_vld2_lane_f16(float16_t const * a, float16x4x2_t b) {
return vld2_lane_f16(a, b, 3);
}
-// CHECK-LABEL: @test_vld2_lane_f32(
-// CHECK: [[B:%.*]] = alloca %struct.float32x2x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x2x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP5:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x float>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP7:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float>
-// CHECK: [[VLD2_LANE_V:%.*]] = call { <2 x float>, <2 x float>
+// CHECK-LABEL: define void @test_vld2_lane_f32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT32X2X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT32X2X2_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT32X2X2_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X2X2_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X2X2_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [2 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X2X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X2X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x float>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float>
+// CHECK-NEXT: [[VLD2_LANE_V:%.*]] = call { <2 x float>, <2 x float> } @llvm.arm.neon.vld2lane.v2f32.p0(ptr [[A]], <2 x float> [[TMP4]], <2 x float> [[TMP5]], i32 1, i32 4)
+// CHECK-NEXT: store { <2 x float>, <2 x float> } [[VLD2_LANE_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 16, i1 false)
+// CHECK-NEXT: ret void
+//
float32x2x2_t test_vld2_lane_f32(float32_t const * a, float32x2x2_t b) {
return vld2_lane_f32(a, b, 1);
}
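
The floating-point variants follow the same shape. A small sketch (hypothetical name) that refreshes lane 1 of both de-interleaved channels from memory and then sums them element-wise:

#include <arm_neon.h>

static float32x2_t sum_channels(const float32_t *src, float32x2x2_t v) {
  v = vld2_lane_f32(src, v, 1);        // src[0] -> v.val[0][1], src[1] -> v.val[1][1]
  return vadd_f32(v.val[0], v.val[1]); // element-wise sum of the two channels
}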
-// CHECK-LABEL: @test_vld2_lane_p8(
-// CHECK: [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[VLD2_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>
+// CHECK-LABEL: define void @test_vld2_lane_p8(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY8X8X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY8X8X2_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY8X8X2_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY8X8X2_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X2_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [2 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[VLD2_LANE_V:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8.p0(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], i32 7, i32 1)
+// CHECK-NEXT: store { <8 x i8>, <8 x i8> } [[VLD2_LANE_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 16, i1 false)
+// CHECK-NEXT: ret void
+//
poly8x8x2_t test_vld2_lane_p8(poly8_t const * a, poly8x8x2_t b) {
return vld2_lane_p8(a, b, 7);
}
-// CHECK-LABEL: @test_vld2_lane_p16(
-// CHECK: [[B:%.*]] = alloca %struct.poly16x4x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x4x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
-// CHECK: [[VLD2_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>
+// CHECK-LABEL: define void @test_vld2_lane_p16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY16X4X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY16X4X2_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY16X4X2_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY16X4X2_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X4X2_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [2 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X4X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X4X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-NEXT: [[VLD2_LANE_V:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16.p0(ptr [[A]], <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], i32 3, i32 2)
+// CHECK-NEXT: store { <4 x i16>, <4 x i16> } [[VLD2_LANE_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 16, i1 false)
+// CHECK-NEXT: ret void
+//
poly16x4x2_t test_vld2_lane_p16(poly16_t const * a, poly16x4x2_t b) {
return vld2_lane_p16(a, b, 3);
}
-// CHECK-LABEL: @test_vld3q_u8(
-// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x3_t, align 16
-// CHECK: [[VLD3Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>
+// CHECK-LABEL: define void @test_vld3q_u8(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT8X16X3_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT8X16X3_T]], align 16
+// CHECK-NEXT: [[VLD3Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8.p0(ptr [[A]], i32 1)
+// CHECK-NEXT: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3Q_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 48, i1 false)
+// CHECK-NEXT: ret void
+//
uint8x16x3_t test_vld3q_u8(uint8_t const * a) {
return vld3q_u8(a);
}
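
A common use of the whole-structure vld3q loads tested from here on is de-interleaving packed pixel data. A minimal sketch (illustrative name) that extracts the first channel of 16 interleaved RGB pixels:

#include <arm_neon.h>

// vld3q_u8 reads 48 interleaved bytes (RGBRGBRGB...) and de-interleaves
// them into three 16-byte vectors: val[0] = R, val[1] = G, val[2] = B.
static uint8x16_t red_channel(const uint8_t *rgb) {
  uint8x16x3_t px = vld3q_u8(rgb);
  return px.val[0];
}

In the IR, the single trailing i32 operand of @llvm.arm.neon.vld3.v16i8.p0 (i32 1 here) is the alignment in bytes, which tracks the element size across the u8/u16/u32 variants (1, 2 and 4) in the checks that follow.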
-// CHECK-LABEL: @test_vld3q_u16(
-// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x3_t, align 16
-// CHECK: [[VLD3Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>
+// CHECK-LABEL: define void @test_vld3q_u16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT16X8X3_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT16X8X3_T]], align 16
+// CHECK-NEXT: [[VLD3Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3.v8i16.p0(ptr [[A]], i32 2)
+// CHECK-NEXT: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 48, i1 false)
+// CHECK-NEXT: ret void
+//
uint16x8x3_t test_vld3q_u16(uint16_t const * a) {
return vld3q_u16(a);
}
-// CHECK-LABEL: @test_vld3q_u32(
-// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x3_t, align 16
-// CHECK: [[VLD3Q_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>
+// CHECK-LABEL: define void @test_vld3q_u32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT32X4X3_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT32X4X3_T]], align 16
+// CHECK-NEXT: [[VLD3Q_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3.v4i32.p0(ptr [[A]], i32 4)
+// CHECK-NEXT: store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3Q_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 48, i1 false)
+// CHECK-NEXT: ret void
+//
uint32x4x3_t test_vld3q_u32(uint32_t const * a) {
return vld3q_u32(a);
}
-// CHECK-LABEL: @test_vld3q_s8(
-// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x3_t, align 16
-// CHECK: [[VLD3Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>
+// CHECK-LABEL: define void @test_vld3q_s8(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT8X16X3_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT8X16X3_T]], align 16
+// CHECK-NEXT: [[VLD3Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8.p0(ptr [[A]], i32 1)
+// CHECK-NEXT: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3Q_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 48, i1 false)
+// CHECK-NEXT: ret void
+//
int8x16x3_t test_vld3q_s8(int8_t const * a) {
return vld3q_s8(a);
}
-// CHECK-LABEL: @test_vld3q_s16(
-// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x3_t, align 16
-// CHECK: [[VLD3Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>
+// CHECK-LABEL: define void @test_vld3q_s16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT16X8X3_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT16X8X3_T]], align 16
+// CHECK-NEXT: [[VLD3Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3.v8i16.p0(ptr [[A]], i32 2)
+// CHECK-NEXT: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 48, i1 false)
+// CHECK-NEXT: ret void
+//
int16x8x3_t test_vld3q_s16(int16_t const * a) {
return vld3q_s16(a);
}
-// CHECK-LABEL: @test_vld3q_s32(
-// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x3_t, align 16
-// CHECK: [[VLD3Q_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>
+// CHECK-LABEL: define void @test_vld3q_s32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT32X4X3_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT32X4X3_T]], align 16
+// CHECK-NEXT: [[VLD3Q_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3.v4i32.p0(ptr [[A]], i32 4)
+// CHECK-NEXT: store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3Q_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 48, i1 false)
+// CHECK-NEXT: ret void
+//
int32x4x3_t test_vld3q_s32(int32_t const * a) {
return vld3q_s32(a);
}
-// CHECK-LABEL: @test_vld3q_f16(
-// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x3_t, align 16
-// CHECK: [[VLD3Q_V:%.*]] = call { <8 x half>, <8 x half>, <8 x half>
+// CHECK-LABEL: define void @test_vld3q_f16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT16X8X3_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT16X8X3_T]], align 16
+// CHECK-NEXT: [[VLD3Q_V:%.*]] = call { <8 x half>, <8 x half>, <8 x half> } @llvm.arm.neon.vld3.v8f16.p0(ptr [[A]], i32 2)
+// CHECK-NEXT: store { <8 x half>, <8 x half>, <8 x half> } [[VLD3Q_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 48, i1 false)
+// CHECK-NEXT: ret void
+//
float16x8x3_t test_vld3q_f16(float16_t const * a) {
return vld3q_f16(a);
}
-// CHECK-LABEL: @test_vld3q_f32(
-// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x3_t, align 16
-// CHECK: [[VLD3Q_V:%.*]] = call { <4 x float>, <4 x float>, <4 x float>
+// CHECK-LABEL: define void @test_vld3q_f32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT32X4X3_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT32X4X3_T]], align 16
+// CHECK-NEXT: [[VLD3Q_V:%.*]] = call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3.v4f32.p0(ptr [[A]], i32 4)
+// CHECK-NEXT: store { <4 x float>, <4 x float>, <4 x float> } [[VLD3Q_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 48, i1 false)
+// CHECK-NEXT: ret void
+//
float32x4x3_t test_vld3q_f32(float32_t const * a) {
return vld3q_f32(a);
}
-// CHECK-LABEL: @test_vld3q_p8(
-// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x3_t, align 16
-// CHECK: [[VLD3Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>
+// CHECK-LABEL: define void @test_vld3q_p8(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY8X16X3_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY8X16X3_T]], align 16
+// CHECK-NEXT: [[VLD3Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8.p0(ptr [[A]], i32 1)
+// CHECK-NEXT: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3Q_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 48, i1 false)
+// CHECK-NEXT: ret void
+//
poly8x16x3_t test_vld3q_p8(poly8_t const * a) {
return vld3q_p8(a);
}
-// CHECK-LABEL: @test_vld3q_p16(
-// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x3_t, align 16
-// CHECK: [[VLD3Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>
+// CHECK-LABEL: define void @test_vld3q_p16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY16X8X3_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY16X8X3_T]], align 16
+// CHECK-NEXT: [[VLD3Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3.v8i16.p0(ptr [[A]], i32 2)
+// CHECK-NEXT: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 48, i1 false)
+// CHECK-NEXT: ret void
+//
poly16x8x3_t test_vld3q_p16(poly16_t const * a) {
return vld3q_p16(a);
}
-// CHECK-LABEL: @test_vld3_u8(
-// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x3_t, align 8
-// CHECK: [[VLD3_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>
+// CHECK-LABEL: define void @test_vld3_u8(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT8X8X3_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT8X8X3_T]], align 8
+// CHECK-NEXT: [[VLD3_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3.v8i8.p0(ptr [[A]], i32 1)
+// CHECK-NEXT: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 24, i1 false)
+// CHECK-NEXT: ret void
+//
uint8x8x3_t test_vld3_u8(uint8_t const * a) {
return vld3_u8(a);
}
-// CHECK-LABEL: @test_vld3_u16(
-// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x3_t, align 8
-// CHECK: [[VLD3_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>
+// CHECK-LABEL: define void @test_vld3_u16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT16X4X3_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT16X4X3_T]], align 8
+// CHECK-NEXT: [[VLD3_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3.v4i16.p0(ptr [[A]], i32 2)
+// CHECK-NEXT: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 24, i1 false)
+// CHECK-NEXT: ret void
+//
uint16x4x3_t test_vld3_u16(uint16_t const * a) {
return vld3_u16(a);
}
-// CHECK-LABEL: @test_vld3_u32(
-// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x3_t, align 8
-// CHECK: [[VLD3_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>
+// CHECK-LABEL: define void @test_vld3_u32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT32X2X3_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT32X2X3_T]], align 8
+// CHECK-NEXT: [[VLD3_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3.v2i32.p0(ptr [[A]], i32 4)
+// CHECK-NEXT: store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 24, i1 false)
+// CHECK-NEXT: ret void
+//
uint32x2x3_t test_vld3_u32(uint32_t const * a) {
return vld3_u32(a);
}
-// CHECK-LABEL: @test_vld3_u64(
-// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x3_t, align 8
-// CHECK: [[VLD3_V:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>
+// CHECK-LABEL: define void @test_vld3_u64(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT64X1X3_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT64X1X3_T]], align 8
+// CHECK-NEXT: [[VLD3_V:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3.v1i64.p0(ptr [[A]], i32 4)
+// CHECK-NEXT: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 24, i1 false)
+// CHECK-NEXT: ret void
+//
uint64x1x3_t test_vld3_u64(uint64_t const * a) {
return vld3_u64(a);
}
-// CHECK-LABEL: @test_vld3_s8(
-// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x3_t, align 8
-// CHECK: [[VLD3_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>
+// CHECK-LABEL: define void @test_vld3_s8(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT8X8X3_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT8X8X3_T]], align 8
+// CHECK-NEXT: [[VLD3_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3.v8i8.p0(ptr [[A]], i32 1)
+// CHECK-NEXT: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 24, i1 false)
+// CHECK-NEXT: ret void
+//
int8x8x3_t test_vld3_s8(int8_t const * a) {
return vld3_s8(a);
}
-// CHECK-LABEL: @test_vld3_s16(
-// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x3_t, align 8
-// CHECK: [[VLD3_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>
+// CHECK-LABEL: define void @test_vld3_s16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT16X4X3_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT16X4X3_T]], align 8
+// CHECK-NEXT: [[VLD3_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3.v4i16.p0(ptr [[A]], i32 2)
+// CHECK-NEXT: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 24, i1 false)
+// CHECK-NEXT: ret void
+//
int16x4x3_t test_vld3_s16(int16_t const * a) {
return vld3_s16(a);
}
-// CHECK-LABEL: @test_vld3_s32(
-// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x3_t, align 8
-// CHECK: [[VLD3_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>
+// CHECK-LABEL: define void @test_vld3_s32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT32X2X3_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT32X2X3_T]], align 8
+// CHECK-NEXT: [[VLD3_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3.v2i32.p0(ptr [[A]], i32 4)
+// CHECK-NEXT: store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 24, i1 false)
+// CHECK-NEXT: ret void
+//
int32x2x3_t test_vld3_s32(int32_t const * a) {
return vld3_s32(a);
}
-// CHECK-LABEL: @test_vld3_s64(
-// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x3_t, align 8
-// CHECK: [[VLD3_V:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>
+// CHECK-LABEL: define void @test_vld3_s64(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT64X1X3_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT64X1X3_T]], align 8
+// CHECK-NEXT: [[VLD3_V:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3.v1i64.p0(ptr [[A]], i32 4)
+// CHECK-NEXT: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 24, i1 false)
+// CHECK-NEXT: ret void
+//
int64x1x3_t test_vld3_s64(int64_t const * a) {
return vld3_s64(a);
}
-// CHECK-LABEL: @test_vld3_f16(
-// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x3_t, align 8
-// CHECK: [[VLD3_V:%.*]] = call { <4 x half>, <4 x half>, <4 x half>
+// CHECK-LABEL: define void @test_vld3_f16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT16X4X3_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT16X4X3_T]], align 8
+// CHECK-NEXT: [[VLD3_V:%.*]] = call { <4 x half>, <4 x half>, <4 x half> } @llvm.arm.neon.vld3.v4f16.p0(ptr [[A]], i32 2)
+// CHECK-NEXT: store { <4 x half>, <4 x half>, <4 x half> } [[VLD3_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 24, i1 false)
+// CHECK-NEXT: ret void
+//
float16x4x3_t test_vld3_f16(float16_t const * a) {
return vld3_f16(a);
}
-// CHECK-LABEL: @test_vld3_f32(
-// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x3_t, align 8
-// CHECK: [[VLD3_V:%.*]] = call { <2 x float>, <2 x float>, <2 x float>
+// CHECK-LABEL: define void @test_vld3_f32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT32X2X3_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT32X2X3_T]], align 8
+// CHECK-NEXT: [[VLD3_V:%.*]] = call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3.v2f32.p0(ptr [[A]], i32 4)
+// CHECK-NEXT: store { <2 x float>, <2 x float>, <2 x float> } [[VLD3_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 24, i1 false)
+// CHECK-NEXT: ret void
+//
float32x2x3_t test_vld3_f32(float32_t const * a) {
return vld3_f32(a);
}
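
The 64-bit D-register forms operate on three 8-byte vectors in the same way. A sketch (hypothetical helper) that de-interleaves six floats and averages the three planes:

#include <arm_neon.h>

static float32x2_t average3(const float32_t *p) {
  float32x2x3_t t = vld3_f32(p); // three de-interleaved <2 x float> vectors
  float32x2_t s = vadd_f32(vadd_f32(t.val[0], t.val[1]), t.val[2]);
  return vmul_n_f32(s, 1.0f / 3.0f); // scale each lane by 1/3
}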
-// CHECK-LABEL: @test_vld3_p8(
-// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x3_t, align 8
-// CHECK: [[VLD3_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>
+// CHECK-LABEL: define void @test_vld3_p8(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY8X8X3_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY8X8X3_T]], align 8
+// CHECK-NEXT: [[VLD3_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3.v8i8.p0(ptr [[A]], i32 1)
+// CHECK-NEXT: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 24, i1 false)
+// CHECK-NEXT: ret void
+//
poly8x8x3_t test_vld3_p8(poly8_t const * a) {
return vld3_p8(a);
}
-// CHECK-LABEL: @test_vld3_p16(
-// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x3_t, align 8
-// CHECK: [[VLD3_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>
+// CHECK-LABEL: define void @test_vld3_p16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY16X4X3_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY16X4X3_T]], align 8
+// CHECK-NEXT: [[VLD3_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3.v4i16.p0(ptr [[A]], i32 2)
+// CHECK-NEXT: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 24, i1 false)
+// CHECK-NEXT: ret void
+//
poly16x4x3_t test_vld3_p16(poly16_t const * a) {
return vld3_p16(a);
}
-// CHECK-LABEL: @test_vld3q_lane_u16(
-// CHECK: [[B:%.*]] = alloca %struct.uint16x8x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP9:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
-// CHECK: [[VLD3Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>
+// CHECK-LABEL: define void @test_vld3q_lane_u16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT16X8X3_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT16X8X3_T]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT16X8X3_T]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT16X8X3_T]], align 16
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X8X3_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [6 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X8X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X8X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X8X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x i16>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK-NEXT: [[VLD3Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16.p0(ptr [[A]], <8 x i16> [[TMP6]], <8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i32 7, i32 2)
+// CHECK-NEXT: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_LANE_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 48, i1 false)
+// CHECK-NEXT: ret void
+//
uint16x8x3_t test_vld3q_lane_u16(uint16_t const * a, uint16x8x3_t b) {
return vld3q_lane_u16(a, b, 7);
}
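
The three-register lane loads checked in this and the following hunks mirror the two-register ones. A sketch (illustrative name) matching the test above; in the IR call, i32 7 is the constant lane index and i32 2 the alignment in bytes:

#include <arm_neon.h>

// Reloads lane 7 of each of the three vectors from three consecutive
// uint16_t values at src; all other lanes of v are left unchanged.
static uint16x8x3_t patch_lane7_x3(const uint16_t *src, uint16x8x3_t v) {
  return vld3q_lane_u16(src, v, 7);
}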
-// CHECK-LABEL: @test_vld3q_lane_u32(
-// CHECK: [[B:%.*]] = alloca %struct.uint32x4x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP7:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP9:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
-// CHECK: [[VLD3Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>
+// CHECK-LABEL: define void @test_vld3q_lane_u32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT32X4X3_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT32X4X3_T]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT32X4X3_T]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT32X4X3_T]], align 16
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X4X3_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [6 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X4X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X4X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X4X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x i32>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
+// CHECK-NEXT: [[VLD3Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32.p0(ptr [[A]], <4 x i32> [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i32 3, i32 4)
+// CHECK-NEXT: store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3Q_LANE_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 48, i1 false)
+// CHECK-NEXT: ret void
+//
uint32x4x3_t test_vld3q_lane_u32(uint32_t const * a, uint32x4x3_t b) {
return vld3q_lane_u32(a, b, 3);
}
-// CHECK-LABEL: @test_vld3q_lane_s16(
-// CHECK: [[B:%.*]] = alloca %struct.int16x8x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP9:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
-// CHECK: [[VLD3Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>
+// CHECK-LABEL: define void @test_vld3q_lane_s16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT16X8X3_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT16X8X3_T]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT16X8X3_T]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT16X8X3_T]], align 16
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X8X3_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [6 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X8X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X8X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X8X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x i16>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK-NEXT: [[VLD3Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16.p0(ptr [[A]], <8 x i16> [[TMP6]], <8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i32 7, i32 2)
+// CHECK-NEXT: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_LANE_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 48, i1 false)
+// CHECK-NEXT: ret void
+//
int16x8x3_t test_vld3q_lane_s16(int16_t const * a, int16x8x3_t b) {
return vld3q_lane_s16(a, b, 7);
}
-// CHECK-LABEL: @test_vld3q_lane_s32(
-// CHECK: [[B:%.*]] = alloca %struct.int32x4x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP7:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP9:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
-// CHECK: [[VLD3Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>
+// CHECK-LABEL: define void @test_vld3q_lane_s32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT32X4X3_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT32X4X3_T]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT32X4X3_T]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT32X4X3_T]], align 16
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X4X3_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [6 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X4X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X4X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X4X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x i32>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
+// CHECK-NEXT: [[VLD3Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32.p0(ptr [[A]], <4 x i32> [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i32 3, i32 4)
+// CHECK-NEXT: store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3Q_LANE_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 48, i1 false)
+// CHECK-NEXT: ret void
+//
int32x4x3_t test_vld3q_lane_s32(int32_t const * a, int32x4x3_t b) {
return vld3q_lane_s32(a, b, 3);
}
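// Note on the regenerated checks above (reviewer commentary, not emitted by
// the update script): in the @llvm.arm.neon.vld3lane.* calls the next-to-last
// i32 operand is the lane index (3 here) and the last i32 is the alignment in
// bytes (4 for <4 x i32> elements). The three-vector aggregate is returned
// indirectly through the sret pointer and copied out with llvm.memcpy.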
-// CHECK-LABEL: @test_vld3q_lane_f16(
-// CHECK: [[B:%.*]] = alloca %struct.float16x8x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP5:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP7:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP9:%.*]] = load <8 x half>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP10:%.*]] = bitcast <8 x half> [[TMP9]] to <16 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x half>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x half>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x half>
-// CHECK: [[VLD3Q_LANE_V:%.*]] = call { <8 x half>, <8 x half>, <8 x half>
+// CHECK-LABEL: define void @test_vld3q_lane_f16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT16X8X3_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT16X8X3_T]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT16X8X3_T]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X8X3_T]], align 16
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X8X3_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [6 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X8X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X8X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x half> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X8X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP4:%.*]] = load <8 x half>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
+// CHECK-NEXT: [[VLD3Q_LANE_V:%.*]] = call { <8 x half>, <8 x half>, <8 x half> } @llvm.arm.neon.vld3lane.v8f16.p0(ptr [[A]], <8 x half> [[TMP6]], <8 x half> [[TMP7]], <8 x half> [[TMP8]], i32 7, i32 2)
+// CHECK-NEXT: store { <8 x half>, <8 x half>, <8 x half> } [[VLD3Q_LANE_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 48, i1 false)
+// CHECK-NEXT: ret void
+//
float16x8x3_t test_vld3q_lane_f16(float16_t const * a, float16x8x3_t b) {
return vld3q_lane_f16(a, b, 7);
}
-// CHECK-LABEL: @test_vld3q_lane_f32(
-// CHECK: [[B:%.*]] = alloca %struct.float32x4x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP5:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP7:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP9:%.*]] = load <4 x float>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP10:%.*]] = bitcast <4 x float> [[TMP9]] to <16 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x float>
-// CHECK: [[VLD3Q_LANE_V:%.*]] = call { <4 x float>, <4 x float>, <4 x float>
+// CHECK-LABEL: define void @test_vld3q_lane_f32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT32X4X3_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT32X4X3_T]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT32X4X3_T]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X4X3_T]], align 16
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X4X3_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [6 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X4X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X4X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X4X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP4:%.*]] = load <4 x float>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
+// CHECK-NEXT: [[VLD3Q_LANE_V:%.*]] = call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3lane.v4f32.p0(ptr [[A]], <4 x float> [[TMP6]], <4 x float> [[TMP7]], <4 x float> [[TMP8]], i32 3, i32 4)
+// CHECK-NEXT: store { <4 x float>, <4 x float>, <4 x float> } [[VLD3Q_LANE_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 48, i1 false)
+// CHECK-NEXT: ret void
+//
float32x4x3_t test_vld3q_lane_f32(float32_t const * a, float32x4x3_t b) {
return vld3q_lane_f32(a, b, 3);
}
-// CHECK-LABEL: @test_vld3q_lane_p16(
-// CHECK: [[B:%.*]] = alloca %struct.poly16x8x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP9:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
-// CHECK: [[VLD3Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>
+// CHECK-LABEL: define void @test_vld3q_lane_p16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY16X8X3_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY16X8X3_T]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY16X8X3_T]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY16X8X3_T]], align 16
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X8X3_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [6 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X8X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X8X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X8X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x i16>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK-NEXT: [[VLD3Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16.p0(ptr [[A]], <8 x i16> [[TMP6]], <8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i32 7, i32 2)
+// CHECK-NEXT: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_LANE_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 48, i1 false)
+// CHECK-NEXT: ret void
+//
poly16x8x3_t test_vld3q_lane_p16(poly16_t const * a, poly16x8x3_t b) {
return vld3q_lane_p16(a, b, 7);
}
-// CHECK-LABEL: @test_vld3_lane_u8(
-// CHECK: [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[VLD3_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>
+// CHECK-LABEL: define void @test_vld3_lane_u8(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT8X8X3_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT8X8X3_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT8X8X3_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT8X8X3_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X3_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [3 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[VLD3_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8.p0(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], i32 7, i32 1)
+// CHECK-NEXT: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 24, i1 false)
+// CHECK-NEXT: ret void
+//
uint8x8x3_t test_vld3_lane_u8(uint8_t const * a, uint8x8x3_t b) {
return vld3_lane_u8(a, b, 7);
}
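// The 8-bit variants are the simplest case: the operands are already
// <8 x i8> values, so none of the bitcast round-trips seen in the
// wider-element tests appear in the generated IR.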
-// CHECK-LABEL: @test_vld3_lane_u16(
-// CHECK: [[B:%.*]] = alloca %struct.uint16x4x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP9:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
-// CHECK: [[VLD3_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>
+// CHECK-LABEL: define void @test_vld3_lane_u16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT16X4X3_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT16X4X3_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT16X4X3_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT16X4X3_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X4X3_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [3 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X4X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X4X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X4X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK-NEXT: [[VLD3_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16.p0(ptr [[A]], <4 x i16> [[TMP6]], <4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i32 3, i32 2)
+// CHECK-NEXT: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 24, i1 false)
+// CHECK-NEXT: ret void
+//
uint16x4x3_t test_vld3_lane_u16(uint16_t const * a, uint16x4x3_t b) {
return vld3_lane_u16(a, b, 3);
}
-// CHECK-LABEL: @test_vld3_lane_u32(
-// CHECK: [[B:%.*]] = alloca %struct.uint32x2x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP5:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP7:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP9:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
-// CHECK: [[VLD3_LANE_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>
+// CHECK-LABEL: define void @test_vld3_lane_u32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT32X2X3_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT32X2X3_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT32X2X3_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT32X2X3_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X2X3_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [3 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X2X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X2X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X2X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
+// CHECK-NEXT: [[VLD3_LANE_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3lane.v2i32.p0(ptr [[A]], <2 x i32> [[TMP6]], <2 x i32> [[TMP7]], <2 x i32> [[TMP8]], i32 1, i32 4)
+// CHECK-NEXT: store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_LANE_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 24, i1 false)
+// CHECK-NEXT: ret void
+//
uint32x2x3_t test_vld3_lane_u32(uint32_t const * a, uint32x2x3_t b) {
return vld3_lane_u32(a, b, 1);
}
-// CHECK-LABEL: @test_vld3_lane_s8(
-// CHECK: [[B:%.*]] = alloca %struct.int8x8x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[VLD3_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>
+// CHECK-LABEL: define void @test_vld3_lane_s8(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT8X8X3_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT8X8X3_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT8X8X3_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT8X8X3_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X3_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [3 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[VLD3_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8.p0(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], i32 7, i32 1)
+// CHECK-NEXT: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 24, i1 false)
+// CHECK-NEXT: ret void
+//
int8x8x3_t test_vld3_lane_s8(int8_t const * a, int8x8x3_t b) {
return vld3_lane_s8(a, b, 7);
}
-// CHECK-LABEL: @test_vld3_lane_s16(
-// CHECK: [[B:%.*]] = alloca %struct.int16x4x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP9:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
-// CHECK: [[VLD3_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>
+// CHECK-LABEL: define void @test_vld3_lane_s16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT16X4X3_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT16X4X3_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT16X4X3_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT16X4X3_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X4X3_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [3 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X4X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X4X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X4X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK-NEXT: [[VLD3_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16.p0(ptr [[A]], <4 x i16> [[TMP6]], <4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i32 3, i32 2)
+// CHECK-NEXT: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 24, i1 false)
+// CHECK-NEXT: ret void
+//
int16x4x3_t test_vld3_lane_s16(int16_t const * a, int16x4x3_t b) {
return vld3_lane_s16(a, b, 3);
}
-// CHECK-LABEL: @test_vld3_lane_s32(
-// CHECK: [[B:%.*]] = alloca %struct.int32x2x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP5:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP7:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP9:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
-// CHECK: [[VLD3_LANE_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>
+// CHECK-LABEL: define void @test_vld3_lane_s32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT32X2X3_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT32X2X3_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT32X2X3_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT32X2X3_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X2X3_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [3 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X2X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X2X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X2X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
+// CHECK-NEXT: [[VLD3_LANE_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3lane.v2i32.p0(ptr [[A]], <2 x i32> [[TMP6]], <2 x i32> [[TMP7]], <2 x i32> [[TMP8]], i32 1, i32 4)
+// CHECK-NEXT: store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_LANE_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 24, i1 false)
+// CHECK-NEXT: ret void
+//
int32x2x3_t test_vld3_lane_s32(int32_t const * a, int32x2x3_t b) {
return vld3_lane_s32(a, b, 1);
}
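// The non-q (64-bit d-register) tests follow the same lowering as the q
// forms, except the argument is coerced as [3 x i64] rather than [6 x i64],
// the allocas are 8-byte aligned, and the memcpy sizes shrink from 48 to 24
// bytes, since each of the three vectors is 64 bits wide.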
-// CHECK-LABEL: @test_vld3_lane_f16(
-// CHECK: [[B:%.*]] = alloca %struct.float16x4x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP5:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP7:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP9:%.*]] = load <4 x half>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <4 x half> [[TMP9]] to <8 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x half>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x half>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x half>
-// CHECK: [[VLD3_LANE_V:%.*]] = call { <4 x half>, <4 x half>, <4 x half>
+// CHECK-LABEL: define void @test_vld3_lane_f16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT16X4X3_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT16X4X3_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT16X4X3_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X4X3_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X4X3_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [3 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X4X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X4X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x half> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X4X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP4:%.*]] = load <4 x half>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
+// CHECK-NEXT: [[VLD3_LANE_V:%.*]] = call { <4 x half>, <4 x half>, <4 x half> } @llvm.arm.neon.vld3lane.v4f16.p0(ptr [[A]], <4 x half> [[TMP6]], <4 x half> [[TMP7]], <4 x half> [[TMP8]], i32 3, i32 2)
+// CHECK-NEXT: store { <4 x half>, <4 x half>, <4 x half> } [[VLD3_LANE_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 24, i1 false)
+// CHECK-NEXT: ret void
+//
float16x4x3_t test_vld3_lane_f16(float16_t const * a, float16x4x3_t b) {
return vld3_lane_f16(a, b, 3);
}
-// CHECK-LABEL: @test_vld3_lane_f32(
-// CHECK: [[B:%.*]] = alloca %struct.float32x2x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP5:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP7:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP9:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <2 x float> [[TMP9]] to <8 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x float>
-// CHECK: [[VLD3_LANE_V:%.*]] = call { <2 x float>, <2 x float>, <2 x float>
+// CHECK-LABEL: define void @test_vld3_lane_f32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT32X2X3_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT32X2X3_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT32X2X3_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X2X3_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X2X3_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [3 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X2X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X2X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X2X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
+// CHECK-NEXT: [[VLD3_LANE_V:%.*]] = call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3lane.v2f32.p0(ptr [[A]], <2 x float> [[TMP6]], <2 x float> [[TMP7]], <2 x float> [[TMP8]], i32 1, i32 4)
+// CHECK-NEXT: store { <2 x float>, <2 x float>, <2 x float> } [[VLD3_LANE_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 24, i1 false)
+// CHECK-NEXT: ret void
+//
float32x2x3_t test_vld3_lane_f32(float32_t const * a, float32x2x3_t b) {
return vld3_lane_f32(a, b, 1);
}
-// CHECK-LABEL: @test_vld3_lane_p8(
-// CHECK: [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[VLD3_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>
+// CHECK-LABEL: define void @test_vld3_lane_p8(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY8X8X3_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY8X8X3_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY8X8X3_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY8X8X3_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X3_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [3 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[VLD3_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8.p0(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], i32 7, i32 1)
+// CHECK-NEXT: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 24, i1 false)
+// CHECK-NEXT: ret void
+//
poly8x8x3_t test_vld3_lane_p8(poly8_t const * a, poly8x8x3_t b) {
return vld3_lane_p8(a, b, 7);
}
-// CHECK-LABEL: @test_vld3_lane_p16(
-// CHECK: [[B:%.*]] = alloca %struct.poly16x4x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP9:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
-// CHECK: [[VLD3_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>
+// CHECK-LABEL: define void @test_vld3_lane_p16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY16X4X3_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY16X4X3_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY16X4X3_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY16X4X3_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X4X3_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [3 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X4X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X4X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X4X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK-NEXT: [[VLD3_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16.p0(ptr [[A]], <4 x i16> [[TMP6]], <4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i32 3, i32 2)
+// CHECK-NEXT: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 24, i1 false)
+// CHECK-NEXT: ret void
+//
poly16x4x3_t test_vld3_lane_p16(poly16_t const * a, poly16x4x3_t b) {
return vld3_lane_p16(a, b, 3);
}
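// A minimal usage sketch, kept in a comment so it is not compiled as part of
// this test (the helper name and arguments are illustrative, not from the
// patch): vld3_lane loads three consecutive elements from the pointer, one
// into the given lane of each result vector, preserving all other lanes of
// the source operand.
//
//   static int16x4x3_t insert_lane3(const int16_t *p, int16x4x3_t v) {
//     return vld3_lane_s16(p, v, 3); // only lane 3 of each vector changes
//   }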
-// CHECK-LABEL: @test_vld4q_u8(
-// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x4_t, align 16
-// CHECK: [[VLD4Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>
+// CHECK-LABEL: define void @test_vld4q_u8(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT8X16X4_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT8X16X4_T]], align 16
+// CHECK-NEXT: [[VLD4Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4.v16i8.p0(ptr [[A]], i32 1)
+// CHECK-NEXT: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4Q_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 64, i1 false)
+// CHECK-NEXT: ret void
+//
uint8x16x4_t test_vld4q_u8(uint8_t const * a) {
return vld4q_u8(a);
}
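// The whole-structure vld4q tests (test_vld4q_u8 above and those that
// follow) are simpler still: the @llvm.arm.neon.vld4.* intrinsics take only
// the pointer and an alignment operand, with no vector inputs and no lane
// index, and the result fills every lane of all four vectors.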
-// CHECK-LABEL: @test_vld4q_u16(
-// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x4_t, align 16
-// CHECK: [[VLD4Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>
+// CHECK-LABEL: define void @test_vld4q_u16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT16X8X4_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT16X8X4_T]], align 16
+// CHECK-NEXT: [[VLD4Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4.v8i16.p0(ptr [[A]], i32 2)
+// CHECK-NEXT: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 64, i1 false)
+// CHECK-NEXT: ret void
+//
uint16x8x4_t test_vld4q_u16(uint16_t const * a) {
return vld4q_u16(a);
}
-// CHECK-LABEL: @test_vld4q_u32(
-// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x4_t, align 16
-// CHECK: [[VLD4Q_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>
+// CHECK-LABEL: define void @test_vld4q_u32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT32X4X4_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT32X4X4_T]], align 16
+// CHECK-NEXT: [[VLD4Q_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4.v4i32.p0(ptr [[A]], i32 4)
+// CHECK-NEXT: store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4Q_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 64, i1 false)
+// CHECK-NEXT: ret void
+//
uint32x4x4_t test_vld4q_u32(uint32_t const * a) {
return vld4q_u32(a);
}
-// CHECK-LABEL: @test_vld4q_s8(
-// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x4_t, align 16
-// CHECK: [[VLD4Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>
+// CHECK-LABEL: define void @test_vld4q_s8(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT8X16X4_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT8X16X4_T]], align 16
+// CHECK-NEXT: [[VLD4Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4.v16i8.p0(ptr [[A]], i32 1)
+// CHECK-NEXT: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4Q_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 64, i1 false)
+// CHECK-NEXT: ret void
+//
int8x16x4_t test_vld4q_s8(int8_t const * a) {
return vld4q_s8(a);
}
-// CHECK-LABEL: @test_vld4q_s16(
-// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x4_t, align 16
-// CHECK: [[VLD4Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>
+// CHECK-LABEL: define void @test_vld4q_s16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT16X8X4_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT16X8X4_T]], align 16
+// CHECK-NEXT: [[VLD4Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4.v8i16.p0(ptr [[A]], i32 2)
+// CHECK-NEXT: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 64, i1 false)
+// CHECK-NEXT: ret void
+//
int16x8x4_t test_vld4q_s16(int16_t const * a) {
return vld4q_s16(a);
}
-// CHECK-LABEL: @test_vld4q_s32(
-// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x4_t, align 16
-// CHECK: [[VLD4Q_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>
+// CHECK-LABEL: define void @test_vld4q_s32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT32X4X4_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT32X4X4_T]], align 16
+// CHECK-NEXT: [[VLD4Q_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4.v4i32.p0(ptr [[A]], i32 4)
+// CHECK-NEXT: store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4Q_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 64, i1 false)
+// CHECK-NEXT: ret void
+//
int32x4x4_t test_vld4q_s32(int32_t const * a) {
return vld4q_s32(a);
}
-// CHECK-LABEL: @test_vld4q_f16(
-// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x4_t, align 16
-// CHECK: [[VLD4Q_V:%.*]] = call { <8 x half>, <8 x half>, <8 x half>, <8 x half>
+// CHECK-LABEL: define void @test_vld4q_f16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT16X8X4_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT16X8X4_T]], align 16
+// CHECK-NEXT: [[VLD4Q_V:%.*]] = call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.arm.neon.vld4.v8f16.p0(ptr [[A]], i32 2)
+// CHECK-NEXT: store { <8 x half>, <8 x half>, <8 x half>, <8 x half> } [[VLD4Q_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 64, i1 false)
+// CHECK-NEXT: ret void
+//
float16x8x4_t test_vld4q_f16(float16_t const * a) {
return vld4q_f16(a);
}
-// CHECK-LABEL: @test_vld4q_f32(
-// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x4_t, align 16
-// CHECK: [[VLD4Q_V:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float>
+// CHECK-LABEL: define void @test_vld4q_f32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT32X4X4_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT32X4X4_T]], align 16
+// CHECK-NEXT: [[VLD4Q_V:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4.v4f32.p0(ptr [[A]], i32 4)
+// CHECK-NEXT: store { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4Q_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 64, i1 false)
+// CHECK-NEXT: ret void
+//
float32x4x4_t test_vld4q_f32(float32_t const * a) {
return vld4q_f32(a);
}
-// CHECK-LABEL: @test_vld4q_p8(
-// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x4_t, align 16
-// CHECK: [[VLD4Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>
+// CHECK-LABEL: define void @test_vld4q_p8(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY8X16X4_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY8X16X4_T]], align 16
+// CHECK-NEXT: [[VLD4Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4.v16i8.p0(ptr [[A]], i32 1)
+// CHECK-NEXT: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4Q_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 64, i1 false)
+// CHECK-NEXT: ret void
+//
poly8x16x4_t test_vld4q_p8(poly8_t const * a) {
return vld4q_p8(a);
}
-// CHECK-LABEL: @test_vld4q_p16(
-// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x4_t, align 16
-// CHECK: [[VLD4Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>
+// CHECK-LABEL: define void @test_vld4q_p16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY16X8X4_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY16X8X4_T]], align 16
+// CHECK-NEXT: [[VLD4Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4.v8i16.p0(ptr [[A]], i32 2)
+// CHECK-NEXT: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 64, i1 false)
+// CHECK-NEXT: ret void
+//
poly16x8x4_t test_vld4q_p16(poly16_t const * a) {
return vld4q_p16(a);
}
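The q-suffixed forms above consume 64 bytes (four 128-bit vectors), while the non-q forms below consume 32 (four 64-bit vectors); the memcpy sizes in the generated IR (i32 64 versus i32 32) reflect the same split. A small sketch of the two widths side by side (illustrative names):

#include <arm_neon.h>

// Reads 96 consecutive bytes: 64 for the q form, 32 for the d form.
void load_both(const uint8_t *src, uint8x16x4_t *q, uint8x8x4_t *d) {
  *q = vld4q_u8(src);        // reads src[0..63]
  *d = vld4_u8(src + 64);    // reads src[64..95]
}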
-// CHECK-LABEL: @test_vld4_u8(
-// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x4_t, align 8
-// CHECK: [[VLD4_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>
+// CHECK-LABEL: define void @test_vld4_u8(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT8X8X4_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT8X8X4_T]], align 8
+// CHECK-NEXT: [[VLD4_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4.v8i8.p0(ptr [[A]], i32 1)
+// CHECK-NEXT: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 32, i1 false)
+// CHECK-NEXT: ret void
+//
uint8x8x4_t test_vld4_u8(uint8_t const * a) {
return vld4_u8(a);
}
-// CHECK-LABEL: @test_vld4_u16(
-// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x4_t, align 8
-// CHECK: [[VLD4_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>
+// CHECK-LABEL: define void @test_vld4_u16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT16X4X4_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT16X4X4_T]], align 8
+// CHECK-NEXT: [[VLD4_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4.v4i16.p0(ptr [[A]], i32 2)
+// CHECK-NEXT: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 32, i1 false)
+// CHECK-NEXT: ret void
+//
uint16x4x4_t test_vld4_u16(uint16_t const * a) {
return vld4_u16(a);
}
-// CHECK-LABEL: @test_vld4_u32(
-// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x4_t, align 8
-// CHECK: [[VLD4_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>
+// CHECK-LABEL: define void @test_vld4_u32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT32X2X4_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT32X2X4_T]], align 8
+// CHECK-NEXT: [[VLD4_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4.v2i32.p0(ptr [[A]], i32 4)
+// CHECK-NEXT: store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 32, i1 false)
+// CHECK-NEXT: ret void
+//
uint32x2x4_t test_vld4_u32(uint32_t const * a) {
return vld4_u32(a);
}
-// CHECK-LABEL: @test_vld4_u64(
-// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x4_t, align 8
-// CHECK: [[VLD4_V:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>
+// CHECK-LABEL: define void @test_vld4_u64(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT64X1X4_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT64X1X4_T]], align 8
+// CHECK-NEXT: [[VLD4_V:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4.v1i64.p0(ptr [[A]], i32 4)
+// CHECK-NEXT: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 32, i1 false)
+// CHECK-NEXT: ret void
+//
uint64x1x4_t test_vld4_u64(uint64_t const * a) {
return vld4_u64(a);
}
-// CHECK-LABEL: @test_vld4_s8(
-// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x4_t, align 8
-// CHECK: [[VLD4_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>
+// CHECK-LABEL: define void @test_vld4_s8(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT8X8X4_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT8X8X4_T]], align 8
+// CHECK-NEXT: [[VLD4_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4.v8i8.p0(ptr [[A]], i32 1)
+// CHECK-NEXT: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 32, i1 false)
+// CHECK-NEXT: ret void
+//
int8x8x4_t test_vld4_s8(int8_t const * a) {
return vld4_s8(a);
}
-// CHECK-LABEL: @test_vld4_s16(
-// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x4_t, align 8
-// CHECK: [[VLD4_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>
+// CHECK-LABEL: define void @test_vld4_s16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT16X4X4_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT16X4X4_T]], align 8
+// CHECK-NEXT: [[VLD4_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4.v4i16.p0(ptr [[A]], i32 2)
+// CHECK-NEXT: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 32, i1 false)
+// CHECK-NEXT: ret void
+//
int16x4x4_t test_vld4_s16(int16_t const * a) {
return vld4_s16(a);
}
-// CHECK-LABEL: @test_vld4_s32(
-// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x4_t, align 8
-// CHECK: [[VLD4_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>
+// CHECK-LABEL: define void @test_vld4_s32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT32X2X4_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT32X2X4_T]], align 8
+// CHECK-NEXT: [[VLD4_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4.v2i32.p0(ptr [[A]], i32 4)
+// CHECK-NEXT: store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 32, i1 false)
+// CHECK-NEXT: ret void
+//
int32x2x4_t test_vld4_s32(int32_t const * a) {
return vld4_s32(a);
}
-// CHECK-LABEL: @test_vld4_s64(
-// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x4_t, align 8
-// CHECK: [[VLD4_V:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>
+// CHECK-LABEL: define void @test_vld4_s64(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT64X1X4_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT64X1X4_T]], align 8
+// CHECK-NEXT: [[VLD4_V:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4.v1i64.p0(ptr [[A]], i32 4)
+// CHECK-NEXT: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 32, i1 false)
+// CHECK-NEXT: ret void
+//
int64x1x4_t test_vld4_s64(int64_t const * a) {
return vld4_s64(a);
}
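In the calls above, the trailing i32 operand of @llvm.arm.neon.vld4.* is the alignment hint in bytes: i32 1 for byte elements, i32 2 for halfword, i32 4 for word and doubleword, exactly as the CHECK lines show. With one element per vector, the 64-bit forms do not really interleave; a hedged sketch:

#include <arm_neon.h>

// With <1 x i64> vectors there is nothing to de-interleave:
// vld4_s64 loads four consecutive doublewords, one per vector.
int64x1x4_t load4_doublewords(const int64_t *p) {
  return vld4_s64(p);   // semantically four vld1_s64 loads of p[0]..p[3]
}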
-// CHECK-LABEL: @test_vld4_f16(
-// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x4_t, align 8
-// CHECK: [[VLD4_V:%.*]] = call { <4 x half>, <4 x half>, <4 x half>, <4 x half>
+// CHECK-LABEL: define void @test_vld4_f16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT16X4X4_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT16X4X4_T]], align 8
+// CHECK-NEXT: [[VLD4_V:%.*]] = call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.arm.neon.vld4.v4f16.p0(ptr [[A]], i32 2)
+// CHECK-NEXT: store { <4 x half>, <4 x half>, <4 x half>, <4 x half> } [[VLD4_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 32, i1 false)
+// CHECK-NEXT: ret void
+//
float16x4x4_t test_vld4_f16(float16_t const * a) {
return vld4_f16(a);
}
-// CHECK-LABEL: @test_vld4_f32(
-// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x4_t, align 8
-// CHECK: [[VLD4_V:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float>
+// CHECK-LABEL: define void @test_vld4_f32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT32X2X4_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT32X2X4_T]], align 8
+// CHECK-NEXT: [[VLD4_V:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4.v2f32.p0(ptr [[A]], i32 4)
+// CHECK-NEXT: store { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 32, i1 false)
+// CHECK-NEXT: ret void
+//
float32x2x4_t test_vld4_f32(float32_t const * a) {
return vld4_f32(a);
}
-// CHECK-LABEL: @test_vld4_p8(
-// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x4_t, align 8
-// CHECK: [[VLD4_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>
+// CHECK-LABEL: define void @test_vld4_p8(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY8X8X4_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY8X8X4_T]], align 8
+// CHECK-NEXT: [[VLD4_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4.v8i8.p0(ptr [[A]], i32 1)
+// CHECK-NEXT: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 32, i1 false)
+// CHECK-NEXT: ret void
+//
poly8x8x4_t test_vld4_p8(poly8_t const * a) {
return vld4_p8(a);
}
-// CHECK-LABEL: @test_vld4_p16(
-// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x4_t, align 8
-// CHECK: [[VLD4_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>
+// CHECK-LABEL: define void @test_vld4_p16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY16X4X4_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY16X4X4_T]], align 8
+// CHECK-NEXT: [[VLD4_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4.v4i16.p0(ptr [[A]], i32 2)
+// CHECK-NEXT: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 32, i1 false)
+// CHECK-NEXT: ret void
+//
poly16x4x4_t test_vld4_p16(poly16_t const * a) {
return vld4_p16(a);
}
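The lane-indexed variants tested below load four consecutive elements from memory into one chosen lane of each of the four vectors in the struct argument. The lane must be an integer constant expression in range for the vector length; a hedged sketch of the constraint:

#include <arm_neon.h>

poly16x4x4_t update_lane2(const poly16_t *p, poly16x4x4_t b) {
  return vld4_lane_p16(p, b, 2);  // OK: constant lane, 0..3 for 4 lanes
  /* vld4_lane_p16(p, b, n) for a variable n would not compile:
     the lane argument must be a compile-time constant. */
}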
-// CHECK-LABEL: @test_vld4q_lane_u16(
-// CHECK: [[B:%.*]] = alloca %struct.uint16x8x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP9:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP11:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i16> [[TMP11]] to <16 x i8>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
-// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
-// CHECK: [[TMP16:%.*]] = bitcast <16 x i8> [[TMP12]] to <8 x i16>
-// CHECK: [[VLD4Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>
+// CHECK-LABEL: define void @test_vld4q_lane_u16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT16X8X4_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT16X8X4_T]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT16X8X4_T]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT16X8X4_T]], align 16
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X8X4_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [8 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i32 0, i32 3
+// CHECK-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x i16>
+// CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK-NEXT: [[VLD4Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4lane.v8i16.p0(ptr [[A]], <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i32 7, i32 2)
+// CHECK-NEXT: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_LANE_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 64, i1 false)
+// CHECK-NEXT: ret void
+//
uint16x8x4_t test_vld4q_lane_u16(uint16_t const * a, uint16x8x4_t b) {
return vld4q_lane_u16(a, b, 7);
}
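The CHECK-SAME line makes the AAPCS lowering explicit: the 64-byte aggregate argument arrives coerced to [8 x i64] ([[B_COERCE]]) and the result is returned indirectly through the sret pointer, so the IR function returns void; the alloca/memcpy pairs in the body are clang spilling and reloading that struct. A hypothetical C spelling of the lowered signature (illustrative names only, clang does not emit C):

#include <arm_neon.h>
#include <stdint.h>

/* Illustrative only: how the lowered IR signature would read in C. */
void vld4q_lane_u16_lowered(uint16x8x4_t *sret_out,      /* sret result */
                            const uint16_t *a,           /* ptr noundef */
                            const uint64_t b_coerce[8]); /* [8 x i64]   */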
-// CHECK-LABEL: @test_vld4q_lane_u32(
-// CHECK: [[B:%.*]] = alloca %struct.uint32x4x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP7:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP9:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP11:%.*]] = load <4 x i32>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP12:%.*]] = bitcast <4 x i32> [[TMP11]] to <16 x i8>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
-// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
-// CHECK: [[TMP16:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
-// CHECK: [[VLD4Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>
+// CHECK-LABEL: define void @test_vld4q_lane_u32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT32X4X4_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT32X4X4_T]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT32X4X4_T]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT32X4X4_T]], align 16
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X4X4_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [8 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
+// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL5]], i32 0, i32 3
+// CHECK-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x i32>
+// CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
+// CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
+// CHECK-NEXT: [[VLD4Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32.p0(ptr [[A]], <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], i32 3, i32 4)
+// CHECK-NEXT: store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4Q_LANE_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 64, i1 false)
+// CHECK-NEXT: ret void
+//
uint32x4x4_t test_vld4q_lane_u32(uint32_t const * a, uint32x4x4_t b) {
return vld4q_lane_u32(a, b, 3);
}
-// CHECK-LABEL: @test_vld4q_lane_s16(
-// CHECK: [[B:%.*]] = alloca %struct.int16x8x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP9:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP11:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i16> [[TMP11]] to <16 x i8>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
-// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
-// CHECK: [[TMP16:%.*]] = bitcast <16 x i8> [[TMP12]] to <8 x i16>
-// CHECK: [[VLD4Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>
+// CHECK-LABEL: define void @test_vld4q_lane_s16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT16X8X4_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT16X8X4_T]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT16X8X4_T]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT16X8X4_T]], align 16
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X8X4_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [8 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i32 0, i32 3
+// CHECK-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x i16>
+// CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK-NEXT: [[VLD4Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4lane.v8i16.p0(ptr [[A]], <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i32 7, i32 2)
+// CHECK-NEXT: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_LANE_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 64, i1 false)
+// CHECK-NEXT: ret void
+//
int16x8x4_t test_vld4q_lane_s16(int16_t const * a, int16x8x4_t b) {
return vld4q_lane_s16(a, b, 7);
}
-// CHECK-LABEL: @test_vld4q_lane_s32(
-// CHECK: [[B:%.*]] = alloca %struct.int32x4x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP7:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP9:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP11:%.*]] = load <4 x i32>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP12:%.*]] = bitcast <4 x i32> [[TMP11]] to <16 x i8>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
-// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
-// CHECK: [[TMP16:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
-// CHECK: [[VLD4Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>
+// CHECK-LABEL: define void @test_vld4q_lane_s32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT32X4X4_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT32X4X4_T]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT32X4X4_T]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT32X4X4_T]], align 16
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X4X4_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [8 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
+// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL5]], i32 0, i32 3
+// CHECK-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x i32>
+// CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
+// CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
+// CHECK-NEXT: [[VLD4Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32.p0(ptr [[A]], <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], i32 3, i32 4)
+// CHECK-NEXT: store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4Q_LANE_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 64, i1 false)
+// CHECK-NEXT: ret void
+//
int32x4x4_t test_vld4q_lane_s32(int32_t const * a, int32x4x4_t b) {
return vld4q_lane_s32(a, b, 3);
}
-// CHECK-LABEL: @test_vld4q_lane_f16(
-// CHECK: [[B:%.*]] = alloca %struct.float16x8x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP5:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP7:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP9:%.*]] = load <8 x half>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP10:%.*]] = bitcast <8 x half> [[TMP9]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP11:%.*]] = load <8 x half>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP12:%.*]] = bitcast <8 x half> [[TMP11]] to <16 x i8>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x half>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x half>
-// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x half>
-// CHECK: [[TMP16:%.*]] = bitcast <16 x i8> [[TMP12]] to <8 x half>
-// CHECK: [[VLD4Q_LANE_V:%.*]] = call { <8 x half>, <8 x half>, <8 x half>, <8 x half>
+// CHECK-LABEL: define void @test_vld4q_lane_f16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT16X8X4_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT16X8X4_T]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT16X8X4_T]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X8X4_T]], align 16
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X8X4_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [8 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x half> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP4:%.*]] = load <8 x half>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
+// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL5]], i32 0, i32 3
+// CHECK-NEXT: [[TMP6:%.*]] = load <8 x half>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half>
+// CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
+// CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x half>
+// CHECK-NEXT: [[VLD4Q_LANE_V:%.*]] = call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.arm.neon.vld4lane.v8f16.p0(ptr [[A]], <8 x half> [[TMP8]], <8 x half> [[TMP9]], <8 x half> [[TMP10]], <8 x half> [[TMP11]], i32 7, i32 2)
+// CHECK-NEXT: store { <8 x half>, <8 x half>, <8 x half>, <8 x half> } [[VLD4Q_LANE_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 64, i1 false)
+// CHECK-NEXT: ret void
+//
float16x8x4_t test_vld4q_lane_f16(float16_t const * a, float16x8x4_t b) {
return vld4q_lane_f16(a, b, 7);
}
-// CHECK-LABEL: @test_vld4q_lane_f32(
-// CHECK: [[B:%.*]] = alloca %struct.float32x4x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP5:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP7:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP9:%.*]] = load <4 x float>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP10:%.*]] = bitcast <4 x float> [[TMP9]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP11:%.*]] = load <4 x float>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP12:%.*]] = bitcast <4 x float> [[TMP11]] to <16 x i8>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
-// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x float>
-// CHECK: [[TMP16:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x float>
-// CHECK: [[VLD4Q_LANE_V:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float>
+// CHECK-LABEL: define void @test_vld4q_lane_f32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT32X4X4_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT32X4X4_T]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT32X4X4_T]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X4X4_T]], align 16
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X4X4_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [8 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP4:%.*]] = load <4 x float>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
+// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL5]], i32 0, i32 3
+// CHECK-NEXT: [[TMP6:%.*]] = load <4 x float>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float>
+// CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
+// CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
+// CHECK-NEXT: [[VLD4Q_LANE_V:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4lane.v4f32.p0(ptr [[A]], <4 x float> [[TMP8]], <4 x float> [[TMP9]], <4 x float> [[TMP10]], <4 x float> [[TMP11]], i32 3, i32 4)
+// CHECK-NEXT: store { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4Q_LANE_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 64, i1 false)
+// CHECK-NEXT: ret void
+//
float32x4x4_t test_vld4q_lane_f32(float32_t const * a, float32x4x4_t b) {
return vld4q_lane_f32(a, b, 3);
}
-// CHECK-LABEL: @test_vld4q_lane_p16(
-// CHECK: [[B:%.*]] = alloca %struct.poly16x8x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP9:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP11:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i16> [[TMP11]] to <16 x i8>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
-// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
-// CHECK: [[TMP16:%.*]] = bitcast <16 x i8> [[TMP12]] to <8 x i16>
-// CHECK: [[VLD4Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>
+// CHECK-LABEL: define void @test_vld4q_lane_p16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY16X8X4_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY16X8X4_T]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY16X8X4_T]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY16X8X4_T]], align 16
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X8X4_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [8 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i32 0, i32 3
+// CHECK-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x i16>
+// CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK-NEXT: [[VLD4Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4lane.v8i16.p0(ptr [[A]], <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i32 7, i32 2)
+// CHECK-NEXT: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_LANE_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 64, i1 false)
+// CHECK-NEXT: ret void
+//
poly16x8x4_t test_vld4q_lane_p16(poly16_t const * a, poly16x8x4_t b) {
return vld4q_lane_p16(a, b, 7);
}
-// CHECK-LABEL: @test_vld4_lane_u8(
-// CHECK: [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP7:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[VLD4_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>
+// CHECK-LABEL: define void @test_vld4_lane_u8(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT8X8X4_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT8X8X4_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT8X8X4_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT8X8X4_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X4_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i32 0, i32 3
+// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: [[VLD4_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8.p0(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i32 7, i32 1)
+// CHECK-NEXT: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 32, i1 false)
+// CHECK-NEXT: ret void
+//
uint8x8x4_t test_vld4_lane_u8(uint8_t const * a, uint8x8x4_t b) {
return vld4_lane_u8(a, b, 7);
}
-// CHECK-LABEL: @test_vld4_lane_u16(
-// CHECK: [[B:%.*]] = alloca %struct.uint16x4x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP9:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP11:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP11]] to <8 x i8>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
-// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
-// CHECK: [[TMP16:%.*]] = bitcast <8 x i8> [[TMP12]] to <4 x i16>
-// CHECK: [[VLD4_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>
+// CHECK-LABEL: define void @test_vld4_lane_u16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT16X4X4_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT16X4X4_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT16X4X4_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT16X4X4_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X4X4_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i32 0, i32 3
+// CHECK-NEXT: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK-NEXT: [[VLD4_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16.p0(ptr [[A]], <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i32 3, i32 2)
+// CHECK-NEXT: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 32, i1 false)
+// CHECK-NEXT: ret void
+//
uint16x4x4_t test_vld4_lane_u16(uint16_t const * a, uint16x4x4_t b) {
return vld4_lane_u16(a, b, 3);
}
-// CHECK-LABEL: @test_vld4_lane_u32(
-// CHECK: [[B:%.*]] = alloca %struct.uint32x2x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP5:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP7:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP9:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP11:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP11]] to <8 x i8>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
-// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
-// CHECK: [[TMP16:%.*]] = bitcast <8 x i8> [[TMP12]] to <2 x i32>
-// CHECK: [[VLD4_LANE_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>
+// CHECK-LABEL: define void @test_vld4_lane_u32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT32X2X4_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT32X2X4_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT32X2X4_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT32X2X4_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X2X4_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
+// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL5]], i32 0, i32 3
+// CHECK-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
+// CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
+// CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
+// CHECK-NEXT: [[VLD4_LANE_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32.p0(ptr [[A]], <2 x i32> [[TMP8]], <2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], i32 1, i32 4)
+// CHECK-NEXT: store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 32, i1 false)
+// CHECK-NEXT: ret void
+//
uint32x2x4_t test_vld4_lane_u32(uint32_t const * a, uint32x2x4_t b) {
return vld4_lane_u32(a, b, 1);
}
-// CHECK-LABEL: @test_vld4_lane_s8(
-// CHECK: [[B:%.*]] = alloca %struct.int8x8x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP7:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[VLD4_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>
+// CHECK-LABEL: define void @test_vld4_lane_s8(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT8X8X4_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT8X8X4_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT8X8X4_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT8X8X4_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X4_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i32 0, i32 3
+// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: [[VLD4_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8.p0(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i32 7, i32 1)
+// CHECK-NEXT: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 32, i1 false)
+// CHECK-NEXT: ret void
+//
int8x8x4_t test_vld4_lane_s8(int8_t const * a, int8x8x4_t b) {
return vld4_lane_s8(a, b, 7);
}
-// CHECK-LABEL: @test_vld4_lane_s16(
-// CHECK: [[B:%.*]] = alloca %struct.int16x4x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP9:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP11:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP11]] to <8 x i8>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
-// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
-// CHECK: [[TMP16:%.*]] = bitcast <8 x i8> [[TMP12]] to <4 x i16>
-// CHECK: [[VLD4_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>
+// CHECK-LABEL: define void @test_vld4_lane_s16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT16X4X4_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT16X4X4_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT16X4X4_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT16X4X4_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X4X4_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i32 0, i32 3
+// CHECK-NEXT: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK-NEXT: [[VLD4_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16.p0(ptr [[A]], <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i32 3, i32 2)
+// CHECK-NEXT: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 32, i1 false)
+// CHECK-NEXT: ret void
+//
int16x4x4_t test_vld4_lane_s16(int16_t const * a, int16x4x4_t b) {
return vld4_lane_s16(a, b, 3);
}
-// CHECK-LABEL: @test_vld4_lane_s32(
-// CHECK: [[B:%.*]] = alloca %struct.int32x2x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP5:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP7:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP9:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP11:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP11]] to <8 x i8>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
-// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
-// CHECK: [[TMP16:%.*]] = bitcast <8 x i8> [[TMP12]] to <2 x i32>
-// CHECK: [[VLD4_LANE_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>
+// CHECK-LABEL: define void @test_vld4_lane_s32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT32X2X4_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT32X2X4_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT32X2X4_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT32X2X4_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X2X4_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
+// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL5]], i32 0, i32 3
+// CHECK-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
+// CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
+// CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
+// CHECK-NEXT: [[VLD4_LANE_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32.p0(ptr [[A]], <2 x i32> [[TMP8]], <2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], i32 1, i32 4)
+// CHECK-NEXT: store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 32, i1 false)
+// CHECK-NEXT: ret void
+//
int32x2x4_t test_vld4_lane_s32(int32_t const * a, int32x2x4_t b) {
return vld4_lane_s32(a, b, 1);
}
-// CHECK-LABEL: @test_vld4_lane_f16(
-// CHECK: [[B:%.*]] = alloca %struct.float16x4x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP5:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP7:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP9:%.*]] = load <4 x half>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <4 x half> [[TMP9]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP11:%.*]] = load <4 x half>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP12:%.*]] = bitcast <4 x half> [[TMP11]] to <8 x i8>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x half>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x half>
-// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x half>
-// CHECK: [[TMP16:%.*]] = bitcast <8 x i8> [[TMP12]] to <4 x half>
-// CHECK: [[VLD4_LANE_V:%.*]] = call { <4 x half>, <4 x half>, <4 x half>, <4 x half>
+// CHECK-LABEL: define void @test_vld4_lane_f16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT16X4X4_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT16X4X4_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT16X4X4_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X4X4_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X4X4_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x half> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP4:%.*]] = load <4 x half>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
+// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL5]], i32 0, i32 3
+// CHECK-NEXT: [[TMP6:%.*]] = load <4 x half>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half>
+// CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
+// CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half>
+// CHECK-NEXT: [[VLD4_LANE_V:%.*]] = call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.arm.neon.vld4lane.v4f16.p0(ptr [[A]], <4 x half> [[TMP8]], <4 x half> [[TMP9]], <4 x half> [[TMP10]], <4 x half> [[TMP11]], i32 3, i32 2)
+// CHECK-NEXT: store { <4 x half>, <4 x half>, <4 x half>, <4 x half> } [[VLD4_LANE_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 32, i1 false)
+// CHECK-NEXT: ret void
+//
float16x4x4_t test_vld4_lane_f16(float16_t const * a, float16x4x4_t b) {
return vld4_lane_f16(a, b, 3);
}
-// CHECK-LABEL: @test_vld4_lane_f32(
-// CHECK: [[B:%.*]] = alloca %struct.float32x2x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP5:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP7:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP9:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <2 x float> [[TMP9]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP11:%.*]] = load <2 x float>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP12:%.*]] = bitcast <2 x float> [[TMP11]] to <8 x i8>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float>
-// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x float>
-// CHECK: [[TMP16:%.*]] = bitcast <8 x i8> [[TMP12]] to <2 x float>
-// CHECK: [[VLD4_LANE_V:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float>
+// CHECK-LABEL: define void @test_vld4_lane_f32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT32X2X4_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT32X2X4_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT32X2X4_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X2X4_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X2X4_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
+// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL5]], i32 0, i32 3
+// CHECK-NEXT: [[TMP6:%.*]] = load <2 x float>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float>
+// CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
+// CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
+// CHECK-NEXT: [[VLD4_LANE_V:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4lane.v2f32.p0(ptr [[A]], <2 x float> [[TMP8]], <2 x float> [[TMP9]], <2 x float> [[TMP10]], <2 x float> [[TMP11]], i32 1, i32 4)
+// CHECK-NEXT: store { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4_LANE_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 32, i1 false)
+// CHECK-NEXT: ret void
+//
float32x2x4_t test_vld4_lane_f32(float32_t const * a, float32x2x4_t b) {
return vld4_lane_f32(a, b, 1);
}
-// CHECK-LABEL: @test_vld4_lane_p8(
-// CHECK: [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP7:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[VLD4_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>
+// CHECK-LABEL: define void @test_vld4_lane_p8(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY8X8X4_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY8X8X4_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY8X8X4_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY8X8X4_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X4_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i32 0, i32 3
+// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: [[VLD4_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8.p0(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i32 7, i32 1)
+// CHECK-NEXT: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 32, i1 false)
+// CHECK-NEXT: ret void
+//
poly8x8x4_t test_vld4_lane_p8(poly8_t const * a, poly8x8x4_t b) {
return vld4_lane_p8(a, b, 7);
}
-// CHECK-LABEL: @test_vld4_lane_p16(
-// CHECK: [[B:%.*]] = alloca %struct.poly16x4x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP9:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP11:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP11]] to <8 x i8>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
-// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
-// CHECK: [[TMP16:%.*]] = bitcast <8 x i8> [[TMP12]] to <4 x i16>
-// CHECK: [[VLD4_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>
+// CHECK-LABEL: define void @test_vld4_lane_p16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY16X4X4_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY16X4X4_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY16X4X4_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY16X4X4_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X4X4_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i32 0, i32 3
+// CHECK-NEXT: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK-NEXT: [[VLD4_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16.p0(ptr [[A]], <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i32 3, i32 2)
+// CHECK-NEXT: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE_V]], ptr [[__RET]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 32, i1 false)
+// CHECK-NEXT: ret void
+//
poly16x4x4_t test_vld4_lane_p16(poly16_t const * a, poly16x4x4_t b) {
return vld4_lane_p16(a, b, 3);
}
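
For context on the vld4_lane family exercised above: each intrinsic loads four consecutive elements from memory, de-interleaving one element into the requested lane of each of the four result vectors while leaving every other lane untouched. A minimal standalone sketch of the s16 variant (illustrative only, not part of the test file; assumes an ARM target compiled with NEON support):

#include <arm_neon.h>

/* Overwrite lane 3 of each of the four vectors in `acc` with
   src[0], src[1], src[2] and src[3] respectively; lanes 0-2 of
   every vector are preserved. */
static int16x4x4_t load_into_lane3(const int16_t *src, int16x4x4_t acc) {
  return vld4_lane_s16(src, acc, 3);
}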
-// CHECK-LABEL: @test_vmax_s8(
-// CHECK: [[VMAX_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vmaxs.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VMAX_V_I]]
+// CHECK-LABEL: define <8 x i8> @test_vmax_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMAX_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vmaxs.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VMAX_V_I]]
+//
int8x8_t test_vmax_s8(int8x8_t a, int8x8_t b) {
return vmax_s8(a, b);
}
-// CHECK-LABEL: @test_vmax_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VMAX_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vmaxs.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VMAX_V3_I:%.*]] = bitcast <4 x i16> [[VMAX_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VMAX_V2_I]]
+// CHECK-LABEL: define <4 x i16> @test_vmax_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VMAX_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vmaxs.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: [[VMAX_V3_I:%.*]] = bitcast <4 x i16> [[VMAX_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <4 x i16> [[VMAX_V2_I]]
+//
int16x4_t test_vmax_s16(int16x4_t a, int16x4_t b) {
return vmax_s16(a, b);
}
-// CHECK-LABEL: @test_vmax_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VMAX_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vmaxs.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VMAX_V3_I:%.*]] = bitcast <2 x i32> [[VMAX_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VMAX_V2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vmax_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VMAX_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vmaxs.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: [[VMAX_V3_I:%.*]] = bitcast <2 x i32> [[VMAX_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <2 x i32> [[VMAX_V2_I]]
+//
int32x2_t test_vmax_s32(int32x2_t a, int32x2_t b) {
return vmax_s32(a, b);
}
-// CHECK-LABEL: @test_vmax_u8(
-// CHECK: [[VMAX_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vmaxu.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VMAX_V_I]]
+// CHECK-LABEL: define <8 x i8> @test_vmax_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMAX_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vmaxu.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VMAX_V_I]]
+//
uint8x8_t test_vmax_u8(uint8x8_t a, uint8x8_t b) {
return vmax_u8(a, b);
}
-// CHECK-LABEL: @test_vmax_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VMAX_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vmaxu.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VMAX_V3_I:%.*]] = bitcast <4 x i16> [[VMAX_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VMAX_V2_I]]
+// CHECK-LABEL: define <4 x i16> @test_vmax_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VMAX_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vmaxu.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: [[VMAX_V3_I:%.*]] = bitcast <4 x i16> [[VMAX_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <4 x i16> [[VMAX_V2_I]]
+//
uint16x4_t test_vmax_u16(uint16x4_t a, uint16x4_t b) {
return vmax_u16(a, b);
}
-// CHECK-LABEL: @test_vmax_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VMAX_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vmaxu.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VMAX_V3_I:%.*]] = bitcast <2 x i32> [[VMAX_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VMAX_V2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vmax_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VMAX_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vmaxu.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: [[VMAX_V3_I:%.*]] = bitcast <2 x i32> [[VMAX_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <2 x i32> [[VMAX_V2_I]]
+//
uint32x2_t test_vmax_u32(uint32x2_t a, uint32x2_t b) {
return vmax_u32(a, b);
}
-// CHECK-LABEL: @test_vmax_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
-// CHECK: [[VMAX_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vmaxs.v2f32(<2 x float> %a, <2 x float> %b)
-// CHECK: [[VMAX_V3_I:%.*]] = bitcast <2 x float> [[VMAX_V2_I]] to <8 x i8>
-// CHECK: ret <2 x float> [[VMAX_V2_I]]
+// CHECK-LABEL: define <2 x float> @test_vmax_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VMAX_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vmaxs.v2f32(<2 x float> [[A]], <2 x float> [[B]])
+// CHECK-NEXT: [[VMAX_V3_I:%.*]] = bitcast <2 x float> [[VMAX_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <2 x float> [[VMAX_V2_I]]
+//
float32x2_t test_vmax_f32(float32x2_t a, float32x2_t b) {
return vmax_f32(a, b);
}
-// CHECK-LABEL: @test_vmaxq_s8(
-// CHECK: [[VMAXQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vmaxs.v16i8(<16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <16 x i8> [[VMAXQ_V_I]]
+// CHECK-LABEL: define <16 x i8> @test_vmaxq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMAXQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vmaxs.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <16 x i8> [[VMAXQ_V_I]]
+//
int8x16_t test_vmaxq_s8(int8x16_t a, int8x16_t b) {
return vmaxq_s8(a, b);
}
-// CHECK-LABEL: @test_vmaxq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VMAXQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmaxs.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: [[VMAXQ_V3_I:%.*]] = bitcast <8 x i16> [[VMAXQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VMAXQ_V2_I]]
+// CHECK-LABEL: define <8 x i16> @test_vmaxq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VMAXQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmaxs.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: [[VMAXQ_V3_I:%.*]] = bitcast <8 x i16> [[VMAXQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: ret <8 x i16> [[VMAXQ_V2_I]]
+//
int16x8_t test_vmaxq_s16(int16x8_t a, int16x8_t b) {
return vmaxq_s16(a, b);
}
-// CHECK-LABEL: @test_vmaxq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VMAXQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VMAXQ_V3_I:%.*]] = bitcast <4 x i32> [[VMAXQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VMAXQ_V2_I]]
+// CHECK-LABEL: define <4 x i32> @test_vmaxq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VMAXQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: [[VMAXQ_V3_I:%.*]] = bitcast <4 x i32> [[VMAXQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: ret <4 x i32> [[VMAXQ_V2_I]]
+//
int32x4_t test_vmaxq_s32(int32x4_t a, int32x4_t b) {
return vmaxq_s32(a, b);
}
-// CHECK-LABEL: @test_vmaxq_u8(
-// CHECK: [[VMAXQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vmaxu.v16i8(<16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <16 x i8> [[VMAXQ_V_I]]
+// CHECK-LABEL: define <16 x i8> @test_vmaxq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMAXQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vmaxu.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <16 x i8> [[VMAXQ_V_I]]
+//
uint8x16_t test_vmaxq_u8(uint8x16_t a, uint8x16_t b) {
return vmaxq_u8(a, b);
}
-// CHECK-LABEL: @test_vmaxq_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VMAXQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmaxu.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: [[VMAXQ_V3_I:%.*]] = bitcast <8 x i16> [[VMAXQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VMAXQ_V2_I]]
+// CHECK-LABEL: define <8 x i16> @test_vmaxq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VMAXQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmaxu.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: [[VMAXQ_V3_I:%.*]] = bitcast <8 x i16> [[VMAXQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: ret <8 x i16> [[VMAXQ_V2_I]]
+//
uint16x8_t test_vmaxq_u16(uint16x8_t a, uint16x8_t b) {
return vmaxq_u16(a, b);
}
-// CHECK-LABEL: @test_vmaxq_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VMAXQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VMAXQ_V3_I:%.*]] = bitcast <4 x i32> [[VMAXQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VMAXQ_V2_I]]
+// CHECK-LABEL: define <4 x i32> @test_vmaxq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VMAXQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: [[VMAXQ_V3_I:%.*]] = bitcast <4 x i32> [[VMAXQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: ret <4 x i32> [[VMAXQ_V2_I]]
+//
uint32x4_t test_vmaxq_u32(uint32x4_t a, uint32x4_t b) {
return vmaxq_u32(a, b);
}
-// CHECK-LABEL: @test_vmaxq_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
-// CHECK: [[VMAXQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> %a, <4 x float> %b)
-// CHECK: [[VMAXQ_V3_I:%.*]] = bitcast <4 x float> [[VMAXQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x float> [[VMAXQ_V2_I]]
+// CHECK-LABEL: define <4 x float> @test_vmaxq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VMAXQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> [[A]], <4 x float> [[B]])
+// CHECK-NEXT: [[VMAXQ_V3_I:%.*]] = bitcast <4 x float> [[VMAXQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: ret <4 x float> [[VMAXQ_V2_I]]
+//
float32x4_t test_vmaxq_f32(float32x4_t a, float32x4_t b) {
return vmaxq_f32(a, b);
}
-// CHECK-LABEL: @test_vmin_s8(
-// CHECK: [[VMIN_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vmins.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VMIN_V_I]]
+// CHECK-LABEL: define <8 x i8> @test_vmin_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMIN_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vmins.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VMIN_V_I]]
+//
int8x8_t test_vmin_s8(int8x8_t a, int8x8_t b) {
return vmin_s8(a, b);
}
-// CHECK-LABEL: @test_vmin_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VMIN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vmins.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VMIN_V3_I:%.*]] = bitcast <4 x i16> [[VMIN_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VMIN_V2_I]]
+// CHECK-LABEL: define <4 x i16> @test_vmin_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VMIN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vmins.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: [[VMIN_V3_I:%.*]] = bitcast <4 x i16> [[VMIN_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <4 x i16> [[VMIN_V2_I]]
+//
int16x4_t test_vmin_s16(int16x4_t a, int16x4_t b) {
return vmin_s16(a, b);
}
-// CHECK-LABEL: @test_vmin_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VMIN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VMIN_V3_I:%.*]] = bitcast <2 x i32> [[VMIN_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VMIN_V2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vmin_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VMIN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: [[VMIN_V3_I:%.*]] = bitcast <2 x i32> [[VMIN_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <2 x i32> [[VMIN_V2_I]]
+//
int32x2_t test_vmin_s32(int32x2_t a, int32x2_t b) {
return vmin_s32(a, b);
}
-// CHECK-LABEL: @test_vmin_u8(
-// CHECK: [[VMIN_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vminu.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VMIN_V_I]]
+// CHECK-LABEL: define <8 x i8> @test_vmin_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMIN_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vminu.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VMIN_V_I]]
+//
uint8x8_t test_vmin_u8(uint8x8_t a, uint8x8_t b) {
return vmin_u8(a, b);
}
-// CHECK-LABEL: @test_vmin_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VMIN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vminu.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VMIN_V3_I:%.*]] = bitcast <4 x i16> [[VMIN_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VMIN_V2_I]]
+// CHECK-LABEL: define <4 x i16> @test_vmin_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VMIN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vminu.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: [[VMIN_V3_I:%.*]] = bitcast <4 x i16> [[VMIN_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <4 x i16> [[VMIN_V2_I]]
+//
uint16x4_t test_vmin_u16(uint16x4_t a, uint16x4_t b) {
return vmin_u16(a, b);
}
-// CHECK-LABEL: @test_vmin_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VMIN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VMIN_V3_I:%.*]] = bitcast <2 x i32> [[VMIN_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VMIN_V2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vmin_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VMIN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: [[VMIN_V3_I:%.*]] = bitcast <2 x i32> [[VMIN_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <2 x i32> [[VMIN_V2_I]]
+//
uint32x2_t test_vmin_u32(uint32x2_t a, uint32x2_t b) {
return vmin_u32(a, b);
}
-// CHECK-LABEL: @test_vmin_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
-// CHECK: [[VMIN_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float> %a, <2 x float> %b)
-// CHECK: [[VMIN_V3_I:%.*]] = bitcast <2 x float> [[VMIN_V2_I]] to <8 x i8>
-// CHECK: ret <2 x float> [[VMIN_V2_I]]
+// CHECK-LABEL: define <2 x float> @test_vmin_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VMIN_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float> [[A]], <2 x float> [[B]])
+// CHECK-NEXT: [[VMIN_V3_I:%.*]] = bitcast <2 x float> [[VMIN_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <2 x float> [[VMIN_V2_I]]
+//
float32x2_t test_vmin_f32(float32x2_t a, float32x2_t b) {
return vmin_f32(a, b);
}
-// CHECK-LABEL: @test_vminq_s8(
-// CHECK: [[VMINQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vmins.v16i8(<16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <16 x i8> [[VMINQ_V_I]]
+// CHECK-LABEL: define <16 x i8> @test_vminq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMINQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vmins.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <16 x i8> [[VMINQ_V_I]]
+//
int8x16_t test_vminq_s8(int8x16_t a, int8x16_t b) {
return vminq_s8(a, b);
}
-// CHECK-LABEL: @test_vminq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VMINQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmins.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: [[VMINQ_V3_I:%.*]] = bitcast <8 x i16> [[VMINQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VMINQ_V2_I]]
+// CHECK-LABEL: define <8 x i16> @test_vminq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VMINQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmins.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: [[VMINQ_V3_I:%.*]] = bitcast <8 x i16> [[VMINQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: ret <8 x i16> [[VMINQ_V2_I]]
+//
int16x8_t test_vminq_s16(int16x8_t a, int16x8_t b) {
return vminq_s16(a, b);
}
-// CHECK-LABEL: @test_vminq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VMINQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VMINQ_V3_I:%.*]] = bitcast <4 x i32> [[VMINQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VMINQ_V2_I]]
+// CHECK-LABEL: define <4 x i32> @test_vminq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VMINQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: [[VMINQ_V3_I:%.*]] = bitcast <4 x i32> [[VMINQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: ret <4 x i32> [[VMINQ_V2_I]]
+//
int32x4_t test_vminq_s32(int32x4_t a, int32x4_t b) {
return vminq_s32(a, b);
}
-// CHECK-LABEL: @test_vminq_u8(
-// CHECK: [[VMINQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vminu.v16i8(<16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <16 x i8> [[VMINQ_V_I]]
+// CHECK-LABEL: define <16 x i8> @test_vminq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMINQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vminu.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <16 x i8> [[VMINQ_V_I]]
+//
uint8x16_t test_vminq_u8(uint8x16_t a, uint8x16_t b) {
return vminq_u8(a, b);
}
-// CHECK-LABEL: @test_vminq_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VMINQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vminu.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: [[VMINQ_V3_I:%.*]] = bitcast <8 x i16> [[VMINQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VMINQ_V2_I]]
+// CHECK-LABEL: define <8 x i16> @test_vminq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VMINQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vminu.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: [[VMINQ_V3_I:%.*]] = bitcast <8 x i16> [[VMINQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: ret <8 x i16> [[VMINQ_V2_I]]
+//
uint16x8_t test_vminq_u16(uint16x8_t a, uint16x8_t b) {
return vminq_u16(a, b);
}
-// CHECK-LABEL: @test_vminq_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VMINQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VMINQ_V3_I:%.*]] = bitcast <4 x i32> [[VMINQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VMINQ_V2_I]]
+// CHECK-LABEL: define <4 x i32> @test_vminq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VMINQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: [[VMINQ_V3_I:%.*]] = bitcast <4 x i32> [[VMINQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: ret <4 x i32> [[VMINQ_V2_I]]
+//
uint32x4_t test_vminq_u32(uint32x4_t a, uint32x4_t b) {
return vminq_u32(a, b);
}
-// CHECK-LABEL: @test_vminq_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
-// CHECK: [[VMINQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %a, <4 x float> %b)
-// CHECK: [[VMINQ_V3_I:%.*]] = bitcast <4 x float> [[VMINQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x float> [[VMINQ_V2_I]]
+// CHECK-LABEL: define <4 x float> @test_vminq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VMINQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> [[A]], <4 x float> [[B]])
+// CHECK-NEXT: [[VMINQ_V3_I:%.*]] = bitcast <4 x float> [[VMINQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: ret <4 x float> [[VMINQ_V2_I]]
+//
float32x4_t test_vminq_f32(float32x4_t a, float32x4_t b) {
return vminq_f32(a, b);
}
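For reference, the per-lane semantics the vmax/vmin checks above encode can be sketched in scalar C. This is an illustrative model only (not part of the patch), and it does not model the NaN behaviour of the floating-point variants:

#include <stdint.h>

/* llvm.arm.neon.vmaxs selects the larger element lane by lane
   (vmins/vmaxu/vminu are the min and unsigned counterparts).
   Hypothetical reference helper for the signed 8x8 case. */
static void vmax_s8_ref(int8_t d[8], const int8_t a[8], const int8_t b[8]) {
  for (int i = 0; i < 8; ++i)
    d[i] = a[i] > b[i] ? a[i] : b[i];
}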
-// CHECK-LABEL: @test_vmla_s8(
-// CHECK: [[MUL_I:%.*]] = mul <8 x i8> %b, %c
-// CHECK: [[ADD_I:%.*]] = add <8 x i8> %a, [[MUL_I]]
-// CHECK: ret <8 x i8> [[ADD_I]]
+// CHECK-LABEL: define <8 x i8> @test_vmla_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i8> [[B]], [[C]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i8> [[A]], [[MUL_I]]
+// CHECK-NEXT: ret <8 x i8> [[ADD_I]]
+//
int8x8_t test_vmla_s8(int8x8_t a, int8x8_t b, int8x8_t c) {
return vmla_s8(a, b, c);
}
-// CHECK-LABEL: @test_vmla_s16(
-// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %b, %c
-// CHECK: [[ADD_I:%.*]] = add <4 x i16> %a, [[MUL_I]]
-// CHECK: ret <4 x i16> [[ADD_I]]
+// CHECK-LABEL: define <4 x i16> @test_vmla_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i16> [[B]], [[C]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i16> [[A]], [[MUL_I]]
+// CHECK-NEXT: ret <4 x i16> [[ADD_I]]
+//
int16x4_t test_vmla_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
return vmla_s16(a, b, c);
}
-// CHECK-LABEL: @test_vmla_s32(
-// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %b, %c
-// CHECK: [[ADD_I:%.*]] = add <2 x i32> %a, [[MUL_I]]
-// CHECK: ret <2 x i32> [[ADD_I]]
+// CHECK-LABEL: define <2 x i32> @test_vmla_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <2 x i32> [[B]], [[C]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i32> [[A]], [[MUL_I]]
+// CHECK-NEXT: ret <2 x i32> [[ADD_I]]
+//
int32x2_t test_vmla_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
return vmla_s32(a, b, c);
}
-// CHECK-LABEL: @test_vmla_f32(
-// CHECK: [[MUL_I:%.*]] = fmul <2 x float> %b, %c
-// CHECK: [[ADD_I:%.*]] = fadd <2 x float> %a, [[MUL_I]]
-// CHECK: ret <2 x float> [[ADD_I]]
+// CHECK-LABEL: define <2 x float> @test_vmla_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]], <2 x float> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = fmul <2 x float> [[B]], [[C]]
+// CHECK-NEXT: [[ADD_I:%.*]] = fadd <2 x float> [[A]], [[MUL_I]]
+// CHECK-NEXT: ret <2 x float> [[ADD_I]]
+//
float32x2_t test_vmla_f32(float32x2_t a, float32x2_t b, float32x2_t c) {
return vmla_f32(a, b, c);
}
-// CHECK-LABEL: @test_vmla_u8(
-// CHECK: [[MUL_I:%.*]] = mul <8 x i8> %b, %c
-// CHECK: [[ADD_I:%.*]] = add <8 x i8> %a, [[MUL_I]]
-// CHECK: ret <8 x i8> [[ADD_I]]
+// CHECK-LABEL: define <8 x i8> @test_vmla_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i8> [[B]], [[C]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i8> [[A]], [[MUL_I]]
+// CHECK-NEXT: ret <8 x i8> [[ADD_I]]
+//
uint8x8_t test_vmla_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) {
return vmla_u8(a, b, c);
}
-// CHECK-LABEL: @test_vmla_u16(
-// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %b, %c
-// CHECK: [[ADD_I:%.*]] = add <4 x i16> %a, [[MUL_I]]
-// CHECK: ret <4 x i16> [[ADD_I]]
+// CHECK-LABEL: define <4 x i16> @test_vmla_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i16> [[B]], [[C]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i16> [[A]], [[MUL_I]]
+// CHECK-NEXT: ret <4 x i16> [[ADD_I]]
+//
uint16x4_t test_vmla_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) {
return vmla_u16(a, b, c);
}
-// CHECK-LABEL: @test_vmla_u32(
-// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %b, %c
-// CHECK: [[ADD_I:%.*]] = add <2 x i32> %a, [[MUL_I]]
-// CHECK: ret <2 x i32> [[ADD_I]]
+// CHECK-LABEL: define <2 x i32> @test_vmla_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <2 x i32> [[B]], [[C]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i32> [[A]], [[MUL_I]]
+// CHECK-NEXT: ret <2 x i32> [[ADD_I]]
+//
uint32x2_t test_vmla_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) {
return vmla_u32(a, b, c);
}
-// CHECK-LABEL: @test_vmlaq_s8(
-// CHECK: [[MUL_I:%.*]] = mul <16 x i8> %b, %c
-// CHECK: [[ADD_I:%.*]] = add <16 x i8> %a, [[MUL_I]]
-// CHECK: ret <16 x i8> [[ADD_I]]
+// CHECK-LABEL: define <16 x i8> @test_vmlaq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]], <16 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <16 x i8> [[B]], [[C]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <16 x i8> [[A]], [[MUL_I]]
+// CHECK-NEXT: ret <16 x i8> [[ADD_I]]
+//
int8x16_t test_vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c) {
return vmlaq_s8(a, b, c);
}
-// CHECK-LABEL: @test_vmlaq_s16(
-// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %b, %c
-// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[MUL_I]]
-// CHECK: ret <8 x i16> [[ADD_I]]
+// CHECK-LABEL: define <8 x i16> @test_vmlaq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <8 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i16> [[B]], [[C]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[A]], [[MUL_I]]
+// CHECK-NEXT: ret <8 x i16> [[ADD_I]]
+//
int16x8_t test_vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c) {
return vmlaq_s16(a, b, c);
}
-// CHECK-LABEL: @test_vmlaq_s32(
-// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %b, %c
-// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[MUL_I]]
-// CHECK: ret <4 x i32> [[ADD_I]]
+// CHECK-LABEL: define <4 x i32> @test_vmlaq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i32> [[B]], [[C]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A]], [[MUL_I]]
+// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
+//
int32x4_t test_vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c) {
return vmlaq_s32(a, b, c);
}
-// CHECK-LABEL: @test_vmlaq_f32(
-// CHECK: [[MUL_I:%.*]] = fmul <4 x float> %b, %c
-// CHECK: [[ADD_I:%.*]] = fadd <4 x float> %a, [[MUL_I]]
-// CHECK: ret <4 x float> [[ADD_I]]
+// CHECK-LABEL: define <4 x float> @test_vmlaq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]], <4 x float> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = fmul <4 x float> [[B]], [[C]]
+// CHECK-NEXT: [[ADD_I:%.*]] = fadd <4 x float> [[A]], [[MUL_I]]
+// CHECK-NEXT: ret <4 x float> [[ADD_I]]
+//
float32x4_t test_vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c) {
return vmlaq_f32(a, b, c);
}
-// CHECK-LABEL: @test_vmlaq_u8(
-// CHECK: [[MUL_I:%.*]] = mul <16 x i8> %b, %c
-// CHECK: [[ADD_I:%.*]] = add <16 x i8> %a, [[MUL_I]]
-// CHECK: ret <16 x i8> [[ADD_I]]
+// CHECK-LABEL: define <16 x i8> @test_vmlaq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]], <16 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <16 x i8> [[B]], [[C]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <16 x i8> [[A]], [[MUL_I]]
+// CHECK-NEXT: ret <16 x i8> [[ADD_I]]
+//
uint8x16_t test_vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) {
return vmlaq_u8(a, b, c);
}
-// CHECK-LABEL: @test_vmlaq_u16(
-// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %b, %c
-// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[MUL_I]]
-// CHECK: ret <8 x i16> [[ADD_I]]
+// CHECK-LABEL: define <8 x i16> @test_vmlaq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <8 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i16> [[B]], [[C]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[A]], [[MUL_I]]
+// CHECK-NEXT: ret <8 x i16> [[ADD_I]]
+//
uint16x8_t test_vmlaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) {
return vmlaq_u16(a, b, c);
}
-// CHECK-LABEL: @test_vmlaq_u32(
-// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %b, %c
-// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[MUL_I]]
-// CHECK: ret <4 x i32> [[ADD_I]]
+// CHECK-LABEL: define <4 x i32> @test_vmlaq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i32> [[B]], [[C]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A]], [[MUL_I]]
+// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
+//
uint32x4_t test_vmlaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) {
return vmlaq_u32(a, b, c);
}
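The vmla/vmlaq tests above check that the multiply-accumulate intrinsics lower to plain IR `mul`/`add` (`fmul`/`fadd` for the float variants) rather than to a target intrinsic call. A minimal scalar sketch of that per-lane computation (hypothetical helper, not part of the patch):

#include <stdint.h>

static void vmla_s16_ref(int16_t d[4], const int16_t a[4],
                         const int16_t b[4], const int16_t c[4]) {
  for (int i = 0; i < 4; ++i)
    d[i] = (int16_t)(a[i] + b[i] * c[i]); /* per-lane a + b*c, wrapping */
}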
-// CHECK-LABEL: @test_vmlal_s8(
-// CHECK: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %b, <8 x i8> %c)
-// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VMULL_I_I]]
-// CHECK: ret <8 x i16> [[ADD_I]]
+// CHECK-LABEL: define <8 x i16> @test_vmlal_s8(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMULL_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> [[B]], <8 x i8> [[C]])
+// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[A]], [[VMULL_I]]
+// CHECK-NEXT: ret <8 x i16> [[ADD_I]]
+//
int16x8_t test_vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c) {
return vmlal_s8(a, b, c);
}
-// CHECK-LABEL: @test_vmlal_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
-// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %c)
-// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]]
-// CHECK: ret <4 x i32> [[ADD_I]]
+// CHECK-LABEL: define <4 x i32> @test_vmlal_s16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[B]], <4 x i16> [[C]])
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A]], [[VMULL2_I]]
+// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
+//
int32x4_t test_vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
return vmlal_s16(a, b, c);
}
-// CHECK-LABEL: @test_vmlal_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
-// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %c)
-// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]]
-// CHECK: ret <2 x i64> [[ADD_I]]
+// CHECK-LABEL: define <2 x i64> @test_vmlal_s32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[B]], <2 x i32> [[C]])
+// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A]], [[VMULL2_I]]
+// CHECK-NEXT: ret <2 x i64> [[ADD_I]]
+//
int64x2_t test_vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
return vmlal_s32(a, b, c);
}
-// CHECK-LABEL: @test_vmlal_u8(
-// CHECK: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %b, <8 x i8> %c)
-// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VMULL_I_I]]
-// CHECK: ret <8 x i16> [[ADD_I]]
+// CHECK-LABEL: define <8 x i16> @test_vmlal_u8(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMULL_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> [[B]], <8 x i8> [[C]])
+// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[A]], [[VMULL_I]]
+// CHECK-NEXT: ret <8 x i16> [[ADD_I]]
+//
uint16x8_t test_vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) {
return vmlal_u8(a, b, c);
}
-// CHECK-LABEL: @test_vmlal_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
-// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %c)
-// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]]
-// CHECK: ret <4 x i32> [[ADD_I]]
+// CHECK-LABEL: define <4 x i32> @test_vmlal_u16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[B]], <4 x i16> [[C]])
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A]], [[VMULL2_I]]
+// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
+//
uint32x4_t test_vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) {
return vmlal_u16(a, b, c);
}
-// CHECK-LABEL: @test_vmlal_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
-// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %c)
-// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]]
-// CHECK: ret <2 x i64> [[ADD_I]]
+// CHECK-LABEL: define <2 x i64> @test_vmlal_u32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[B]], <2 x i32> [[C]])
+// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A]], [[VMULL2_I]]
+// CHECK-NEXT: ret <2 x i64> [[ADD_I]]
+//
uint64x2_t test_vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) {
return vmlal_u32(a, b, c);
}
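The vmlal variants widen before accumulating: the IR multiplies the two narrow inputs with llvm.arm.neon.vmulls/vmullu and adds the double-width product to the accumulator. A scalar model of the signed 16-to-32-bit case (illustrative helper only):

#include <stdint.h>

static void vmlal_s16_ref(int32_t d[4], const int32_t a[4],
                          const int16_t b[4], const int16_t c[4]) {
  for (int i = 0; i < 4; ++i)
    d[i] = a[i] + (int32_t)b[i] * (int32_t)c[i]; /* widen, multiply, accumulate */
}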
-// CHECK-LABEL: @test_vmlal_lane_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
-// CHECK: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
-// CHECK: ret <4 x i32> [[ADD]]
+// CHECK-LABEL: define <4 x i32> @test_vmlal_lane_s16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A]], [[VMULL2_I]]
+// CHECK-NEXT: ret <4 x i32> [[ADD]]
+//
int32x4_t test_vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
return vmlal_lane_s16(a, b, c, 3);
}
-// CHECK-LABEL: @test_vmlal_lane_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
-// CHECK: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
-// CHECK: ret <2 x i64> [[ADD]]
+// CHECK-LABEL: define <2 x i64> @test_vmlal_lane_s32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A]], [[VMULL2_I]]
+// CHECK-NEXT: ret <2 x i64> [[ADD]]
+//
int64x2_t test_vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
return vmlal_lane_s32(a, b, c, 1);
}
-// CHECK-LABEL: @test_vmlal_lane_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
-// CHECK: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
-// CHECK: ret <4 x i32> [[ADD]]
+// CHECK-LABEL: define <4 x i32> @test_vmlal_lane_u16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A]], [[VMULL2_I]]
+// CHECK-NEXT: ret <4 x i32> [[ADD]]
+//
uint32x4_t test_vmlal_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) {
return vmlal_lane_u16(a, b, c, 3);
}
-// CHECK-LABEL: @test_vmlal_lane_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
-// CHECK: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
-// CHECK: ret <2 x i64> [[ADD]]
+// CHECK-LABEL: define <2 x i64> @test_vmlal_lane_u32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A]], [[VMULL2_I]]
+// CHECK-NEXT: ret <2 x i64> [[ADD]]
+//
uint64x2_t test_vmlal_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) {
return vmlal_lane_u32(a, b, c, 1);
}
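In the _lane tests, the shufflevector with a constant mask (e.g. <i32 3, i32 3, i32 3, i32 3>) broadcasts a single lane of `c` before the widening multiply. A scalar equivalent, assuming a hypothetical reference helper (not part of the patch):

#include <stdint.h>

static void vmlal_lane_s16_ref(int32_t d[4], const int32_t a[4],
                               const int16_t b[4], const int16_t c[4],
                               int lane) {
  for (int i = 0; i < 4; ++i)
    d[i] = a[i] + (int32_t)b[i] * (int32_t)c[lane]; /* c[lane] broadcast to all lanes */
}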
-// CHECK-LABEL: @test_vmlal_n_s16(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 %c, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
-// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> [[VECINIT3_I]])
-// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]]
-// CHECK: ret <4 x i32> [[ADD_I]]
+// CHECK-LABEL: define <4 x i32> @test_vmlal_n_s16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], i16 noundef signext [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]])
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A]], [[VMULL2_I_I]]
+// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
+//
int32x4_t test_vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
return vmlal_n_s16(a, b, c);
}
-// CHECK-LABEL: @test_vmlal_n_s32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 %c, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
-// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> [[VECINIT1_I]])
-// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]]
-// CHECK: ret <2 x i64> [[ADD_I]]
+// CHECK-LABEL: define <2 x i64> @test_vmlal_n_s32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], i32 noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]])
+// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A]], [[VMULL2_I_I]]
+// CHECK-NEXT: ret <2 x i64> [[ADD_I]]
+//
int64x2_t test_vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
return vmlal_n_s32(a, b, c);
}
-// CHECK-LABEL: @test_vmlal_n_u16(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 %c, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
-// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> [[VECINIT3_I]])
-// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]]
-// CHECK: ret <4 x i32> [[ADD_I]]
+// CHECK-LABEL: define <4 x i32> @test_vmlal_n_u16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], i16 noundef zeroext [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]])
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A]], [[VMULL2_I_I]]
+// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
+//
uint32x4_t test_vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) {
return vmlal_n_u16(a, b, c);
}
-// CHECK-LABEL: @test_vmlal_n_u32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 %c, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
-// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> [[VECINIT1_I]])
-// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]]
-// CHECK: ret <2 x i64> [[ADD_I]]
+// CHECK-LABEL: define <2 x i64> @test_vmlal_n_u32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], i32 noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]])
+// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A]], [[VMULL2_I_I]]
+// CHECK-NEXT: ret <2 x i64> [[ADD_I]]
+//
uint64x2_t test_vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) {
return vmlal_n_u32(a, b, c);
}
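The _n tests build the splat with a chain of insertelement instructions instead of a shuffle: the scalar `c` is written into every lane, and the resulting vector feeds the same widening multiply. Scalar model (hypothetical helper, illustrative only):

#include <stdint.h>

static void vmlal_n_s16_ref(int32_t d[4], const int32_t a[4],
                            const int16_t b[4], int16_t c) {
  for (int i = 0; i < 4; ++i)
    d[i] = a[i] + (int32_t)b[i] * (int32_t)c; /* scalar c used for every lane */
}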
-// CHECK-LABEL: @test_vmla_lane_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
-// CHECK: [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]]
-// CHECK: ret <4 x i16> [[ADD]]
+// CHECK-LABEL: define <4 x i16> @test_vmla_lane_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B]], [[LANE]]
+// CHECK-NEXT: [[ADD:%.*]] = add <4 x i16> [[A]], [[MUL]]
+// CHECK-NEXT: ret <4 x i16> [[ADD]]
+//
int16x4_t test_vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
return vmla_lane_s16(a, b, c, 3);
}
-// CHECK-LABEL: @test_vmla_lane_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
-// CHECK: [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]]
-// CHECK: ret <2 x i32> [[ADD]]
+// CHECK-LABEL: define <2 x i32> @test_vmla_lane_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B]], [[LANE]]
+// CHECK-NEXT: [[ADD:%.*]] = add <2 x i32> [[A]], [[MUL]]
+// CHECK-NEXT: ret <2 x i32> [[ADD]]
+//
int32x2_t test_vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
return vmla_lane_s32(a, b, c, 1);
}
-// CHECK-LABEL: @test_vmla_lane_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
-// CHECK: [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]]
-// CHECK: ret <4 x i16> [[ADD]]
+// CHECK-LABEL: define <4 x i16> @test_vmla_lane_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B]], [[LANE]]
+// CHECK-NEXT: [[ADD:%.*]] = add <4 x i16> [[A]], [[MUL]]
+// CHECK-NEXT: ret <4 x i16> [[ADD]]
+//
uint16x4_t test_vmla_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) {
return vmla_lane_u16(a, b, c, 3);
}
-// CHECK-LABEL: @test_vmla_lane_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
-// CHECK: [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]]
-// CHECK: ret <2 x i32> [[ADD]]
+// CHECK-LABEL: define <2 x i32> @test_vmla_lane_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B]], [[LANE]]
+// CHECK-NEXT: [[ADD:%.*]] = add <2 x i32> [[A]], [[MUL]]
+// CHECK-NEXT: ret <2 x i32> [[ADD]]
+//
uint32x2_t test_vmla_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) {
return vmla_lane_u32(a, b, c, 1);
}
-// CHECK-LABEL: @test_vmla_lane_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[C:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK: [[MUL:%.*]] = fmul <2 x float> [[B:%.*]], [[LANE]]
-// CHECK: [[ADD:%.*]] = fadd <2 x float> [[A:%.*]], [[MUL]]
-// CHECK: ret <2 x float> [[ADD]]
+// CHECK-LABEL: define <2 x float> @test_vmla_lane_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]], <2 x float> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[C]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[B]], [[LANE]]
+// CHECK-NEXT: [[ADD:%.*]] = fadd <2 x float> [[A]], [[MUL]]
+// CHECK-NEXT: ret <2 x float> [[ADD]]
+//
float32x2_t test_vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t c) {
return vmla_lane_f32(a, b, c, 1);
}
-// CHECK-LABEL: @test_vmlaq_lane_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-// CHECK: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
-// CHECK: [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]]
-// CHECK: ret <8 x i16> [[ADD]]
+// CHECK-LABEL: define <8 x i16> @test_vmlaq_lane_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B]], [[LANE]]
+// CHECK-NEXT: [[ADD:%.*]] = add <8 x i16> [[A]], [[MUL]]
+// CHECK-NEXT: ret <8 x i16> [[ADD]]
+//
int16x8_t test_vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t c) {
return vmlaq_lane_s16(a, b, c, 3);
}
-// CHECK-LABEL: @test_vmlaq_lane_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-// CHECK: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
-// CHECK: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]]
-// CHECK: ret <4 x i32> [[ADD]]
+// CHECK-LABEL: define <4 x i32> @test_vmlaq_lane_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B]], [[LANE]]
+// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A]], [[MUL]]
+// CHECK-NEXT: ret <4 x i32> [[ADD]]
+//
int32x4_t test_vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t c) {
return vmlaq_lane_s32(a, b, c, 1);
}
-// CHECK-LABEL: @test_vmlaq_lane_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-// CHECK: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
-// CHECK: [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]]
-// CHECK: ret <8 x i16> [[ADD]]
+// CHECK-LABEL: define <8 x i16> @test_vmlaq_lane_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B]], [[LANE]]
+// CHECK-NEXT: [[ADD:%.*]] = add <8 x i16> [[A]], [[MUL]]
+// CHECK-NEXT: ret <8 x i16> [[ADD]]
+//
uint16x8_t test_vmlaq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t c) {
return vmlaq_lane_u16(a, b, c, 3);
}
-// CHECK-LABEL: @test_vmlaq_lane_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-// CHECK: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
-// CHECK: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]]
-// CHECK: ret <4 x i32> [[ADD]]
+// CHECK-LABEL: define <4 x i32> @test_vmlaq_lane_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B]], [[LANE]]
+// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A]], [[MUL]]
+// CHECK-NEXT: ret <4 x i32> [[ADD]]
+//
uint32x4_t test_vmlaq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t c) {
return vmlaq_lane_u32(a, b, c, 1);
}
-// CHECK-LABEL: @test_vmlaq_lane_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[C:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-// CHECK: [[MUL:%.*]] = fmul <4 x float> [[B:%.*]], [[LANE]]
-// CHECK: [[ADD:%.*]] = fadd <4 x float> [[A:%.*]], [[MUL]]
-// CHECK: ret <4 x float> [[ADD]]
+// CHECK-LABEL: define <4 x float> @test_vmlaq_lane_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]], <2 x float> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[C]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK-NEXT: [[MUL:%.*]] = fmul <4 x float> [[B]], [[LANE]]
+// CHECK-NEXT: [[ADD:%.*]] = fadd <4 x float> [[A]], [[MUL]]
+// CHECK-NEXT: ret <4 x float> [[ADD]]
+//
float32x4_t test_vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t c) {
return vmlaq_lane_f32(a, b, c, 1);
}
-// CHECK-LABEL: @test_vmla_n_s16(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 %c, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
-// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]]
-// CHECK: [[ADD_I:%.*]] = add <4 x i16> %a, [[MUL_I]]
-// CHECK: ret <4 x i16> [[ADD_I]]
+// CHECK-LABEL: define <4 x i16> @test_vmla_n_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], i16 noundef signext [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i16> [[B]], [[VECINIT3_I]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i16> [[A]], [[MUL_I]]
+// CHECK-NEXT: ret <4 x i16> [[ADD_I]]
+//
int16x4_t test_vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c) {
return vmla_n_s16(a, b, c);
}
-// CHECK-LABEL: @test_vmla_n_s32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 %c, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
-// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]]
-// CHECK: [[ADD_I:%.*]] = add <2 x i32> %a, [[MUL_I]]
-// CHECK: ret <2 x i32> [[ADD_I]]
+// CHECK-LABEL: define <2 x i32> @test_vmla_n_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], i32 noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <2 x i32> [[B]], [[VECINIT1_I]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i32> [[A]], [[MUL_I]]
+// CHECK-NEXT: ret <2 x i32> [[ADD_I]]
+//
int32x2_t test_vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c) {
return vmla_n_s32(a, b, c);
}
-// CHECK-LABEL: @test_vmla_n_u16(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 %c, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
-// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]]
-// CHECK: [[ADD_I:%.*]] = add <4 x i16> %a, [[MUL_I]]
-// CHECK: ret <4 x i16> [[ADD_I]]
+// CHECK-LABEL: define <4 x i16> @test_vmla_n_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], i16 noundef zeroext [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i16> [[B]], [[VECINIT3_I]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i16> [[A]], [[MUL_I]]
+// CHECK-NEXT: ret <4 x i16> [[ADD_I]]
+//
uint16x4_t test_vmla_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) {
return vmla_n_u16(a, b, c);
}
-// CHECK-LABEL: @test_vmla_n_u32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 %c, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
-// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]]
-// CHECK: [[ADD_I:%.*]] = add <2 x i32> %a, [[MUL_I]]
-// CHECK: ret <2 x i32> [[ADD_I]]
+// CHECK-LABEL: define <2 x i32> @test_vmla_n_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], i32 noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <2 x i32> [[B]], [[VECINIT1_I]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i32> [[A]], [[MUL_I]]
+// CHECK-NEXT: ret <2 x i32> [[ADD_I]]
+//
uint32x2_t test_vmla_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) {
return vmla_n_u32(a, b, c);
}
-// CHECK-LABEL: @test_vmla_n_f32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x float> poison, float %c, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %c, i32 1
-// CHECK: [[MUL_I:%.*]] = fmul <2 x float> %b, [[VECINIT1_I]]
-// CHECK: [[ADD_I:%.*]] = fadd <2 x float> %a, [[MUL_I]]
-// CHECK: ret <2 x float> [[ADD_I]]
+// CHECK-LABEL: define <2 x float> @test_vmla_n_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]], float noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x float> poison, float [[C]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float [[C]], i32 1
+// CHECK-NEXT: [[MUL_I:%.*]] = fmul <2 x float> [[B]], [[VECINIT1_I]]
+// CHECK-NEXT: [[ADD_I:%.*]] = fadd <2 x float> [[A]], [[MUL_I]]
+// CHECK-NEXT: ret <2 x float> [[ADD_I]]
+//
float32x2_t test_vmla_n_f32(float32x2_t a, float32x2_t b, float32_t c) {
return vmla_n_f32(a, b, c);
}
-// CHECK-LABEL: @test_vmlaq_n_s16(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 %c, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3
-// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4
-// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5
-// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6
-// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7
-// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]]
-// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[MUL_I]]
-// CHECK: ret <8 x i16> [[ADD_I]]
+// CHECK-LABEL: define <8 x i16> @test_vmlaq_n_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], i16 noundef signext [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[C]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[C]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
+// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[C]], i32 4
+// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[C]], i32 5
+// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[C]], i32 6
+// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[C]], i32 7
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i16> [[B]], [[VECINIT7_I]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[A]], [[MUL_I]]
+// CHECK-NEXT: ret <8 x i16> [[ADD_I]]
+//
int16x8_t test_vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c) {
return vmlaq_n_s16(a, b, c);
}
-// CHECK-LABEL: @test_vmlaq_n_s32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 %c, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3
-// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]]
-// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[MUL_I]]
-// CHECK: ret <4 x i32> [[ADD_I]]
+// CHECK-LABEL: define <4 x i32> @test_vmlaq_n_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], i32 noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[C]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[C]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[C]], i32 2
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[C]], i32 3
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i32> [[B]], [[VECINIT3_I]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A]], [[MUL_I]]
+// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
+//
int32x4_t test_vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c) {
return vmlaq_n_s32(a, b, c);
}
-// CHECK-LABEL: @test_vmlaq_n_u16(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 %c, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3
-// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4
-// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5
-// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6
-// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7
-// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]]
-// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[MUL_I]]
-// CHECK: ret <8 x i16> [[ADD_I]]
+// CHECK-LABEL: define <8 x i16> @test_vmlaq_n_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], i16 noundef zeroext [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[C]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[C]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
+// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[C]], i32 4
+// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[C]], i32 5
+// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[C]], i32 6
+// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[C]], i32 7
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i16> [[B]], [[VECINIT7_I]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[A]], [[MUL_I]]
+// CHECK-NEXT: ret <8 x i16> [[ADD_I]]
+//
uint16x8_t test_vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) {
return vmlaq_n_u16(a, b, c);
}
-// CHECK-LABEL: @test_vmlaq_n_u32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 %c, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3
-// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]]
-// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[MUL_I]]
-// CHECK: ret <4 x i32> [[ADD_I]]
+// CHECK-LABEL: define <4 x i32> @test_vmlaq_n_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], i32 noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[C]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[C]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[C]], i32 2
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[C]], i32 3
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i32> [[B]], [[VECINIT3_I]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A]], [[MUL_I]]
+// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
+//
uint32x4_t test_vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) {
return vmlaq_n_u32(a, b, c);
}
-// CHECK-LABEL: @test_vmlaq_n_f32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float %c, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %c, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %c, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %c, i32 3
-// CHECK: [[MUL_I:%.*]] = fmul <4 x float> %b, [[VECINIT3_I]]
-// CHECK: [[ADD_I:%.*]] = fadd <4 x float> %a, [[MUL_I]]
-// CHECK: ret <4 x float> [[ADD_I]]
+// CHECK-LABEL: define <4 x float> @test_vmlaq_n_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]], float noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float [[C]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float [[C]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float [[C]], i32 2
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float [[C]], i32 3
+// CHECK-NEXT: [[MUL_I:%.*]] = fmul <4 x float> [[B]], [[VECINIT3_I]]
+// CHECK-NEXT: [[ADD_I:%.*]] = fadd <4 x float> [[A]], [[MUL_I]]
+// CHECK-NEXT: ret <4 x float> [[ADD_I]]
+//
float32x4_t test_vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c) {
return vmlaq_n_f32(a, b, c);
}
-// CHECK-LABEL: @test_vmls_s8(
-// CHECK: [[MUL_I:%.*]] = mul <8 x i8> %b, %c
-// CHECK: [[SUB_I:%.*]] = sub <8 x i8> %a, [[MUL_I]]
-// CHECK: ret <8 x i8> [[SUB_I]]
+// CHECK-LABEL: define <8 x i8> @test_vmls_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i8> [[B]], [[C]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i8> [[A]], [[MUL_I]]
+// CHECK-NEXT: ret <8 x i8> [[SUB_I]]
+//
int8x8_t test_vmls_s8(int8x8_t a, int8x8_t b, int8x8_t c) {
return vmls_s8(a, b, c);
}
-// CHECK-LABEL: @test_vmls_s16(
-// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %b, %c
-// CHECK: [[SUB_I:%.*]] = sub <4 x i16> %a, [[MUL_I]]
-// CHECK: ret <4 x i16> [[SUB_I]]
+// CHECK-LABEL: define <4 x i16> @test_vmls_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i16> [[B]], [[C]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i16> [[A]], [[MUL_I]]
+// CHECK-NEXT: ret <4 x i16> [[SUB_I]]
+//
int16x4_t test_vmls_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
return vmls_s16(a, b, c);
}
-// CHECK-LABEL: @test_vmls_s32(
-// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %b, %c
-// CHECK: [[SUB_I:%.*]] = sub <2 x i32> %a, [[MUL_I]]
-// CHECK: ret <2 x i32> [[SUB_I]]
+// CHECK-LABEL: define <2 x i32> @test_vmls_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <2 x i32> [[B]], [[C]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i32> [[A]], [[MUL_I]]
+// CHECK-NEXT: ret <2 x i32> [[SUB_I]]
+//
int32x2_t test_vmls_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
return vmls_s32(a, b, c);
}
-// CHECK-LABEL: @test_vmls_f32(
-// CHECK: [[MUL_I:%.*]] = fmul <2 x float> %b, %c
-// CHECK: [[SUB_I:%.*]] = fsub <2 x float> %a, [[MUL_I]]
-// CHECK: ret <2 x float> [[SUB_I]]
+// CHECK-LABEL: define <2 x float> @test_vmls_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]], <2 x float> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = fmul <2 x float> [[B]], [[C]]
+// CHECK-NEXT: [[SUB_I:%.*]] = fsub <2 x float> [[A]], [[MUL_I]]
+// CHECK-NEXT: ret <2 x float> [[SUB_I]]
+//
float32x2_t test_vmls_f32(float32x2_t a, float32x2_t b, float32x2_t c) {
return vmls_f32(a, b, c);
}
-// CHECK-LABEL: @test_vmls_u8(
-// CHECK: [[MUL_I:%.*]] = mul <8 x i8> %b, %c
-// CHECK: [[SUB_I:%.*]] = sub <8 x i8> %a, [[MUL_I]]
-// CHECK: ret <8 x i8> [[SUB_I]]
+// CHECK-LABEL: define <8 x i8> @test_vmls_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i8> [[B]], [[C]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i8> [[A]], [[MUL_I]]
+// CHECK-NEXT: ret <8 x i8> [[SUB_I]]
+//
uint8x8_t test_vmls_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) {
return vmls_u8(a, b, c);
}
-// CHECK-LABEL: @test_vmls_u16(
-// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %b, %c
-// CHECK: [[SUB_I:%.*]] = sub <4 x i16> %a, [[MUL_I]]
-// CHECK: ret <4 x i16> [[SUB_I]]
+// CHECK-LABEL: define <4 x i16> @test_vmls_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i16> [[B]], [[C]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i16> [[A]], [[MUL_I]]
+// CHECK-NEXT: ret <4 x i16> [[SUB_I]]
+//
uint16x4_t test_vmls_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) {
return vmls_u16(a, b, c);
}
-// CHECK-LABEL: @test_vmls_u32(
-// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %b, %c
-// CHECK: [[SUB_I:%.*]] = sub <2 x i32> %a, [[MUL_I]]
-// CHECK: ret <2 x i32> [[SUB_I]]
+// CHECK-LABEL: define <2 x i32> @test_vmls_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <2 x i32> [[B]], [[C]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i32> [[A]], [[MUL_I]]
+// CHECK-NEXT: ret <2 x i32> [[SUB_I]]
+//
uint32x2_t test_vmls_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) {
return vmls_u32(a, b, c);
}
-// CHECK-LABEL: @test_vmlsq_s8(
-// CHECK: [[MUL_I:%.*]] = mul <16 x i8> %b, %c
-// CHECK: [[SUB_I:%.*]] = sub <16 x i8> %a, [[MUL_I]]
-// CHECK: ret <16 x i8> [[SUB_I]]
+// CHECK-LABEL: define <16 x i8> @test_vmlsq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]], <16 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <16 x i8> [[B]], [[C]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <16 x i8> [[A]], [[MUL_I]]
+// CHECK-NEXT: ret <16 x i8> [[SUB_I]]
+//
int8x16_t test_vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c) {
return vmlsq_s8(a, b, c);
}
-// CHECK-LABEL: @test_vmlsq_s16(
-// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %b, %c
-// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[MUL_I]]
-// CHECK: ret <8 x i16> [[SUB_I]]
+// CHECK-LABEL: define <8 x i16> @test_vmlsq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <8 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i16> [[B]], [[C]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> [[A]], [[MUL_I]]
+// CHECK-NEXT: ret <8 x i16> [[SUB_I]]
+//
int16x8_t test_vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c) {
return vmlsq_s16(a, b, c);
}
-// CHECK-LABEL: @test_vmlsq_s32(
-// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %b, %c
-// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[MUL_I]]
-// CHECK: ret <4 x i32> [[SUB_I]]
+// CHECK-LABEL: define <4 x i32> @test_vmlsq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i32> [[B]], [[C]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A]], [[MUL_I]]
+// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
+//
int32x4_t test_vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c) {
return vmlsq_s32(a, b, c);
}
-// CHECK-LABEL: @test_vmlsq_f32(
-// CHECK: [[MUL_I:%.*]] = fmul <4 x float> %b, %c
-// CHECK: [[SUB_I:%.*]] = fsub <4 x float> %a, [[MUL_I]]
-// CHECK: ret <4 x float> [[SUB_I]]
+// CHECK-LABEL: define <4 x float> @test_vmlsq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]], <4 x float> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = fmul <4 x float> [[B]], [[C]]
+// CHECK-NEXT: [[SUB_I:%.*]] = fsub <4 x float> [[A]], [[MUL_I]]
+// CHECK-NEXT: ret <4 x float> [[SUB_I]]
+//
float32x4_t test_vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c) {
return vmlsq_f32(a, b, c);
}
-// CHECK-LABEL: @test_vmlsq_u8(
-// CHECK: [[MUL_I:%.*]] = mul <16 x i8> %b, %c
-// CHECK: [[SUB_I:%.*]] = sub <16 x i8> %a, [[MUL_I]]
-// CHECK: ret <16 x i8> [[SUB_I]]
+// CHECK-LABEL: define <16 x i8> @test_vmlsq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]], <16 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <16 x i8> [[B]], [[C]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <16 x i8> [[A]], [[MUL_I]]
+// CHECK-NEXT: ret <16 x i8> [[SUB_I]]
+//
uint8x16_t test_vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) {
return vmlsq_u8(a, b, c);
}
-// CHECK-LABEL: @test_vmlsq_u16(
-// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %b, %c
-// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[MUL_I]]
-// CHECK: ret <8 x i16> [[SUB_I]]
+// CHECK-LABEL: define <8 x i16> @test_vmlsq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <8 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i16> [[B]], [[C]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> [[A]], [[MUL_I]]
+// CHECK-NEXT: ret <8 x i16> [[SUB_I]]
+//
uint16x8_t test_vmlsq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) {
return vmlsq_u16(a, b, c);
}
-// CHECK-LABEL: @test_vmlsq_u32(
-// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %b, %c
-// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[MUL_I]]
-// CHECK: ret <4 x i32> [[SUB_I]]
+// CHECK-LABEL: define <4 x i32> @test_vmlsq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i32> [[B]], [[C]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A]], [[MUL_I]]
+// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
+//
uint32x4_t test_vmlsq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) {
return vmlsq_u32(a, b, c);
}
-// CHECK-LABEL: @test_vmlsl_s8(
-// CHECK: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %b, <8 x i8> %c)
-// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[VMULL_I_I]]
-// CHECK: ret <8 x i16> [[SUB_I]]
+// CHECK-LABEL: define <8 x i16> @test_vmlsl_s8(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMULL_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> [[B]], <8 x i8> [[C]])
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> [[A]], [[VMULL_I]]
+// CHECK-NEXT: ret <8 x i16> [[SUB_I]]
+//
int16x8_t test_vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c) {
return vmlsl_s8(a, b, c);
}
-// CHECK-LABEL: @test_vmlsl_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
-// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %c)
-// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]]
-// CHECK: ret <4 x i32> [[SUB_I]]
+// CHECK-LABEL: define <4 x i32> @test_vmlsl_s16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8>
+// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[B]], <4 x i16> [[C]])
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A]], [[VMULL2_I_I]]
+// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
+//
int32x4_t test_vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
return vmlsl_s16(a, b, c);
}
-// CHECK-LABEL: @test_vmlsl_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
-// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %c)
-// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]]
-// CHECK: ret <2 x i64> [[SUB_I]]
+// CHECK-LABEL: define <2 x i64> @test_vmlsl_s32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8>
+// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[B]], <2 x i32> [[C]])
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[A]], [[VMULL2_I_I]]
+// CHECK-NEXT: ret <2 x i64> [[SUB_I]]
+//
int64x2_t test_vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
return vmlsl_s32(a, b, c);
}
-// CHECK-LABEL: @test_vmlsl_u8(
-// CHECK: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %b, <8 x i8> %c)
-// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[VMULL_I_I]]
-// CHECK: ret <8 x i16> [[SUB_I]]
+// CHECK-LABEL: define <8 x i16> @test_vmlsl_u8(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMULL_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> [[B]], <8 x i8> [[C]])
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> [[A]], [[VMULL_I]]
+// CHECK-NEXT: ret <8 x i16> [[SUB_I]]
+//
uint16x8_t test_vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) {
return vmlsl_u8(a, b, c);
}
-// CHECK-LABEL: @test_vmlsl_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
-// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %c)
-// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]]
-// CHECK: ret <4 x i32> [[SUB_I]]
+// CHECK-LABEL: define <4 x i32> @test_vmlsl_u16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8>
+// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[B]], <4 x i16> [[C]])
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A]], [[VMULL2_I_I]]
+// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
+//
uint32x4_t test_vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) {
return vmlsl_u16(a, b, c);
}
-// CHECK-LABEL: @test_vmlsl_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
-// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %c)
-// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]]
-// CHECK: ret <2 x i64> [[SUB_I]]
+// CHECK-LABEL: define <2 x i64> @test_vmlsl_u32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8>
+// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[B]], <2 x i32> [[C]])
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[A]], [[VMULL2_I_I]]
+// CHECK-NEXT: ret <2 x i64> [[SUB_I]]
+//
uint64x2_t test_vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) {
return vmlsl_u32(a, b, c);
}
-// CHECK-LABEL: @test_vmlsl_lane_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
-// CHECK: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
-// CHECK: ret <4 x i32> [[SUB]]
+// CHECK-LABEL: define <4 x i32> @test_vmlsl_lane_s16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A]], [[VMULL2_I]]
+// CHECK-NEXT: ret <4 x i32> [[SUB]]
+//
int32x4_t test_vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
return vmlsl_lane_s16(a, b, c, 3);
}
-// CHECK-LABEL: @test_vmlsl_lane_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
-// CHECK: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
-// CHECK: ret <2 x i64> [[SUB]]
+// CHECK-LABEL: define <2 x i64> @test_vmlsl_lane_s32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A]], [[VMULL2_I]]
+// CHECK-NEXT: ret <2 x i64> [[SUB]]
+//
int64x2_t test_vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
return vmlsl_lane_s32(a, b, c, 1);
}
-// CHECK-LABEL: @test_vmlsl_lane_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
-// CHECK: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
-// CHECK: ret <4 x i32> [[SUB]]
+// CHECK-LABEL: define <4 x i32> @test_vmlsl_lane_u16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A]], [[VMULL2_I]]
+// CHECK-NEXT: ret <4 x i32> [[SUB]]
+//
uint32x4_t test_vmlsl_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) {
return vmlsl_lane_u16(a, b, c, 3);
}
-// CHECK-LABEL: @test_vmlsl_lane_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
-// CHECK: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
-// CHECK: ret <2 x i64> [[SUB]]
+// CHECK-LABEL: define <2 x i64> @test_vmlsl_lane_u32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A]], [[VMULL2_I]]
+// CHECK-NEXT: ret <2 x i64> [[SUB]]
+//
uint64x2_t test_vmlsl_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) {
return vmlsl_lane_u32(a, b, c, 1);
}
-// CHECK-LABEL: @test_vmlsl_n_s16(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 %c, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
-// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> [[VECINIT3_I]])
-// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]]
-// CHECK: ret <4 x i32> [[SUB_I]]
+// CHECK-LABEL: define <4 x i32> @test_vmlsl_n_s16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], i16 noundef signext [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]])
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A]], [[VMULL2_I_I]]
+// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
+//
int32x4_t test_vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
return vmlsl_n_s16(a, b, c);
}
-// CHECK-LABEL: @test_vmlsl_n_s32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 %c, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
-// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> [[VECINIT1_I]])
-// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]]
-// CHECK: ret <2 x i64> [[SUB_I]]
+// CHECK-LABEL: define <2 x i64> @test_vmlsl_n_s32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], i32 noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]])
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[A]], [[VMULL2_I_I]]
+// CHECK-NEXT: ret <2 x i64> [[SUB_I]]
+//
int64x2_t test_vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
return vmlsl_n_s32(a, b, c);
}
-// CHECK-LABEL: @test_vmlsl_n_u16(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 %c, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
-// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> [[VECINIT3_I]])
-// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]]
-// CHECK: ret <4 x i32> [[SUB_I]]
+// CHECK-LABEL: define <4 x i32> @test_vmlsl_n_u16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], i16 noundef zeroext [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]])
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A]], [[VMULL2_I_I]]
+// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
+//
uint32x4_t test_vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) {
return vmlsl_n_u16(a, b, c);
}
-// CHECK-LABEL: @test_vmlsl_n_u32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 %c, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
-// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> [[VECINIT1_I]])
-// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]]
-// CHECK: ret <2 x i64> [[SUB_I]]
+// CHECK-LABEL: define <2 x i64> @test_vmlsl_n_u32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], i32 noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]])
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[A]], [[VMULL2_I_I]]
+// CHECK-NEXT: ret <2 x i64> [[SUB_I]]
+//
uint64x2_t test_vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) {
return vmlsl_n_u32(a, b, c);
}
-// CHECK-LABEL: @test_vmls_lane_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
-// CHECK: [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]]
-// CHECK: ret <4 x i16> [[SUB]]
+// CHECK-LABEL: define <4 x i16> @test_vmls_lane_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B]], [[LANE]]
+// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i16> [[A]], [[MUL]]
+// CHECK-NEXT: ret <4 x i16> [[SUB]]
+//
int16x4_t test_vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
return vmls_lane_s16(a, b, c, 3);
}
-// CHECK-LABEL: @test_vmls_lane_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
-// CHECK: [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]]
-// CHECK: ret <2 x i32> [[SUB]]
+// CHECK-LABEL: define <2 x i32> @test_vmls_lane_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B]], [[LANE]]
+// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i32> [[A]], [[MUL]]
+// CHECK-NEXT: ret <2 x i32> [[SUB]]
+//
int32x2_t test_vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
return vmls_lane_s32(a, b, c, 1);
}
-// CHECK-LABEL: @test_vmls_lane_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
-// CHECK: [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]]
-// CHECK: ret <4 x i16> [[SUB]]
+// CHECK-LABEL: define <4 x i16> @test_vmls_lane_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B]], [[LANE]]
+// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i16> [[A]], [[MUL]]
+// CHECK-NEXT: ret <4 x i16> [[SUB]]
+//
uint16x4_t test_vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) {
return vmls_lane_u16(a, b, c, 3);
}
-// CHECK-LABEL: @test_vmls_lane_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
-// CHECK: [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]]
-// CHECK: ret <2 x i32> [[SUB]]
+// CHECK-LABEL: define <2 x i32> @test_vmls_lane_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B]], [[LANE]]
+// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i32> [[A]], [[MUL]]
+// CHECK-NEXT: ret <2 x i32> [[SUB]]
+//
uint32x2_t test_vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) {
return vmls_lane_u32(a, b, c, 1);
}
-// CHECK-LABEL: @test_vmls_lane_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[C:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK: [[MUL:%.*]] = fmul <2 x float> [[B:%.*]], [[LANE]]
-// CHECK: [[SUB:%.*]] = fsub <2 x float> [[A:%.*]], [[MUL]]
-// CHECK: ret <2 x float> [[SUB]]
+// CHECK-LABEL: define <2 x float> @test_vmls_lane_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]], <2 x float> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[C]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[B]], [[LANE]]
+// CHECK-NEXT: [[SUB:%.*]] = fsub <2 x float> [[A]], [[MUL]]
+// CHECK-NEXT: ret <2 x float> [[SUB]]
+//
float32x2_t test_vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t c) {
return vmls_lane_f32(a, b, c, 1);
}
-// CHECK-LABEL: @test_vmlsq_lane_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-// CHECK: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
-// CHECK: [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]]
-// CHECK: ret <8 x i16> [[SUB]]
+// CHECK-LABEL: define <8 x i16> @test_vmlsq_lane_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B]], [[LANE]]
+// CHECK-NEXT: [[SUB:%.*]] = sub <8 x i16> [[A]], [[MUL]]
+// CHECK-NEXT: ret <8 x i16> [[SUB]]
+//
int16x8_t test_vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t c) {
return vmlsq_lane_s16(a, b, c, 3);
}
-// CHECK-LABEL: @test_vmlsq_lane_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-// CHECK: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
-// CHECK: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]]
-// CHECK: ret <4 x i32> [[SUB]]
+// CHECK-LABEL: define <4 x i32> @test_vmlsq_lane_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B]], [[LANE]]
+// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A]], [[MUL]]
+// CHECK-NEXT: ret <4 x i32> [[SUB]]
+//
int32x4_t test_vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t c) {
return vmlsq_lane_s32(a, b, c, 1);
}
-// CHECK-LABEL: @test_vmlsq_lane_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-// CHECK: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
-// CHECK: [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]]
-// CHECK: ret <8 x i16> [[SUB]]
+// CHECK-LABEL: define <8 x i16> @test_vmlsq_lane_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B]], [[LANE]]
+// CHECK-NEXT: [[SUB:%.*]] = sub <8 x i16> [[A]], [[MUL]]
+// CHECK-NEXT: ret <8 x i16> [[SUB]]
+//
uint16x8_t test_vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t c) {
return vmlsq_lane_u16(a, b, c, 3);
}
-// CHECK-LABEL: @test_vmlsq_lane_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-// CHECK: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
-// CHECK: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]]
-// CHECK: ret <4 x i32> [[SUB]]
+// CHECK-LABEL: define <4 x i32> @test_vmlsq_lane_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B]], [[LANE]]
+// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A]], [[MUL]]
+// CHECK-NEXT: ret <4 x i32> [[SUB]]
+//
uint32x4_t test_vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t c) {
return vmlsq_lane_u32(a, b, c, 1);
}
-// CHECK-LABEL: @test_vmlsq_lane_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[C:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-// CHECK: [[MUL:%.*]] = fmul <4 x float> [[B:%.*]], [[LANE]]
-// CHECK: [[SUB:%.*]] = fsub <4 x float> [[A:%.*]], [[MUL]]
-// CHECK: ret <4 x float> [[SUB]]
+// CHECK-LABEL: define <4 x float> @test_vmlsq_lane_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]], <2 x float> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[C]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK-NEXT: [[MUL:%.*]] = fmul <4 x float> [[B]], [[LANE]]
+// CHECK-NEXT: [[SUB:%.*]] = fsub <4 x float> [[A]], [[MUL]]
+// CHECK-NEXT: ret <4 x float> [[SUB]]
+//
float32x4_t test_vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t c) {
return vmlsq_lane_f32(a, b, c, 1);
}
-// CHECK-LABEL: @test_vmls_n_s16(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 %c, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
-// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]]
-// CHECK: [[SUB_I:%.*]] = sub <4 x i16> %a, [[MUL_I]]
-// CHECK: ret <4 x i16> [[SUB_I]]
+// CHECK-LABEL: define <4 x i16> @test_vmls_n_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], i16 noundef signext [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i16> [[B]], [[VECINIT3_I]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i16> [[A]], [[MUL_I]]
+// CHECK-NEXT: ret <4 x i16> [[SUB_I]]
+//
int16x4_t test_vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c) {
return vmls_n_s16(a, b, c);
}
-// CHECK-LABEL: @test_vmls_n_s32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 %c, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
-// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]]
-// CHECK: [[SUB_I:%.*]] = sub <2 x i32> %a, [[MUL_I]]
-// CHECK: ret <2 x i32> [[SUB_I]]
+// CHECK-LABEL: define <2 x i32> @test_vmls_n_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], i32 noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <2 x i32> [[B]], [[VECINIT1_I]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i32> [[A]], [[MUL_I]]
+// CHECK-NEXT: ret <2 x i32> [[SUB_I]]
+//
int32x2_t test_vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c) {
return vmls_n_s32(a, b, c);
}
-// CHECK-LABEL: @test_vmls_n_u16(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 %c, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
-// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]]
-// CHECK: [[SUB_I:%.*]] = sub <4 x i16> %a, [[MUL_I]]
-// CHECK: ret <4 x i16> [[SUB_I]]
+// CHECK-LABEL: define <4 x i16> @test_vmls_n_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], i16 noundef zeroext [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i16> [[B]], [[VECINIT3_I]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i16> [[A]], [[MUL_I]]
+// CHECK-NEXT: ret <4 x i16> [[SUB_I]]
+//
uint16x4_t test_vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) {
return vmls_n_u16(a, b, c);
}
-// CHECK-LABEL: @test_vmls_n_u32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 %c, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
-// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]]
-// CHECK: [[SUB_I:%.*]] = sub <2 x i32> %a, [[MUL_I]]
-// CHECK: ret <2 x i32> [[SUB_I]]
+// CHECK-LABEL: define <2 x i32> @test_vmls_n_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], i32 noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <2 x i32> [[B]], [[VECINIT1_I]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i32> [[A]], [[MUL_I]]
+// CHECK-NEXT: ret <2 x i32> [[SUB_I]]
+//
uint32x2_t test_vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) {
return vmls_n_u32(a, b, c);
}
-// CHECK-LABEL: @test_vmls_n_f32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x float> poison, float %c, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %c, i32 1
-// CHECK: [[MUL_I:%.*]] = fmul <2 x float> %b, [[VECINIT1_I]]
-// CHECK: [[SUB_I:%.*]] = fsub <2 x float> %a, [[MUL_I]]
-// CHECK: ret <2 x float> [[SUB_I]]
+// CHECK-LABEL: define <2 x float> @test_vmls_n_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]], float noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x float> poison, float [[C]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float [[C]], i32 1
+// CHECK-NEXT: [[MUL_I:%.*]] = fmul <2 x float> [[B]], [[VECINIT1_I]]
+// CHECK-NEXT: [[SUB_I:%.*]] = fsub <2 x float> [[A]], [[MUL_I]]
+// CHECK-NEXT: ret <2 x float> [[SUB_I]]
+//
float32x2_t test_vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c) {
return vmls_n_f32(a, b, c);
}
-// CHECK-LABEL: @test_vmlsq_n_s16(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 %c, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3
-// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4
-// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5
-// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6
-// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7
-// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]]
-// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[MUL_I]]
-// CHECK: ret <8 x i16> [[SUB_I]]
+// CHECK-LABEL: define <8 x i16> @test_vmlsq_n_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], i16 noundef signext [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[C]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[C]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
+// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[C]], i32 4
+// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[C]], i32 5
+// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[C]], i32 6
+// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[C]], i32 7
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i16> [[B]], [[VECINIT7_I]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> [[A]], [[MUL_I]]
+// CHECK-NEXT: ret <8 x i16> [[SUB_I]]
+//
int16x8_t test_vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c) {
return vmlsq_n_s16(a, b, c);
}
-// CHECK-LABEL: @test_vmlsq_n_s32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 %c, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3
-// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]]
-// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[MUL_I]]
-// CHECK: ret <4 x i32> [[SUB_I]]
+// CHECK-LABEL: define <4 x i32> @test_vmlsq_n_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], i32 noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[C]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[C]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[C]], i32 2
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[C]], i32 3
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i32> [[B]], [[VECINIT3_I]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A]], [[MUL_I]]
+// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
+//
int32x4_t test_vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c) {
return vmlsq_n_s32(a, b, c);
}
-// CHECK-LABEL: @test_vmlsq_n_u16(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 %c, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3
-// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4
-// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5
-// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6
-// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7
-// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]]
-// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[MUL_I]]
-// CHECK: ret <8 x i16> [[SUB_I]]
+// CHECK-LABEL: define <8 x i16> @test_vmlsq_n_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], i16 noundef zeroext [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[C]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[C]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
+// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[C]], i32 4
+// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[C]], i32 5
+// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[C]], i32 6
+// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[C]], i32 7
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i16> [[B]], [[VECINIT7_I]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> [[A]], [[MUL_I]]
+// CHECK-NEXT: ret <8 x i16> [[SUB_I]]
+//
uint16x8_t test_vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) {
return vmlsq_n_u16(a, b, c);
}
-// CHECK-LABEL: @test_vmlsq_n_u32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 %c, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3
-// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]]
-// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[MUL_I]]
-// CHECK: ret <4 x i32> [[SUB_I]]
+// CHECK-LABEL: define <4 x i32> @test_vmlsq_n_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], i32 noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[C]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[C]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[C]], i32 2
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[C]], i32 3
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i32> [[B]], [[VECINIT3_I]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A]], [[MUL_I]]
+// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
+//
uint32x4_t test_vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) {
return vmlsq_n_u32(a, b, c);
}
-// CHECK-LABEL: @test_vmlsq_n_f32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float %c, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %c, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %c, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %c, i32 3
-// CHECK: [[MUL_I:%.*]] = fmul <4 x float> %b, [[VECINIT3_I]]
-// CHECK: [[SUB_I:%.*]] = fsub <4 x float> %a, [[MUL_I]]
-// CHECK: ret <4 x float> [[SUB_I]]
+// CHECK-LABEL: define <4 x float> @test_vmlsq_n_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]], float noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float [[C]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float [[C]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float [[C]], i32 2
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float [[C]], i32 3
+// CHECK-NEXT: [[MUL_I:%.*]] = fmul <4 x float> [[B]], [[VECINIT3_I]]
+// CHECK-NEXT: [[SUB_I:%.*]] = fsub <4 x float> [[A]], [[MUL_I]]
+// CHECK-NEXT: ret <4 x float> [[SUB_I]]
+//
float32x4_t test_vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c) {
return vmlsq_n_f32(a, b, c);
}
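// --- Editor's note: illustrative sketch, not part of the generated patch.
// Every vmls_n_*/vmlsq_n_* check above has the same shape: the scalar c is
// splatted through an insertelement chain, then a lanewise mul/fmul feeds a
// sub/fsub, i.e. a - b * dup(c). A hypothetical reference helper for the
// f32 case, assuming only <arm_neon.h>:
static inline float32x4_t vmlsq_n_f32_ref(float32x4_t a, float32x4_t b,
                                          float32_t c) {
  // Same computation the IR above encodes: subtract the scaled vector.
  return vsubq_f32(a, vmulq_f32(b, vdupq_n_f32(c)));
}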
-// CHECK-LABEL: @test_vmovl_s8(
-// CHECK: [[VMOVL_I:%.*]] = sext <8 x i8> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[VMOVL_I]]
+// CHECK-LABEL: define <8 x i16> @test_vmovl_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I:%.*]] = sext <8 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[VMOVL_I]]
+//
int16x8_t test_vmovl_s8(int8x8_t a) {
return vmovl_s8(a);
}
-// CHECK-LABEL: @test_vmovl_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VMOVL_I:%.*]] = sext <4 x i16> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[VMOVL_I]]
+// CHECK-LABEL: define <4 x i32> @test_vmovl_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VMOVL_I:%.*]] = sext <4 x i16> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[VMOVL_I]]
+//
int32x4_t test_vmovl_s16(int16x4_t a) {
return vmovl_s16(a);
}
-// CHECK-LABEL: @test_vmovl_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VMOVL_I:%.*]] = sext <2 x i32> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[VMOVL_I]]
+// CHECK-LABEL: define <2 x i64> @test_vmovl_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VMOVL_I:%.*]] = sext <2 x i32> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[VMOVL_I]]
+//
int64x2_t test_vmovl_s32(int32x2_t a) {
return vmovl_s32(a);
}
-// CHECK-LABEL: @test_vmovl_u8(
-// CHECK: [[VMOVL_I:%.*]] = zext <8 x i8> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[VMOVL_I]]
+// CHECK-LABEL: define <8 x i16> @test_vmovl_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I:%.*]] = zext <8 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[VMOVL_I]]
+//
uint16x8_t test_vmovl_u8(uint8x8_t a) {
return vmovl_u8(a);
}
-// CHECK-LABEL: @test_vmovl_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VMOVL_I:%.*]] = zext <4 x i16> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[VMOVL_I]]
+// CHECK-LABEL: define <4 x i32> @test_vmovl_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VMOVL_I:%.*]] = zext <4 x i16> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[VMOVL_I]]
+//
uint32x4_t test_vmovl_u16(uint16x4_t a) {
return vmovl_u16(a);
}
-// CHECK-LABEL: @test_vmovl_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VMOVL_I:%.*]] = zext <2 x i32> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[VMOVL_I]]
+// CHECK-LABEL: define <2 x i64> @test_vmovl_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VMOVL_I:%.*]] = zext <2 x i32> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[VMOVL_I]]
+//
uint64x2_t test_vmovl_u32(uint32x2_t a) {
return vmovl_u32(a);
}
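// --- Editor's note: illustrative sketch, not part of the generated patch.
// vmovl_* lengthens each lane to double width: sext for the _s variants,
// zext for the _u variants. The leading bitcast in the 16/32-bit checks is
// dead and only reflects how the builtin passes its operand. A hypothetical
// equivalent using clang's __builtin_convertvector extension:
static inline int16x8_t vmovl_s8_ref(int8x8_t a) {
  return __builtin_convertvector(a, int16x8_t);  // lanewise sext i8 -> i16
}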
-// CHECK-LABEL: @test_vmovn_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VMOVN_I:%.*]] = trunc <8 x i16> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[VMOVN_I]]
+// CHECK-LABEL: define <8 x i8> @test_vmovn_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VMOVN_I:%.*]] = trunc <8 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[VMOVN_I]]
+//
int8x8_t test_vmovn_s16(int16x8_t a) {
return vmovn_s16(a);
}
-// CHECK-LABEL: @test_vmovn_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VMOVN_I:%.*]] = trunc <4 x i32> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[VMOVN_I]]
+// CHECK-LABEL: define <4 x i16> @test_vmovn_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VMOVN_I:%.*]] = trunc <4 x i32> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[VMOVN_I]]
+//
int16x4_t test_vmovn_s32(int32x4_t a) {
return vmovn_s32(a);
}
-// CHECK-LABEL: @test_vmovn_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VMOVN_I:%.*]] = trunc <2 x i64> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[VMOVN_I]]
+// CHECK-LABEL: define <2 x i32> @test_vmovn_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VMOVN_I:%.*]] = trunc <2 x i64> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[VMOVN_I]]
+//
int32x2_t test_vmovn_s64(int64x2_t a) {
return vmovn_s64(a);
}
-// CHECK-LABEL: @test_vmovn_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VMOVN_I:%.*]] = trunc <8 x i16> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[VMOVN_I]]
+// CHECK-LABEL: define <8 x i8> @test_vmovn_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VMOVN_I:%.*]] = trunc <8 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[VMOVN_I]]
+//
uint8x8_t test_vmovn_u16(uint16x8_t a) {
return vmovn_u16(a);
}
-// CHECK-LABEL: @test_vmovn_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VMOVN_I:%.*]] = trunc <4 x i32> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[VMOVN_I]]
+// CHECK-LABEL: define <4 x i16> @test_vmovn_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VMOVN_I:%.*]] = trunc <4 x i32> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[VMOVN_I]]
+//
uint16x4_t test_vmovn_u32(uint32x4_t a) {
return vmovn_u32(a);
}
-// CHECK-LABEL: @test_vmovn_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VMOVN_I:%.*]] = trunc <2 x i64> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[VMOVN_I]]
+// CHECK-LABEL: define <2 x i32> @test_vmovn_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VMOVN_I:%.*]] = trunc <2 x i64> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[VMOVN_I]]
+//
uint32x2_t test_vmovn_u64(uint64x2_t a) {
return vmovn_u64(a);
}
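// --- Editor's note: illustrative sketch, not part of the generated patch.
// vmovn_* narrows each lane to half width with a plain trunc; unlike
// vqmovn_* there is no saturation, so the high bits are simply dropped. A
// hypothetical equivalent:
static inline int8x8_t vmovn_s16_ref(int16x8_t a) {
  return __builtin_convertvector(a, int8x8_t);  // lanewise trunc i16 -> i8
}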
-// CHECK-LABEL: @test_vmov_n_u8(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i8> poison, i8 %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 %a, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 %a, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 %a, i32 3
-// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 %a, i32 4
-// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 %a, i32 5
-// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 %a, i32 6
-// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 %a, i32 7
-// CHECK: ret <8 x i8> [[VECINIT7_I]]
+// CHECK-LABEL: define <8 x i8> @test_vmov_n_u8(
+// CHECK-SAME: i8 noundef zeroext [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i8> poison, i8 [[A]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 [[A]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 [[A]], i32 2
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 [[A]], i32 3
+// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 [[A]], i32 4
+// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 [[A]], i32 5
+// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 [[A]], i32 6
+// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 [[A]], i32 7
+// CHECK-NEXT: ret <8 x i8> [[VECINIT7_I]]
+//
uint8x8_t test_vmov_n_u8(uint8_t a) {
return vmov_n_u8(a);
}
-// CHECK-LABEL: @test_vmov_n_u16(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %a, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %a, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %a, i32 3
-// CHECK: ret <4 x i16> [[VECINIT3_I]]
+// CHECK-LABEL: define <4 x i16> @test_vmov_n_u16(
+// CHECK-SAME: i16 noundef zeroext [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[A]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[A]], i32 2
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[A]], i32 3
+// CHECK-NEXT: ret <4 x i16> [[VECINIT3_I]]
+//
uint16x4_t test_vmov_n_u16(uint16_t a) {
return vmov_n_u16(a);
}
-// CHECK-LABEL: @test_vmov_n_u32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %a, i32 1
-// CHECK: ret <2 x i32> [[VECINIT1_I]]
+// CHECK-LABEL: define <2 x i32> @test_vmov_n_u32(
+// CHECK-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[A]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[A]], i32 1
+// CHECK-NEXT: ret <2 x i32> [[VECINIT1_I]]
+//
uint32x2_t test_vmov_n_u32(uint32_t a) {
return vmov_n_u32(a);
}
-// CHECK-LABEL: @test_vmov_n_s8(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i8> poison, i8 %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 %a, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 %a, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 %a, i32 3
-// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 %a, i32 4
-// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 %a, i32 5
-// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 %a, i32 6
-// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 %a, i32 7
-// CHECK: ret <8 x i8> [[VECINIT7_I]]
+// CHECK-LABEL: define <8 x i8> @test_vmov_n_s8(
+// CHECK-SAME: i8 noundef signext [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i8> poison, i8 [[A]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 [[A]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 [[A]], i32 2
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 [[A]], i32 3
+// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 [[A]], i32 4
+// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 [[A]], i32 5
+// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 [[A]], i32 6
+// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 [[A]], i32 7
+// CHECK-NEXT: ret <8 x i8> [[VECINIT7_I]]
+//
int8x8_t test_vmov_n_s8(int8_t a) {
return vmov_n_s8(a);
}
-// CHECK-LABEL: @test_vmov_n_s16(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %a, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %a, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %a, i32 3
-// CHECK: ret <4 x i16> [[VECINIT3_I]]
+// CHECK-LABEL: define <4 x i16> @test_vmov_n_s16(
+// CHECK-SAME: i16 noundef signext [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[A]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[A]], i32 2
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[A]], i32 3
+// CHECK-NEXT: ret <4 x i16> [[VECINIT3_I]]
+//
int16x4_t test_vmov_n_s16(int16_t a) {
return vmov_n_s16(a);
}
-// CHECK-LABEL: @test_vmov_n_s32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %a, i32 1
-// CHECK: ret <2 x i32> [[VECINIT1_I]]
+// CHECK-LABEL: define <2 x i32> @test_vmov_n_s32(
+// CHECK-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[A]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[A]], i32 1
+// CHECK-NEXT: ret <2 x i32> [[VECINIT1_I]]
+//
int32x2_t test_vmov_n_s32(int32_t a) {
return vmov_n_s32(a);
}
-// CHECK-LABEL: @test_vmov_n_p8(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i8> poison, i8 %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 %a, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 %a, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 %a, i32 3
-// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 %a, i32 4
-// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 %a, i32 5
-// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 %a, i32 6
-// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 %a, i32 7
-// CHECK: ret <8 x i8> [[VECINIT7_I]]
+// CHECK-LABEL: define <8 x i8> @test_vmov_n_p8(
+// CHECK-SAME: i8 noundef signext [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i8> poison, i8 [[A]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 [[A]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 [[A]], i32 2
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 [[A]], i32 3
+// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 [[A]], i32 4
+// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 [[A]], i32 5
+// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 [[A]], i32 6
+// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 [[A]], i32 7
+// CHECK-NEXT: ret <8 x i8> [[VECINIT7_I]]
+//
poly8x8_t test_vmov_n_p8(poly8_t a) {
return vmov_n_p8(a);
}
-// CHECK-LABEL: @test_vmov_n_p16(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %a, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %a, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %a, i32 3
-// CHECK: ret <4 x i16> [[VECINIT3_I]]
+// CHECK-LABEL: define <4 x i16> @test_vmov_n_p16(
+// CHECK-SAME: i16 noundef signext [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[A]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[A]], i32 2
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[A]], i32 3
+// CHECK-NEXT: ret <4 x i16> [[VECINIT3_I]]
+//
poly16x4_t test_vmov_n_p16(poly16_t a) {
return vmov_n_p16(a);
}
-// CHECK-LABEL: @test_vmov_n_f16(
-// CHECK: [[TMP0:%.*]] = load half, ptr %a, align 2
-// CHECK: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[TMP0]], i32 0
-// CHECK: [[VECINIT1:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP0]], i32 1
-// CHECK: [[VECINIT2:%.*]] = insertelement <4 x half> [[VECINIT1]], half [[TMP0]], i32 2
-// CHECK: [[VECINIT3:%.*]] = insertelement <4 x half> [[VECINIT2]], half [[TMP0]], i32 3
-// CHECK: ret <4 x half> [[VECINIT3]]
+// CHECK-LABEL: define <4 x half> @test_vmov_n_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load half, ptr [[A]], align 2
+// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[TMP0]], i32 0
+// CHECK-NEXT: [[VECINIT1:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP0]], i32 1
+// CHECK-NEXT: [[VECINIT2:%.*]] = insertelement <4 x half> [[VECINIT1]], half [[TMP0]], i32 2
+// CHECK-NEXT: [[VECINIT3:%.*]] = insertelement <4 x half> [[VECINIT2]], half [[TMP0]], i32 3
+// CHECK-NEXT: ret <4 x half> [[VECINIT3]]
+//
float16x4_t test_vmov_n_f16(float16_t *a) {
return vmov_n_f16(*a);
}
-// CHECK-LABEL: @test_vmov_n_f32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x float> poison, float %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %a, i32 1
-// CHECK: ret <2 x float> [[VECINIT1_I]]
+// CHECK-LABEL: define <2 x float> @test_vmov_n_f32(
+// CHECK-SAME: float noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x float> poison, float [[A]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float [[A]], i32 1
+// CHECK-NEXT: ret <2 x float> [[VECINIT1_I]]
+//
float32x2_t test_vmov_n_f32(float32_t a) {
return vmov_n_f32(a);
}
-// CHECK-LABEL: @test_vmovq_n_u8(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <16 x i8> poison, i8 %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 %a, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 %a, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 %a, i32 3
-// CHECK: [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 %a, i32 4
-// CHECK: [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 %a, i32 5
-// CHECK: [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 %a, i32 6
-// CHECK: [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 %a, i32 7
-// CHECK: [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 %a, i32 8
-// CHECK: [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 %a, i32 9
-// CHECK: [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 %a, i32 10
-// CHECK: [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 %a, i32 11
-// CHECK: [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 %a, i32 12
-// CHECK: [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 %a, i32 13
-// CHECK: [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 %a, i32 14
-// CHECK: [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 %a, i32 15
-// CHECK: ret <16 x i8> [[VECINIT15_I]]
+// CHECK-LABEL: define <16 x i8> @test_vmovq_n_u8(
+// CHECK-SAME: i8 noundef zeroext [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i8> poison, i8 [[A]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 [[A]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 [[A]], i32 2
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 [[A]], i32 3
+// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 [[A]], i32 4
+// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 [[A]], i32 5
+// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 [[A]], i32 6
+// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 [[A]], i32 7
+// CHECK-NEXT: [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 [[A]], i32 8
+// CHECK-NEXT: [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 [[A]], i32 9
+// CHECK-NEXT: [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 [[A]], i32 10
+// CHECK-NEXT: [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 [[A]], i32 11
+// CHECK-NEXT: [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 [[A]], i32 12
+// CHECK-NEXT: [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 [[A]], i32 13
+// CHECK-NEXT: [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 [[A]], i32 14
+// CHECK-NEXT: [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 [[A]], i32 15
+// CHECK-NEXT: ret <16 x i8> [[VECINIT15_I]]
+//
uint8x16_t test_vmovq_n_u8(uint8_t a) {
return vmovq_n_u8(a);
}
-// CHECK-LABEL: @test_vmovq_n_u16(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %a, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %a, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %a, i32 3
-// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %a, i32 4
-// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %a, i32 5
-// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %a, i32 6
-// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %a, i32 7
-// CHECK: ret <8 x i16> [[VECINIT7_I]]
+// CHECK-LABEL: define <8 x i16> @test_vmovq_n_u16(
+// CHECK-SAME: i16 noundef zeroext [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[A]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[A]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[A]], i32 2
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[A]], i32 3
+// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[A]], i32 4
+// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[A]], i32 5
+// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[A]], i32 6
+// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[A]], i32 7
+// CHECK-NEXT: ret <8 x i16> [[VECINIT7_I]]
+//
uint16x8_t test_vmovq_n_u16(uint16_t a) {
return vmovq_n_u16(a);
}
-// CHECK-LABEL: @test_vmovq_n_u32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %a, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %a, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %a, i32 3
-// CHECK: ret <4 x i32> [[VECINIT3_I]]
+// CHECK-LABEL: define <4 x i32> @test_vmovq_n_u32(
+// CHECK-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[A]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[A]], i32 2
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[A]], i32 3
+// CHECK-NEXT: ret <4 x i32> [[VECINIT3_I]]
+//
uint32x4_t test_vmovq_n_u32(uint32_t a) {
return vmovq_n_u32(a);
}
-// CHECK-LABEL: @test_vmovq_n_s8(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <16 x i8> poison, i8 %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 %a, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 %a, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 %a, i32 3
-// CHECK: [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 %a, i32 4
-// CHECK: [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 %a, i32 5
-// CHECK: [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 %a, i32 6
-// CHECK: [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 %a, i32 7
-// CHECK: [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 %a, i32 8
-// CHECK: [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 %a, i32 9
-// CHECK: [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 %a, i32 10
-// CHECK: [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 %a, i32 11
-// CHECK: [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 %a, i32 12
-// CHECK: [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 %a, i32 13
-// CHECK: [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 %a, i32 14
-// CHECK: [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 %a, i32 15
-// CHECK: ret <16 x i8> [[VECINIT15_I]]
+// CHECK-LABEL: define <16 x i8> @test_vmovq_n_s8(
+// CHECK-SAME: i8 noundef signext [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i8> poison, i8 [[A]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 [[A]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 [[A]], i32 2
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 [[A]], i32 3
+// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 [[A]], i32 4
+// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 [[A]], i32 5
+// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 [[A]], i32 6
+// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 [[A]], i32 7
+// CHECK-NEXT: [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 [[A]], i32 8
+// CHECK-NEXT: [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 [[A]], i32 9
+// CHECK-NEXT: [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 [[A]], i32 10
+// CHECK-NEXT: [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 [[A]], i32 11
+// CHECK-NEXT: [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 [[A]], i32 12
+// CHECK-NEXT: [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 [[A]], i32 13
+// CHECK-NEXT: [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 [[A]], i32 14
+// CHECK-NEXT: [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 [[A]], i32 15
+// CHECK-NEXT: ret <16 x i8> [[VECINIT15_I]]
+//
int8x16_t test_vmovq_n_s8(int8_t a) {
return vmovq_n_s8(a);
}
-// CHECK-LABEL: @test_vmovq_n_s16(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %a, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %a, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %a, i32 3
-// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %a, i32 4
-// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %a, i32 5
-// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %a, i32 6
-// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %a, i32 7
-// CHECK: ret <8 x i16> [[VECINIT7_I]]
+// CHECK-LABEL: define <8 x i16> @test_vmovq_n_s16(
+// CHECK-SAME: i16 noundef signext [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[A]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[A]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[A]], i32 2
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[A]], i32 3
+// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[A]], i32 4
+// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[A]], i32 5
+// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[A]], i32 6
+// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[A]], i32 7
+// CHECK-NEXT: ret <8 x i16> [[VECINIT7_I]]
+//
int16x8_t test_vmovq_n_s16(int16_t a) {
return vmovq_n_s16(a);
}
-// CHECK-LABEL: @test_vmovq_n_s32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %a, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %a, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %a, i32 3
-// CHECK: ret <4 x i32> [[VECINIT3_I]]
+// CHECK-LABEL: define <4 x i32> @test_vmovq_n_s32(
+// CHECK-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[A]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[A]], i32 2
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[A]], i32 3
+// CHECK-NEXT: ret <4 x i32> [[VECINIT3_I]]
+//
int32x4_t test_vmovq_n_s32(int32_t a) {
return vmovq_n_s32(a);
}
-// CHECK-LABEL: @test_vmovq_n_p8(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <16 x i8> poison, i8 %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 %a, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 %a, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 %a, i32 3
-// CHECK: [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 %a, i32 4
-// CHECK: [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 %a, i32 5
-// CHECK: [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 %a, i32 6
-// CHECK: [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 %a, i32 7
-// CHECK: [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 %a, i32 8
-// CHECK: [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 %a, i32 9
-// CHECK: [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 %a, i32 10
-// CHECK: [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 %a, i32 11
-// CHECK: [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 %a, i32 12
-// CHECK: [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 %a, i32 13
-// CHECK: [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 %a, i32 14
-// CHECK: [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 %a, i32 15
-// CHECK: ret <16 x i8> [[VECINIT15_I]]
+// CHECK-LABEL: define <16 x i8> @test_vmovq_n_p8(
+// CHECK-SAME: i8 noundef signext [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i8> poison, i8 [[A]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 [[A]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 [[A]], i32 2
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 [[A]], i32 3
+// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 [[A]], i32 4
+// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 [[A]], i32 5
+// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 [[A]], i32 6
+// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 [[A]], i32 7
+// CHECK-NEXT: [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 [[A]], i32 8
+// CHECK-NEXT: [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 [[A]], i32 9
+// CHECK-NEXT: [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 [[A]], i32 10
+// CHECK-NEXT: [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 [[A]], i32 11
+// CHECK-NEXT: [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 [[A]], i32 12
+// CHECK-NEXT: [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 [[A]], i32 13
+// CHECK-NEXT: [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 [[A]], i32 14
+// CHECK-NEXT: [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 [[A]], i32 15
+// CHECK-NEXT: ret <16 x i8> [[VECINIT15_I]]
+//
poly8x16_t test_vmovq_n_p8(poly8_t a) {
return vmovq_n_p8(a);
}
-// CHECK-LABEL: @test_vmovq_n_p16(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %a, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %a, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %a, i32 3
-// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %a, i32 4
-// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %a, i32 5
-// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %a, i32 6
-// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %a, i32 7
-// CHECK: ret <8 x i16> [[VECINIT7_I]]
+// CHECK-LABEL: define <8 x i16> @test_vmovq_n_p16(
+// CHECK-SAME: i16 noundef signext [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[A]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[A]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[A]], i32 2
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[A]], i32 3
+// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[A]], i32 4
+// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[A]], i32 5
+// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[A]], i32 6
+// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[A]], i32 7
+// CHECK-NEXT: ret <8 x i16> [[VECINIT7_I]]
+//
poly16x8_t test_vmovq_n_p16(poly16_t a) {
return vmovq_n_p16(a);
}
-// CHECK-LABEL: @test_vmovq_n_f16(
-// CHECK: [[TMP0:%.*]] = load half, ptr %a, align 2
-// CHECK: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[TMP0]], i32 0
-// CHECK: [[VECINIT1:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP0]], i32 1
-// CHECK: [[VECINIT2:%.*]] = insertelement <8 x half> [[VECINIT1]], half [[TMP0]], i32 2
-// CHECK: [[VECINIT3:%.*]] = insertelement <8 x half> [[VECINIT2]], half [[TMP0]], i32 3
-// CHECK: [[VECINIT4:%.*]] = insertelement <8 x half> [[VECINIT3]], half [[TMP0]], i32 4
-// CHECK: [[VECINIT5:%.*]] = insertelement <8 x half> [[VECINIT4]], half [[TMP0]], i32 5
-// CHECK: [[VECINIT6:%.*]] = insertelement <8 x half> [[VECINIT5]], half [[TMP0]], i32 6
-// CHECK: [[VECINIT7:%.*]] = insertelement <8 x half> [[VECINIT6]], half [[TMP0]], i32 7
-// CHECK: ret <8 x half> [[VECINIT7]]
+// CHECK-LABEL: define <8 x half> @test_vmovq_n_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load half, ptr [[A]], align 2
+// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[TMP0]], i32 0
+// CHECK-NEXT: [[VECINIT1:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP0]], i32 1
+// CHECK-NEXT: [[VECINIT2:%.*]] = insertelement <8 x half> [[VECINIT1]], half [[TMP0]], i32 2
+// CHECK-NEXT: [[VECINIT3:%.*]] = insertelement <8 x half> [[VECINIT2]], half [[TMP0]], i32 3
+// CHECK-NEXT: [[VECINIT4:%.*]] = insertelement <8 x half> [[VECINIT3]], half [[TMP0]], i32 4
+// CHECK-NEXT: [[VECINIT5:%.*]] = insertelement <8 x half> [[VECINIT4]], half [[TMP0]], i32 5
+// CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <8 x half> [[VECINIT5]], half [[TMP0]], i32 6
+// CHECK-NEXT: [[VECINIT7:%.*]] = insertelement <8 x half> [[VECINIT6]], half [[TMP0]], i32 7
+// CHECK-NEXT: ret <8 x half> [[VECINIT7]]
+//
float16x8_t test_vmovq_n_f16(float16_t *a) {
return vmovq_n_f16(*a);
}
-// CHECK-LABEL: @test_vmovq_n_f32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %a, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %a, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %a, i32 3
-// CHECK: ret <4 x float> [[VECINIT3_I]]
+// CHECK-LABEL: define <4 x float> @test_vmovq_n_f32(
+// CHECK-SAME: float noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float [[A]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float [[A]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float [[A]], i32 2
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float [[A]], i32 3
+// CHECK-NEXT: ret <4 x float> [[VECINIT3_I]]
+//
float32x4_t test_vmovq_n_f32(float32_t a) {
return vmovq_n_f32(a);
}
-// CHECK-LABEL: @test_vmov_n_s64(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <1 x i64> poison, i64 %a, i32 0
-// CHECK: [[ADD_I:%.*]] = add <1 x i64> [[VECINIT_I]], [[VECINIT_I]]
-// CHECK: ret <1 x i64> [[ADD_I]]
+// CHECK-LABEL: define <1 x i64> @test_vmov_n_s64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <1 x i64> poison, i64 [[A]], i32 0
+// CHECK-NEXT: [[ADD_I:%.*]] = add <1 x i64> [[VECINIT_I]], [[VECINIT_I]]
+// CHECK-NEXT: ret <1 x i64> [[ADD_I]]
+//
int64x1_t test_vmov_n_s64(int64_t a) {
int64x1_t tmp = vmov_n_s64(a);
return vadd_s64(tmp, tmp);
}
-// CHECK-LABEL: @test_vmov_n_u64(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <1 x i64> poison, i64 %a, i32 0
-// CHECK: [[ADD_I:%.*]] = add <1 x i64> [[VECINIT_I]], [[VECINIT_I]]
-// CHECK: ret <1 x i64> [[ADD_I]]
+// CHECK-LABEL: define <1 x i64> @test_vmov_n_u64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <1 x i64> poison, i64 [[A]], i32 0
+// CHECK-NEXT: [[ADD_I:%.*]] = add <1 x i64> [[VECINIT_I]], [[VECINIT_I]]
+// CHECK-NEXT: ret <1 x i64> [[ADD_I]]
+//
uint64x1_t test_vmov_n_u64(uint64_t a) {
uint64x1_t tmp = vmov_n_u64(a);
return vadd_u64(tmp, tmp);
}
-// CHECK-LABEL: @test_vmovq_n_s64(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i64> poison, i64 %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 %a, i32 1
-// CHECK: ret <2 x i64> [[VECINIT1_I]]
+// CHECK-LABEL: define <2 x i64> @test_vmovq_n_s64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i64> poison, i64 [[A]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 [[A]], i32 1
+// CHECK-NEXT: ret <2 x i64> [[VECINIT1_I]]
+//
int64x2_t test_vmovq_n_s64(int64_t a) {
return vmovq_n_s64(a);
}
-// CHECK-LABEL: @test_vmovq_n_u64(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i64> poison, i64 %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 %a, i32 1
-// CHECK: ret <2 x i64> [[VECINIT1_I]]
+// CHECK-LABEL: define <2 x i64> @test_vmovq_n_u64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i64> poison, i64 [[A]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 [[A]], i32 1
+// CHECK-NEXT: ret <2 x i64> [[VECINIT1_I]]
+//
uint64x2_t test_vmovq_n_u64(uint64_t a) {
return vmovq_n_u64(a);
}
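// --- Editor's note: illustrative sketch, not part of the generated patch.
// All vmov_n_*/vmovq_n_* checks broadcast one scalar into every lane,
// hence the insertelement chains at indices 0..lanes-1. ACLE defines
// vmov_n_* identically to vdup_n_*, so a reference helper is one call:
static inline int32x4_t vmovq_n_s32_ref(int32_t a) {
  return vdupq_n_s32(a);  // the same splat the insertelement chain encodes
}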
-// CHECK-LABEL: @test_vmul_s8(
-// CHECK: [[MUL_I:%.*]] = mul <8 x i8> %a, %b
-// CHECK: ret <8 x i8> [[MUL_I]]
+// CHECK-LABEL: define <8 x i8> @test_vmul_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i8> [[A]], [[B]]
+// CHECK-NEXT: ret <8 x i8> [[MUL_I]]
+//
int8x8_t test_vmul_s8(int8x8_t a, int8x8_t b) {
return vmul_s8(a, b);
}
-// CHECK-LABEL: @test_vmul_s16(
-// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %a, %b
-// CHECK: ret <4 x i16> [[MUL_I]]
+// CHECK-LABEL: define <4 x i16> @test_vmul_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i16> [[A]], [[B]]
+// CHECK-NEXT: ret <4 x i16> [[MUL_I]]
+//
int16x4_t test_vmul_s16(int16x4_t a, int16x4_t b) {
return vmul_s16(a, b);
}
-// CHECK-LABEL: @test_vmul_s32(
-// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %a, %b
-// CHECK: ret <2 x i32> [[MUL_I]]
+// CHECK-LABEL: define <2 x i32> @test_vmul_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <2 x i32> [[A]], [[B]]
+// CHECK-NEXT: ret <2 x i32> [[MUL_I]]
+//
int32x2_t test_vmul_s32(int32x2_t a, int32x2_t b) {
return vmul_s32(a, b);
}
-// CHECK-LABEL: @test_vmul_f32(
-// CHECK: [[MUL_I:%.*]] = fmul <2 x float> %a, %b
-// CHECK: ret <2 x float> [[MUL_I]]
+// CHECK-LABEL: define <2 x float> @test_vmul_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = fmul <2 x float> [[A]], [[B]]
+// CHECK-NEXT: ret <2 x float> [[MUL_I]]
+//
float32x2_t test_vmul_f32(float32x2_t a, float32x2_t b) {
return vmul_f32(a, b);
}
-// CHECK-LABEL: @test_vmul_u8(
-// CHECK: [[MUL_I:%.*]] = mul <8 x i8> %a, %b
-// CHECK: ret <8 x i8> [[MUL_I]]
+// CHECK-LABEL: define <8 x i8> @test_vmul_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i8> [[A]], [[B]]
+// CHECK-NEXT: ret <8 x i8> [[MUL_I]]
+//
uint8x8_t test_vmul_u8(uint8x8_t a, uint8x8_t b) {
return vmul_u8(a, b);
}
-// CHECK-LABEL: @test_vmul_u16(
-// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %a, %b
-// CHECK: ret <4 x i16> [[MUL_I]]
+// CHECK-LABEL: define <4 x i16> @test_vmul_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i16> [[A]], [[B]]
+// CHECK-NEXT: ret <4 x i16> [[MUL_I]]
+//
uint16x4_t test_vmul_u16(uint16x4_t a, uint16x4_t b) {
return vmul_u16(a, b);
}
-// CHECK-LABEL: @test_vmul_u32(
-// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %a, %b
-// CHECK: ret <2 x i32> [[MUL_I]]
+// CHECK-LABEL: define <2 x i32> @test_vmul_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <2 x i32> [[A]], [[B]]
+// CHECK-NEXT: ret <2 x i32> [[MUL_I]]
+//
uint32x2_t test_vmul_u32(uint32x2_t a, uint32x2_t b) {
return vmul_u32(a, b);
}
-// CHECK-LABEL: @test_vmulq_s8(
-// CHECK: [[MUL_I:%.*]] = mul <16 x i8> %a, %b
-// CHECK: ret <16 x i8> [[MUL_I]]
+// CHECK-LABEL: define <16 x i8> @test_vmulq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <16 x i8> [[A]], [[B]]
+// CHECK-NEXT: ret <16 x i8> [[MUL_I]]
+//
int8x16_t test_vmulq_s8(int8x16_t a, int8x16_t b) {
return vmulq_s8(a, b);
}
-// CHECK-LABEL: @test_vmulq_s16(
-// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %a, %b
-// CHECK: ret <8 x i16> [[MUL_I]]
+// CHECK-LABEL: define <8 x i16> @test_vmulq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i16> [[A]], [[B]]
+// CHECK-NEXT: ret <8 x i16> [[MUL_I]]
+//
int16x8_t test_vmulq_s16(int16x8_t a, int16x8_t b) {
return vmulq_s16(a, b);
}
-// CHECK-LABEL: @test_vmulq_s32(
-// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %a, %b
-// CHECK: ret <4 x i32> [[MUL_I]]
+// CHECK-LABEL: define <4 x i32> @test_vmulq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i32> [[A]], [[B]]
+// CHECK-NEXT: ret <4 x i32> [[MUL_I]]
+//
int32x4_t test_vmulq_s32(int32x4_t a, int32x4_t b) {
return vmulq_s32(a, b);
}
-// CHECK-LABEL: @test_vmulq_f32(
-// CHECK: [[MUL_I:%.*]] = fmul <4 x float> %a, %b
-// CHECK: ret <4 x float> [[MUL_I]]
+// CHECK-LABEL: define <4 x float> @test_vmulq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = fmul <4 x float> [[A]], [[B]]
+// CHECK-NEXT: ret <4 x float> [[MUL_I]]
+//
float32x4_t test_vmulq_f32(float32x4_t a, float32x4_t b) {
return vmulq_f32(a, b);
}
-// CHECK-LABEL: @test_vmulq_u8(
-// CHECK: [[MUL_I:%.*]] = mul <16 x i8> %a, %b
-// CHECK: ret <16 x i8> [[MUL_I]]
+// CHECK-LABEL: define <16 x i8> @test_vmulq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <16 x i8> [[A]], [[B]]
+// CHECK-NEXT: ret <16 x i8> [[MUL_I]]
+//
uint8x16_t test_vmulq_u8(uint8x16_t a, uint8x16_t b) {
return vmulq_u8(a, b);
}
-// CHECK-LABEL: @test_vmulq_u16(
-// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %a, %b
-// CHECK: ret <8 x i16> [[MUL_I]]
+// CHECK-LABEL: define <8 x i16> @test_vmulq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i16> [[A]], [[B]]
+// CHECK-NEXT: ret <8 x i16> [[MUL_I]]
+//
uint16x8_t test_vmulq_u16(uint16x8_t a, uint16x8_t b) {
return vmulq_u16(a, b);
}
-// CHECK-LABEL: @test_vmulq_u32(
-// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %a, %b
-// CHECK: ret <4 x i32> [[MUL_I]]
+// CHECK-LABEL: define <4 x i32> @test_vmulq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i32> [[A]], [[B]]
+// CHECK-NEXT: ret <4 x i32> [[MUL_I]]
+//
uint32x4_t test_vmulq_u32(uint32x4_t a, uint32x4_t b) {
return vmulq_u32(a, b);
}
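// --- Editor's note: illustrative sketch, not part of the generated patch.
// vmul_*/vmulq_* lower to a plain lanewise mul/fmul with no widening (the
// widening form is vmull_* below). Since clang exposes NEON vectors as GNU
// vector types, a hypothetical reference is just the infix operator:
static inline uint32x2_t vmul_u32_ref(uint32x2_t a, uint32x2_t b) {
  return a * b;  // element-wise, wraps modulo 2^32 like the IR mul
}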
-// CHECK-LABEL: @test_vmull_s8(
-// CHECK: [[VMULL_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i16> [[VMULL_I]]
+// CHECK-LABEL: define <8 x i16> @test_vmull_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMULL_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i16> [[VMULL_I]]
+//
int16x8_t test_vmull_s8(int8x8_t a, int8x8_t b) {
return vmull_s8(a, b);
}
-// CHECK-LABEL: @test_vmull_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %a, <4 x i16> %b)
-// CHECK: ret <4 x i32> [[VMULL2_I]]
+// CHECK-LABEL: define <4 x i32> @test_vmull_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
+//
int32x4_t test_vmull_s16(int16x4_t a, int16x4_t b) {
return vmull_s16(a, b);
}
-// CHECK-LABEL: @test_vmull_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %a, <2 x i32> %b)
-// CHECK: ret <2 x i64> [[VMULL2_I]]
+// CHECK-LABEL: define <2 x i64> @test_vmull_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
+//
int64x2_t test_vmull_s32(int32x2_t a, int32x2_t b) {
return vmull_s32(a, b);
}
-// CHECK-LABEL: @test_vmull_u8(
-// CHECK: [[VMULL_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i16> [[VMULL_I]]
+// CHECK-LABEL: define <8 x i16> @test_vmull_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMULL_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i16> [[VMULL_I]]
+//
uint16x8_t test_vmull_u8(uint8x8_t a, uint8x8_t b) {
return vmull_u8(a, b);
}
-// CHECK-LABEL: @test_vmull_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %a, <4 x i16> %b)
-// CHECK: ret <4 x i32> [[VMULL2_I]]
+// CHECK-LABEL: define <4 x i32> @test_vmull_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
+//
uint32x4_t test_vmull_u16(uint16x4_t a, uint16x4_t b) {
return vmull_u16(a, b);
}
-// CHECK-LABEL: @test_vmull_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %a, <2 x i32> %b)
-// CHECK: ret <2 x i64> [[VMULL2_I]]
+// CHECK-LABEL: define <2 x i64> @test_vmull_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
+//
uint64x2_t test_vmull_u32(uint32x2_t a, uint32x2_t b) {
return vmull_u32(a, b);
}
-// CHECK-LABEL: @test_vmull_p8(
-// CHECK: [[VMULL_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmullp.v8i16(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i16> [[VMULL_I]]
+// CHECK-LABEL: define <8 x i16> @test_vmull_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMULL_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmullp.v8i16(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i16> [[VMULL_I]]
+//
poly16x8_t test_vmull_p8(poly8x8_t a, poly8x8_t b) {
return vmull_p8(a, b);
}
-// CHECK-LABEL: @test_vmull_lane_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
-// CHECK: ret <4 x i32> [[VMULL2_I]]
+// CHECK-LABEL: define <4 x i32> @test_vmull_lane_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
+// CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
+//
int32x4_t test_vmull_lane_s16(int16x4_t a, int16x4_t b) {
return vmull_lane_s16(a, b, 3);
}
-// CHECK-LABEL: @test_vmull_lane_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
-// CHECK: ret <2 x i64> [[VMULL2_I]]
+// CHECK-LABEL: define <2 x i64> @test_vmull_lane_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
+// CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
+//
int64x2_t test_vmull_lane_s32(int32x2_t a, int32x2_t b) {
return vmull_lane_s32(a, b, 1);
}
-// CHECK-LABEL: @test_vmull_lane_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
-// CHECK: ret <4 x i32> [[VMULL2_I]]
+// CHECK-LABEL: define <4 x i32> @test_vmull_lane_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
+// CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
+//
uint32x4_t test_vmull_lane_u16(uint16x4_t a, uint16x4_t b) {
return vmull_lane_u16(a, b, 3);
}
-// CHECK-LABEL: @test_vmull_lane_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
-// CHECK: ret <2 x i64> [[VMULL2_I]]
+// CHECK-LABEL: define <2 x i64> @test_vmull_lane_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
+// CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
+//
uint64x2_t test_vmull_lane_u32(uint32x2_t a, uint32x2_t b) {
return vmull_lane_u32(a, b, 1);
}
-// CHECK-LABEL: @test_vmull_n_s16(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 %b, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
-// CHECK: [[VMULL5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %a, <4 x i16> [[VECINIT3_I]])
-// CHECK: ret <4 x i32> [[VMULL5_I]]
+// CHECK-LABEL: define <4 x i32> @test_vmull_n_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], i16 noundef signext [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[A]], <4 x i16> [[VECINIT3_I]])
+// CHECK-NEXT: ret <4 x i32> [[VMULL2_I_I]]
+//
int32x4_t test_vmull_n_s16(int16x4_t a, int16_t b) {
return vmull_n_s16(a, b);
}
-// CHECK-LABEL: @test_vmull_n_s32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 %b, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
-// CHECK: [[VMULL3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %a, <2 x i32> [[VECINIT1_I]])
-// CHECK: ret <2 x i64> [[VMULL3_I]]
+// CHECK-LABEL: define <2 x i64> @test_vmull_n_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[A]], <2 x i32> [[VECINIT1_I]])
+// CHECK-NEXT: ret <2 x i64> [[VMULL2_I_I]]
+//
int64x2_t test_vmull_n_s32(int32x2_t a, int32_t b) {
return vmull_n_s32(a, b);
}
-// CHECK-LABEL: @test_vmull_n_u16(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 %b, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
-// CHECK: [[VMULL5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %a, <4 x i16> [[VECINIT3_I]])
-// CHECK: ret <4 x i32> [[VMULL5_I]]
+// CHECK-LABEL: define <4 x i32> @test_vmull_n_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], i16 noundef zeroext [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[A]], <4 x i16> [[VECINIT3_I]])
+// CHECK-NEXT: ret <4 x i32> [[VMULL2_I_I]]
+//
uint32x4_t test_vmull_n_u16(uint16x4_t a, uint16_t b) {
return vmull_n_u16(a, b);
}
-// CHECK-LABEL: @test_vmull_n_u32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 %b, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
-// CHECK: [[VMULL3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %a, <2 x i32> [[VECINIT1_I]])
-// CHECK: ret <2 x i64> [[VMULL3_I]]
+// CHECK-LABEL: define <2 x i64> @test_vmull_n_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[A]], <2 x i32> [[VECINIT1_I]])
+// CHECK-NEXT: ret <2 x i64> [[VMULL2_I_I]]
+//
uint64x2_t test_vmull_n_u32(uint32x2_t a, uint32_t b) {
return vmull_n_u32(a, b);
}
-// CHECK-LABEL: @test_vmul_p8(
-// CHECK: [[VMUL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vmulp.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VMUL_V_I]]
+// CHECK-LABEL: define <8 x i8> @test_vmul_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMUL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vmulp.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VMUL_V_I]]
+//
poly8x8_t test_vmul_p8(poly8x8_t a, poly8x8_t b) {
return vmul_p8(a, b);
}
-// CHECK-LABEL: @test_vmulq_p8(
-// CHECK: [[VMULQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vmulp.v16i8(<16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <16 x i8> [[VMULQ_V_I]]
+// CHECK-LABEL: define <16 x i8> @test_vmulq_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMULQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vmulp.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <16 x i8> [[VMULQ_V_I]]
+//
poly8x16_t test_vmulq_p8(poly8x16_t a, poly8x16_t b) {
return vmulq_p8(a, b);
}
-// CHECK-LABEL: @test_vmul_lane_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]]
-// CHECK: ret <4 x i16> [[MUL]]
+// CHECK-LABEL: define <4 x i16> @test_vmul_lane_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[A]], [[LANE]]
+// CHECK-NEXT: ret <4 x i16> [[MUL]]
+//
int16x4_t test_vmul_lane_s16(int16x4_t a, int16x4_t b) {
return vmul_lane_s16(a, b, 3);
}
-// CHECK-LABEL: @test_vmul_lane_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]]
-// CHECK: ret <2 x i32> [[MUL]]
+// CHECK-LABEL: define <2 x i32> @test_vmul_lane_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[A]], [[LANE]]
+// CHECK-NEXT: ret <2 x i32> [[MUL]]
+//
int32x2_t test_vmul_lane_s32(int32x2_t a, int32x2_t b) {
return vmul_lane_s32(a, b, 1);
}
-// CHECK-LABEL: @test_vmul_lane_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[B:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK: [[MUL:%.*]] = fmul <2 x float> [[A:%.*]], [[LANE]]
-// CHECK: ret <2 x float> [[MUL]]
+// CHECK-LABEL: define <2 x float> @test_vmul_lane_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[A]], [[LANE]]
+// CHECK-NEXT: ret <2 x float> [[MUL]]
+//
float32x2_t test_vmul_lane_f32(float32x2_t a, float32x2_t b) {
return vmul_lane_f32(a, b, 1);
}
-// CHECK-LABEL: @test_vmul_lane_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]]
-// CHECK: ret <4 x i16> [[MUL]]
+// CHECK-LABEL: define <4 x i16> @test_vmul_lane_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[A]], [[LANE]]
+// CHECK-NEXT: ret <4 x i16> [[MUL]]
+//
uint16x4_t test_vmul_lane_u16(uint16x4_t a, uint16x4_t b) {
return vmul_lane_u16(a, b, 3);
}
-// CHECK-LABEL: @test_vmul_lane_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]]
-// CHECK: ret <2 x i32> [[MUL]]
+// CHECK-LABEL: define <2 x i32> @test_vmul_lane_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[A]], [[LANE]]
+// CHECK-NEXT: ret <2 x i32> [[MUL]]
+//
uint32x2_t test_vmul_lane_u32(uint32x2_t a, uint32x2_t b) {
return vmul_lane_u32(a, b, 1);
}
-// CHECK-LABEL: @test_vmulq_lane_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-// CHECK: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]]
-// CHECK: ret <8 x i16> [[MUL]]
+// CHECK-LABEL: define <8 x i16> @test_vmulq_lane_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[A]], [[LANE]]
+// CHECK-NEXT: ret <8 x i16> [[MUL]]
+//
int16x8_t test_vmulq_lane_s16(int16x8_t a, int16x4_t b) {
return vmulq_lane_s16(a, b, 3);
}
-// CHECK-LABEL: @test_vmulq_lane_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-// CHECK: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]]
-// CHECK: ret <4 x i32> [[MUL]]
+// CHECK-LABEL: define <4 x i32> @test_vmulq_lane_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[A]], [[LANE]]
+// CHECK-NEXT: ret <4 x i32> [[MUL]]
+//
int32x4_t test_vmulq_lane_s32(int32x4_t a, int32x2_t b) {
return vmulq_lane_s32(a, b, 1);
}
-// CHECK-LABEL: @test_vmulq_lane_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[B:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-// CHECK: [[MUL:%.*]] = fmul <4 x float> [[A:%.*]], [[LANE]]
-// CHECK: ret <4 x float> [[MUL]]
+// CHECK-LABEL: define <4 x float> @test_vmulq_lane_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK-NEXT: [[MUL:%.*]] = fmul <4 x float> [[A]], [[LANE]]
+// CHECK-NEXT: ret <4 x float> [[MUL]]
+//
float32x4_t test_vmulq_lane_f32(float32x4_t a, float32x2_t b) {
return vmulq_lane_f32(a, b, 1);
}
-// CHECK-LABEL: @test_vmulq_lane_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-// CHECK: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]]
-// CHECK: ret <8 x i16> [[MUL]]
+// CHECK-LABEL: define <8 x i16> @test_vmulq_lane_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[A]], [[LANE]]
+// CHECK-NEXT: ret <8 x i16> [[MUL]]
+//
uint16x8_t test_vmulq_lane_u16(uint16x8_t a, uint16x4_t b) {
return vmulq_lane_u16(a, b, 3);
}
-// CHECK-LABEL: @test_vmulq_lane_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-// CHECK: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]]
-// CHECK: ret <4 x i32> [[MUL]]
+// CHECK-LABEL: define <4 x i32> @test_vmulq_lane_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[A]], [[LANE]]
+// CHECK-NEXT: ret <4 x i32> [[MUL]]
+//
uint32x4_t test_vmulq_lane_u32(uint32x4_t a, uint32x2_t b) {
return vmulq_lane_u32(a, b, 1);
}
-// CHECK-LABEL: @test_vmul_n_s16(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 %b, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
-// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %a, [[VECINIT3_I]]
-// CHECK: ret <4 x i16> [[MUL_I]]
+// CHECK-LABEL: define <4 x i16> @test_vmul_n_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], i16 noundef signext [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i16> [[A]], [[VECINIT3_I]]
+// CHECK-NEXT: ret <4 x i16> [[MUL_I]]
+//
int16x4_t test_vmul_n_s16(int16x4_t a, int16_t b) {
return vmul_n_s16(a, b);
}
-// CHECK-LABEL: @test_vmul_n_s32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 %b, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
-// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %a, [[VECINIT1_I]]
-// CHECK: ret <2 x i32> [[MUL_I]]
+// CHECK-LABEL: define <2 x i32> @test_vmul_n_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <2 x i32> [[A]], [[VECINIT1_I]]
+// CHECK-NEXT: ret <2 x i32> [[MUL_I]]
+//
int32x2_t test_vmul_n_s32(int32x2_t a, int32_t b) {
return vmul_n_s32(a, b);
}
-// CHECK-LABEL: @test_vmul_n_f32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x float> poison, float %b, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %b, i32 1
-// CHECK: [[MUL_I:%.*]] = fmul <2 x float> %a, [[VECINIT1_I]]
-// CHECK: ret <2 x float> [[MUL_I]]
+// CHECK-LABEL: define <2 x float> @test_vmul_n_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], float noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x float> poison, float [[B]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float [[B]], i32 1
+// CHECK-NEXT: [[MUL_I:%.*]] = fmul <2 x float> [[A]], [[VECINIT1_I]]
+// CHECK-NEXT: ret <2 x float> [[MUL_I]]
+//
float32x2_t test_vmul_n_f32(float32x2_t a, float32_t b) {
return vmul_n_f32(a, b);
}
-// CHECK-LABEL: @test_vmul_n_u16(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 %b, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
-// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %a, [[VECINIT3_I]]
-// CHECK: ret <4 x i16> [[MUL_I]]
+// CHECK-LABEL: define <4 x i16> @test_vmul_n_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], i16 noundef zeroext [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i16> [[A]], [[VECINIT3_I]]
+// CHECK-NEXT: ret <4 x i16> [[MUL_I]]
+//
uint16x4_t test_vmul_n_u16(uint16x4_t a, uint16_t b) {
return vmul_n_u16(a, b);
}
-// CHECK-LABEL: @test_vmul_n_u32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 %b, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
-// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %a, [[VECINIT1_I]]
-// CHECK: ret <2 x i32> [[MUL_I]]
+// CHECK-LABEL: define <2 x i32> @test_vmul_n_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <2 x i32> [[A]], [[VECINIT1_I]]
+// CHECK-NEXT: ret <2 x i32> [[MUL_I]]
+//
uint32x2_t test_vmul_n_u32(uint32x2_t a, uint32_t b) {
return vmul_n_u32(a, b);
}
-// CHECK-LABEL: @test_vmulq_n_s16(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 %b, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3
-// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4
-// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5
-// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6
-// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7
-// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %a, [[VECINIT7_I]]
-// CHECK: ret <8 x i16> [[MUL_I]]
+// CHECK-LABEL: define <8 x i16> @test_vmulq_n_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], i16 noundef signext [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[B]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[B]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
+// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[B]], i32 4
+// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[B]], i32 5
+// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[B]], i32 6
+// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[B]], i32 7
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i16> [[A]], [[VECINIT7_I]]
+// CHECK-NEXT: ret <8 x i16> [[MUL_I]]
+//
int16x8_t test_vmulq_n_s16(int16x8_t a, int16_t b) {
return vmulq_n_s16(a, b);
}
-// CHECK-LABEL: @test_vmulq_n_s32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 %b, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3
-// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %a, [[VECINIT3_I]]
-// CHECK: ret <4 x i32> [[MUL_I]]
+// CHECK-LABEL: define <4 x i32> @test_vmulq_n_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[B]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[B]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[B]], i32 2
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[B]], i32 3
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i32> [[A]], [[VECINIT3_I]]
+// CHECK-NEXT: ret <4 x i32> [[MUL_I]]
+//
int32x4_t test_vmulq_n_s32(int32x4_t a, int32_t b) {
return vmulq_n_s32(a, b);
}
-// CHECK-LABEL: @test_vmulq_n_f32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float %b, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %b, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %b, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %b, i32 3
-// CHECK: [[MUL_I:%.*]] = fmul <4 x float> %a, [[VECINIT3_I]]
-// CHECK: ret <4 x float> [[MUL_I]]
+// CHECK-LABEL: define <4 x float> @test_vmulq_n_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], float noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float [[B]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float [[B]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float [[B]], i32 2
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float [[B]], i32 3
+// CHECK-NEXT: [[MUL_I:%.*]] = fmul <4 x float> [[A]], [[VECINIT3_I]]
+// CHECK-NEXT: ret <4 x float> [[MUL_I]]
+//
float32x4_t test_vmulq_n_f32(float32x4_t a, float32_t b) {
return vmulq_n_f32(a, b);
}
-// CHECK-LABEL: @test_vmulq_n_u16(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 %b, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3
-// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4
-// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5
-// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6
-// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7
-// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %a, [[VECINIT7_I]]
-// CHECK: ret <8 x i16> [[MUL_I]]
+// CHECK-LABEL: define <8 x i16> @test_vmulq_n_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], i16 noundef zeroext [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[B]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[B]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
+// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[B]], i32 4
+// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[B]], i32 5
+// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[B]], i32 6
+// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[B]], i32 7
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i16> [[A]], [[VECINIT7_I]]
+// CHECK-NEXT: ret <8 x i16> [[MUL_I]]
+//
uint16x8_t test_vmulq_n_u16(uint16x8_t a, uint16_t b) {
return vmulq_n_u16(a, b);
}
-// CHECK-LABEL: @test_vmulq_n_u32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 %b, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3
-// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %a, [[VECINIT3_I]]
-// CHECK: ret <4 x i32> [[MUL_I]]
+// CHECK-LABEL: define <4 x i32> @test_vmulq_n_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[B]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[B]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[B]], i32 2
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[B]], i32 3
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i32> [[A]], [[VECINIT3_I]]
+// CHECK-NEXT: ret <4 x i32> [[MUL_I]]
+//
uint32x4_t test_vmulq_n_u32(uint32x4_t a, uint32_t b) {
return vmulq_n_u32(a, b);
}
-// CHECK-LABEL: @test_vmvn_s8(
-// CHECK: [[NEG_I:%.*]] = xor <8 x i8> %a, splat (i8 -1)
-// CHECK: ret <8 x i8> [[NEG_I]]
+// CHECK-LABEL: define <8 x i8> @test_vmvn_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <8 x i8> [[A]], splat (i8 -1)
+// CHECK-NEXT: ret <8 x i8> [[NOT_I]]
+//
int8x8_t test_vmvn_s8(int8x8_t a) {
return vmvn_s8(a);
}
-// CHECK-LABEL: @test_vmvn_s16(
-// CHECK: [[NEG_I:%.*]] = xor <4 x i16> %a, splat (i16 -1)
-// CHECK: ret <4 x i16> [[NEG_I]]
+// CHECK-LABEL: define <4 x i16> @test_vmvn_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <4 x i16> [[A]], splat (i16 -1)
+// CHECK-NEXT: ret <4 x i16> [[NOT_I]]
+//
int16x4_t test_vmvn_s16(int16x4_t a) {
return vmvn_s16(a);
}
-// CHECK-LABEL: @test_vmvn_s32(
-// CHECK: [[NEG_I:%.*]] = xor <2 x i32> %a, splat (i32 -1)
-// CHECK: ret <2 x i32> [[NEG_I]]
+// CHECK-LABEL: define <2 x i32> @test_vmvn_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <2 x i32> [[A]], splat (i32 -1)
+// CHECK-NEXT: ret <2 x i32> [[NOT_I]]
+//
int32x2_t test_vmvn_s32(int32x2_t a) {
return vmvn_s32(a);
}
-// CHECK-LABEL: @test_vmvn_u8(
-// CHECK: [[NEG_I:%.*]] = xor <8 x i8> %a, splat (i8 -1)
-// CHECK: ret <8 x i8> [[NEG_I]]
+// CHECK-LABEL: define <8 x i8> @test_vmvn_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <8 x i8> [[A]], splat (i8 -1)
+// CHECK-NEXT: ret <8 x i8> [[NOT_I]]
+//
uint8x8_t test_vmvn_u8(uint8x8_t a) {
return vmvn_u8(a);
}
-// CHECK-LABEL: @test_vmvn_u16(
-// CHECK: [[NEG_I:%.*]] = xor <4 x i16> %a, splat (i16 -1)
-// CHECK: ret <4 x i16> [[NEG_I]]
+// CHECK-LABEL: define <4 x i16> @test_vmvn_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <4 x i16> [[A]], splat (i16 -1)
+// CHECK-NEXT: ret <4 x i16> [[NOT_I]]
+//
uint16x4_t test_vmvn_u16(uint16x4_t a) {
return vmvn_u16(a);
}
-// CHECK-LABEL: @test_vmvn_u32(
-// CHECK: [[NEG_I:%.*]] = xor <2 x i32> %a, splat (i32 -1)
-// CHECK: ret <2 x i32> [[NEG_I]]
+// CHECK-LABEL: define <2 x i32> @test_vmvn_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <2 x i32> [[A]], splat (i32 -1)
+// CHECK-NEXT: ret <2 x i32> [[NOT_I]]
+//
uint32x2_t test_vmvn_u32(uint32x2_t a) {
return vmvn_u32(a);
}
-// CHECK-LABEL: @test_vmvn_p8(
-// CHECK: [[NEG_I:%.*]] = xor <8 x i8> %a, splat (i8 -1)
-// CHECK: ret <8 x i8> [[NEG_I]]
+// CHECK-LABEL: define <8 x i8> @test_vmvn_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <8 x i8> [[A]], splat (i8 -1)
+// CHECK-NEXT: ret <8 x i8> [[NOT_I]]
+//
poly8x8_t test_vmvn_p8(poly8x8_t a) {
return vmvn_p8(a);
}
-// CHECK-LABEL: @test_vmvnq_s8(
-// CHECK: [[NEG_I:%.*]] = xor <16 x i8> %a, splat (i8 -1)
-// CHECK: ret <16 x i8> [[NEG_I]]
+// CHECK-LABEL: define <16 x i8> @test_vmvnq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <16 x i8> [[A]], splat (i8 -1)
+// CHECK-NEXT: ret <16 x i8> [[NOT_I]]
+//
int8x16_t test_vmvnq_s8(int8x16_t a) {
return vmvnq_s8(a);
}
-// CHECK-LABEL: @test_vmvnq_s16(
-// CHECK: [[NEG_I:%.*]] = xor <8 x i16> %a, splat (i16 -1)
-// CHECK: ret <8 x i16> [[NEG_I]]
+// CHECK-LABEL: define <8 x i16> @test_vmvnq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <8 x i16> [[A]], splat (i16 -1)
+// CHECK-NEXT: ret <8 x i16> [[NOT_I]]
+//
int16x8_t test_vmvnq_s16(int16x8_t a) {
return vmvnq_s16(a);
}
-// CHECK-LABEL: @test_vmvnq_s32(
-// CHECK: [[NEG_I:%.*]] = xor <4 x i32> %a, splat (i32 -1)
-// CHECK: ret <4 x i32> [[NEG_I]]
+// CHECK-LABEL: define <4 x i32> @test_vmvnq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <4 x i32> [[A]], splat (i32 -1)
+// CHECK-NEXT: ret <4 x i32> [[NOT_I]]
+//
int32x4_t test_vmvnq_s32(int32x4_t a) {
return vmvnq_s32(a);
}
-// CHECK-LABEL: @test_vmvnq_u8(
-// CHECK: [[NEG_I:%.*]] = xor <16 x i8> %a, splat (i8 -1)
-// CHECK: ret <16 x i8> [[NEG_I]]
+// CHECK-LABEL: define <16 x i8> @test_vmvnq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <16 x i8> [[A]], splat (i8 -1)
+// CHECK-NEXT: ret <16 x i8> [[NOT_I]]
+//
uint8x16_t test_vmvnq_u8(uint8x16_t a) {
return vmvnq_u8(a);
}
-// CHECK-LABEL: @test_vmvnq_u16(
-// CHECK: [[NEG_I:%.*]] = xor <8 x i16> %a, splat (i16 -1)
-// CHECK: ret <8 x i16> [[NEG_I]]
+// CHECK-LABEL: define <8 x i16> @test_vmvnq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <8 x i16> [[A]], splat (i16 -1)
+// CHECK-NEXT: ret <8 x i16> [[NOT_I]]
+//
uint16x8_t test_vmvnq_u16(uint16x8_t a) {
return vmvnq_u16(a);
}
-// CHECK-LABEL: @test_vmvnq_u32(
-// CHECK: [[NEG_I:%.*]] = xor <4 x i32> %a, splat (i32 -1)
-// CHECK: ret <4 x i32> [[NEG_I]]
+// CHECK-LABEL: define <4 x i32> @test_vmvnq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <4 x i32> [[A]], splat (i32 -1)
+// CHECK-NEXT: ret <4 x i32> [[NOT_I]]
+//
uint32x4_t test_vmvnq_u32(uint32x4_t a) {
return vmvnq_u32(a);
}
-// CHECK-LABEL: @test_vmvnq_p8(
-// CHECK: [[NEG_I:%.*]] = xor <16 x i8> %a, splat (i8 -1)
-// CHECK: ret <16 x i8> [[NEG_I]]
+// CHECK-LABEL: define <16 x i8> @test_vmvnq_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <16 x i8> [[A]], splat (i8 -1)
+// CHECK-NEXT: ret <16 x i8> [[NOT_I]]
+//
poly8x16_t test_vmvnq_p8(poly8x16_t a) {
return vmvnq_p8(a);
}
-// CHECK-LABEL: @test_vneg_s8(
-// CHECK: [[SUB_I:%.*]] = sub <8 x i8> zeroinitializer, %a
-// CHECK: ret <8 x i8> [[SUB_I]]
+// CHECK-LABEL: define <8 x i8> @test_vneg_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i8> zeroinitializer, [[A]]
+// CHECK-NEXT: ret <8 x i8> [[SUB_I]]
+//
int8x8_t test_vneg_s8(int8x8_t a) {
return vneg_s8(a);
}
-// CHECK-LABEL: @test_vneg_s16(
-// CHECK: [[SUB_I:%.*]] = sub <4 x i16> zeroinitializer, %a
-// CHECK: ret <4 x i16> [[SUB_I]]
+// CHECK-LABEL: define <4 x i16> @test_vneg_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i16> zeroinitializer, [[A]]
+// CHECK-NEXT: ret <4 x i16> [[SUB_I]]
+//
int16x4_t test_vneg_s16(int16x4_t a) {
return vneg_s16(a);
}
-// CHECK-LABEL: @test_vneg_s32(
-// CHECK: [[SUB_I:%.*]] = sub <2 x i32> zeroinitializer, %a
-// CHECK: ret <2 x i32> [[SUB_I]]
+// CHECK-LABEL: define <2 x i32> @test_vneg_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i32> zeroinitializer, [[A]]
+// CHECK-NEXT: ret <2 x i32> [[SUB_I]]
+//
int32x2_t test_vneg_s32(int32x2_t a) {
return vneg_s32(a);
}
-// CHECK-LABEL: @test_vneg_f32(
-// CHECK: [[SUB_I:%.*]] = fneg <2 x float> %a
-// CHECK: ret <2 x float> [[SUB_I]]
+// CHECK-LABEL: define <2 x float> @test_vneg_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[FNEG_I:%.*]] = fneg <2 x float> [[A]]
+// CHECK-NEXT: ret <2 x float> [[FNEG_I]]
+//
float32x2_t test_vneg_f32(float32x2_t a) {
return vneg_f32(a);
}
-// CHECK-LABEL: @test_vnegq_s8(
-// CHECK: [[SUB_I:%.*]] = sub <16 x i8> zeroinitializer, %a
-// CHECK: ret <16 x i8> [[SUB_I]]
+// CHECK-LABEL: define <16 x i8> @test_vnegq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <16 x i8> zeroinitializer, [[A]]
+// CHECK-NEXT: ret <16 x i8> [[SUB_I]]
+//
int8x16_t test_vnegq_s8(int8x16_t a) {
return vnegq_s8(a);
}
-// CHECK-LABEL: @test_vnegq_s16(
-// CHECK: [[SUB_I:%.*]] = sub <8 x i16> zeroinitializer, %a
-// CHECK: ret <8 x i16> [[SUB_I]]
+// CHECK-LABEL: define <8 x i16> @test_vnegq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> zeroinitializer, [[A]]
+// CHECK-NEXT: ret <8 x i16> [[SUB_I]]
+//
int16x8_t test_vnegq_s16(int16x8_t a) {
return vnegq_s16(a);
}
-// CHECK-LABEL: @test_vnegq_s32(
-// CHECK: [[SUB_I:%.*]] = sub <4 x i32> zeroinitializer, %a
-// CHECK: ret <4 x i32> [[SUB_I]]
+// CHECK-LABEL: define <4 x i32> @test_vnegq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> zeroinitializer, [[A]]
+// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
+//
int32x4_t test_vnegq_s32(int32x4_t a) {
return vnegq_s32(a);
}
-// CHECK-LABEL: @test_vnegq_f32(
-// CHECK: [[SUB_I:%.*]] = fneg <4 x float> %a
-// CHECK: ret <4 x float> [[SUB_I]]
+// CHECK-LABEL: define <4 x float> @test_vnegq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[FNEG_I:%.*]] = fneg <4 x float> [[A]]
+// CHECK-NEXT: ret <4 x float> [[FNEG_I]]
+//
float32x4_t test_vnegq_f32(float32x4_t a) {
return vnegq_f32(a);
}
-// CHECK-LABEL: @test_vorn_s8(
-// CHECK: [[NEG_I:%.*]] = xor <8 x i8> %b, splat (i8 -1)
-// CHECK: [[OR_I:%.*]] = or <8 x i8> %a, [[NEG_I]]
-// CHECK: ret <8 x i8> [[OR_I]]
+// CHECK-LABEL: define <8 x i8> @test_vorn_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <8 x i8> [[B]], splat (i8 -1)
+// CHECK-NEXT: [[OR_I:%.*]] = or <8 x i8> [[A]], [[NOT_I]]
+// CHECK-NEXT: ret <8 x i8> [[OR_I]]
+//
int8x8_t test_vorn_s8(int8x8_t a, int8x8_t b) {
return vorn_s8(a, b);
}
-// CHECK-LABEL: @test_vorn_s16(
-// CHECK: [[NEG_I:%.*]] = xor <4 x i16> %b, splat (i16 -1)
-// CHECK: [[OR_I:%.*]] = or <4 x i16> %a, [[NEG_I]]
-// CHECK: ret <4 x i16> [[OR_I]]
+// CHECK-LABEL: define <4 x i16> @test_vorn_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <4 x i16> [[B]], splat (i16 -1)
+// CHECK-NEXT: [[OR_I:%.*]] = or <4 x i16> [[A]], [[NOT_I]]
+// CHECK-NEXT: ret <4 x i16> [[OR_I]]
+//
int16x4_t test_vorn_s16(int16x4_t a, int16x4_t b) {
return vorn_s16(a, b);
}
-// CHECK-LABEL: @test_vorn_s32(
-// CHECK: [[NEG_I:%.*]] = xor <2 x i32> %b, splat (i32 -1)
-// CHECK: [[OR_I:%.*]] = or <2 x i32> %a, [[NEG_I]]
-// CHECK: ret <2 x i32> [[OR_I]]
+// CHECK-LABEL: define <2 x i32> @test_vorn_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <2 x i32> [[B]], splat (i32 -1)
+// CHECK-NEXT: [[OR_I:%.*]] = or <2 x i32> [[A]], [[NOT_I]]
+// CHECK-NEXT: ret <2 x i32> [[OR_I]]
+//
int32x2_t test_vorn_s32(int32x2_t a, int32x2_t b) {
return vorn_s32(a, b);
}
-// CHECK-LABEL: @test_vorn_s64(
-// CHECK: [[NEG_I:%.*]] = xor <1 x i64> %b, splat (i64 -1)
-// CHECK: [[OR_I:%.*]] = or <1 x i64> %a, [[NEG_I]]
-// CHECK: ret <1 x i64> [[OR_I]]
+// CHECK-LABEL: define <1 x i64> @test_vorn_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <1 x i64> [[B]], splat (i64 -1)
+// CHECK-NEXT: [[OR_I:%.*]] = or <1 x i64> [[A]], [[NOT_I]]
+// CHECK-NEXT: ret <1 x i64> [[OR_I]]
+//
int64x1_t test_vorn_s64(int64x1_t a, int64x1_t b) {
return vorn_s64(a, b);
}
-// CHECK-LABEL: @test_vorn_u8(
-// CHECK: [[NEG_I:%.*]] = xor <8 x i8> %b, splat (i8 -1)
-// CHECK: [[OR_I:%.*]] = or <8 x i8> %a, [[NEG_I]]
-// CHECK: ret <8 x i8> [[OR_I]]
+// CHECK-LABEL: define <8 x i8> @test_vorn_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <8 x i8> [[B]], splat (i8 -1)
+// CHECK-NEXT: [[OR_I:%.*]] = or <8 x i8> [[A]], [[NOT_I]]
+// CHECK-NEXT: ret <8 x i8> [[OR_I]]
+//
uint8x8_t test_vorn_u8(uint8x8_t a, uint8x8_t b) {
return vorn_u8(a, b);
}
-// CHECK-LABEL: @test_vorn_u16(
-// CHECK: [[NEG_I:%.*]] = xor <4 x i16> %b, splat (i16 -1)
-// CHECK: [[OR_I:%.*]] = or <4 x i16> %a, [[NEG_I]]
-// CHECK: ret <4 x i16> [[OR_I]]
+// CHECK-LABEL: define <4 x i16> @test_vorn_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <4 x i16> [[B]], splat (i16 -1)
+// CHECK-NEXT: [[OR_I:%.*]] = or <4 x i16> [[A]], [[NOT_I]]
+// CHECK-NEXT: ret <4 x i16> [[OR_I]]
+//
uint16x4_t test_vorn_u16(uint16x4_t a, uint16x4_t b) {
return vorn_u16(a, b);
}
-// CHECK-LABEL: @test_vorn_u32(
-// CHECK: [[NEG_I:%.*]] = xor <2 x i32> %b, splat (i32 -1)
-// CHECK: [[OR_I:%.*]] = or <2 x i32> %a, [[NEG_I]]
-// CHECK: ret <2 x i32> [[OR_I]]
+// CHECK-LABEL: define <2 x i32> @test_vorn_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <2 x i32> [[B]], splat (i32 -1)
+// CHECK-NEXT: [[OR_I:%.*]] = or <2 x i32> [[A]], [[NOT_I]]
+// CHECK-NEXT: ret <2 x i32> [[OR_I]]
+//
uint32x2_t test_vorn_u32(uint32x2_t a, uint32x2_t b) {
return vorn_u32(a, b);
}
-// CHECK-LABEL: @test_vorn_u64(
-// CHECK: [[NEG_I:%.*]] = xor <1 x i64> %b, splat (i64 -1)
-// CHECK: [[OR_I:%.*]] = or <1 x i64> %a, [[NEG_I]]
-// CHECK: ret <1 x i64> [[OR_I]]
+// CHECK-LABEL: define <1 x i64> @test_vorn_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <1 x i64> [[B]], splat (i64 -1)
+// CHECK-NEXT: [[OR_I:%.*]] = or <1 x i64> [[A]], [[NOT_I]]
+// CHECK-NEXT: ret <1 x i64> [[OR_I]]
+//
uint64x1_t test_vorn_u64(uint64x1_t a, uint64x1_t b) {
return vorn_u64(a, b);
}
-// CHECK-LABEL: @test_vornq_s8(
-// CHECK: [[NEG_I:%.*]] = xor <16 x i8> %b, splat (i8 -1)
-// CHECK: [[OR_I:%.*]] = or <16 x i8> %a, [[NEG_I]]
-// CHECK: ret <16 x i8> [[OR_I]]
+// CHECK-LABEL: define <16 x i8> @test_vornq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <16 x i8> [[B]], splat (i8 -1)
+// CHECK-NEXT: [[OR_I:%.*]] = or <16 x i8> [[A]], [[NOT_I]]
+// CHECK-NEXT: ret <16 x i8> [[OR_I]]
+//
int8x16_t test_vornq_s8(int8x16_t a, int8x16_t b) {
return vornq_s8(a, b);
}
-// CHECK-LABEL: @test_vornq_s16(
-// CHECK: [[NEG_I:%.*]] = xor <8 x i16> %b, splat (i16 -1)
-// CHECK: [[OR_I:%.*]] = or <8 x i16> %a, [[NEG_I]]
-// CHECK: ret <8 x i16> [[OR_I]]
+// CHECK-LABEL: define <8 x i16> @test_vornq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <8 x i16> [[B]], splat (i16 -1)
+// CHECK-NEXT: [[OR_I:%.*]] = or <8 x i16> [[A]], [[NOT_I]]
+// CHECK-NEXT: ret <8 x i16> [[OR_I]]
+//
int16x8_t test_vornq_s16(int16x8_t a, int16x8_t b) {
return vornq_s16(a, b);
}
-// CHECK-LABEL: @test_vornq_s32(
-// CHECK: [[NEG_I:%.*]] = xor <4 x i32> %b, splat (i32 -1)
-// CHECK: [[OR_I:%.*]] = or <4 x i32> %a, [[NEG_I]]
-// CHECK: ret <4 x i32> [[OR_I]]
+// CHECK-LABEL: define <4 x i32> @test_vornq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <4 x i32> [[B]], splat (i32 -1)
+// CHECK-NEXT: [[OR_I:%.*]] = or <4 x i32> [[A]], [[NOT_I]]
+// CHECK-NEXT: ret <4 x i32> [[OR_I]]
+//
int32x4_t test_vornq_s32(int32x4_t a, int32x4_t b) {
return vornq_s32(a, b);
}
-// CHECK-LABEL: @test_vornq_s64(
-// CHECK: [[NEG_I:%.*]] = xor <2 x i64> %b, splat (i64 -1)
-// CHECK: [[OR_I:%.*]] = or <2 x i64> %a, [[NEG_I]]
-// CHECK: ret <2 x i64> [[OR_I]]
+// CHECK-LABEL: define <2 x i64> @test_vornq_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <2 x i64> [[B]], splat (i64 -1)
+// CHECK-NEXT: [[OR_I:%.*]] = or <2 x i64> [[A]], [[NOT_I]]
+// CHECK-NEXT: ret <2 x i64> [[OR_I]]
+//
int64x2_t test_vornq_s64(int64x2_t a, int64x2_t b) {
return vornq_s64(a, b);
}
-// CHECK-LABEL: @test_vornq_u8(
-// CHECK: [[NEG_I:%.*]] = xor <16 x i8> %b, splat (i8 -1)
-// CHECK: [[OR_I:%.*]] = or <16 x i8> %a, [[NEG_I]]
-// CHECK: ret <16 x i8> [[OR_I]]
+// CHECK-LABEL: define <16 x i8> @test_vornq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <16 x i8> [[B]], splat (i8 -1)
+// CHECK-NEXT: [[OR_I:%.*]] = or <16 x i8> [[A]], [[NOT_I]]
+// CHECK-NEXT: ret <16 x i8> [[OR_I]]
+//
uint8x16_t test_vornq_u8(uint8x16_t a, uint8x16_t b) {
return vornq_u8(a, b);
}
-// CHECK-LABEL: @test_vornq_u16(
-// CHECK: [[NEG_I:%.*]] = xor <8 x i16> %b, splat (i16 -1)
-// CHECK: [[OR_I:%.*]] = or <8 x i16> %a, [[NEG_I]]
-// CHECK: ret <8 x i16> [[OR_I]]
+// CHECK-LABEL: define <8 x i16> @test_vornq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <8 x i16> [[B]], splat (i16 -1)
+// CHECK-NEXT: [[OR_I:%.*]] = or <8 x i16> [[A]], [[NOT_I]]
+// CHECK-NEXT: ret <8 x i16> [[OR_I]]
+//
uint16x8_t test_vornq_u16(uint16x8_t a, uint16x8_t b) {
return vornq_u16(a, b);
}
-// CHECK-LABEL: @test_vornq_u32(
-// CHECK: [[NEG_I:%.*]] = xor <4 x i32> %b, splat (i32 -1)
-// CHECK: [[OR_I:%.*]] = or <4 x i32> %a, [[NEG_I]]
-// CHECK: ret <4 x i32> [[OR_I]]
+// CHECK-LABEL: define <4 x i32> @test_vornq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <4 x i32> [[B]], splat (i32 -1)
+// CHECK-NEXT: [[OR_I:%.*]] = or <4 x i32> [[A]], [[NOT_I]]
+// CHECK-NEXT: ret <4 x i32> [[OR_I]]
+//
uint32x4_t test_vornq_u32(uint32x4_t a, uint32x4_t b) {
return vornq_u32(a, b);
}
-// CHECK-LABEL: @test_vornq_u64(
-// CHECK: [[NEG_I:%.*]] = xor <2 x i64> %b, splat (i64 -1)
-// CHECK: [[OR_I:%.*]] = or <2 x i64> %a, [[NEG_I]]
-// CHECK: ret <2 x i64> [[OR_I]]
+// CHECK-LABEL: define <2 x i64> @test_vornq_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <2 x i64> [[B]], splat (i64 -1)
+// CHECK-NEXT: [[OR_I:%.*]] = or <2 x i64> [[A]], [[NOT_I]]
+// CHECK-NEXT: ret <2 x i64> [[OR_I]]
+//
uint64x2_t test_vornq_u64(uint64x2_t a, uint64x2_t b) {
return vornq_u64(a, b);
}
-// CHECK-LABEL: @test_vorr_s8(
-// CHECK: [[OR_I:%.*]] = or <8 x i8> %a, %b
-// CHECK: ret <8 x i8> [[OR_I]]
+// CHECK-LABEL: define <8 x i8> @test_vorr_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[OR_I:%.*]] = or <8 x i8> [[A]], [[B]]
+// CHECK-NEXT: ret <8 x i8> [[OR_I]]
+//
int8x8_t test_vorr_s8(int8x8_t a, int8x8_t b) {
return vorr_s8(a, b);
}
-// CHECK-LABEL: @test_vorr_s16(
-// CHECK: [[OR_I:%.*]] = or <4 x i16> %a, %b
-// CHECK: ret <4 x i16> [[OR_I]]
+// CHECK-LABEL: define <4 x i16> @test_vorr_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[OR_I:%.*]] = or <4 x i16> [[A]], [[B]]
+// CHECK-NEXT: ret <4 x i16> [[OR_I]]
+//
int16x4_t test_vorr_s16(int16x4_t a, int16x4_t b) {
return vorr_s16(a, b);
}
-// CHECK-LABEL: @test_vorr_s32(
-// CHECK: [[OR_I:%.*]] = or <2 x i32> %a, %b
-// CHECK: ret <2 x i32> [[OR_I]]
+// CHECK-LABEL: define <2 x i32> @test_vorr_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[OR_I:%.*]] = or <2 x i32> [[A]], [[B]]
+// CHECK-NEXT: ret <2 x i32> [[OR_I]]
+//
int32x2_t test_vorr_s32(int32x2_t a, int32x2_t b) {
return vorr_s32(a, b);
}
-// CHECK-LABEL: @test_vorr_s64(
-// CHECK: [[OR_I:%.*]] = or <1 x i64> %a, %b
-// CHECK: ret <1 x i64> [[OR_I]]
+// CHECK-LABEL: define <1 x i64> @test_vorr_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[OR_I:%.*]] = or <1 x i64> [[A]], [[B]]
+// CHECK-NEXT: ret <1 x i64> [[OR_I]]
+//
int64x1_t test_vorr_s64(int64x1_t a, int64x1_t b) {
return vorr_s64(a, b);
}
-// CHECK-LABEL: @test_vorr_u8(
-// CHECK: [[OR_I:%.*]] = or <8 x i8> %a, %b
-// CHECK: ret <8 x i8> [[OR_I]]
+// CHECK-LABEL: define <8 x i8> @test_vorr_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[OR_I:%.*]] = or <8 x i8> [[A]], [[B]]
+// CHECK-NEXT: ret <8 x i8> [[OR_I]]
+//
uint8x8_t test_vorr_u8(uint8x8_t a, uint8x8_t b) {
return vorr_u8(a, b);
}
-// CHECK-LABEL: @test_vorr_u16(
-// CHECK: [[OR_I:%.*]] = or <4 x i16> %a, %b
-// CHECK: ret <4 x i16> [[OR_I]]
+// CHECK-LABEL: define <4 x i16> @test_vorr_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[OR_I:%.*]] = or <4 x i16> [[A]], [[B]]
+// CHECK-NEXT: ret <4 x i16> [[OR_I]]
+//
uint16x4_t test_vorr_u16(uint16x4_t a, uint16x4_t b) {
return vorr_u16(a, b);
}
-// CHECK-LABEL: @test_vorr_u32(
-// CHECK: [[OR_I:%.*]] = or <2 x i32> %a, %b
-// CHECK: ret <2 x i32> [[OR_I]]
+// CHECK-LABEL: define <2 x i32> @test_vorr_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[OR_I:%.*]] = or <2 x i32> [[A]], [[B]]
+// CHECK-NEXT: ret <2 x i32> [[OR_I]]
+//
uint32x2_t test_vorr_u32(uint32x2_t a, uint32x2_t b) {
return vorr_u32(a, b);
}
-// CHECK-LABEL: @test_vorr_u64(
-// CHECK: [[OR_I:%.*]] = or <1 x i64> %a, %b
-// CHECK: ret <1 x i64> [[OR_I]]
+// CHECK-LABEL: define <1 x i64> @test_vorr_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[OR_I:%.*]] = or <1 x i64> [[A]], [[B]]
+// CHECK-NEXT: ret <1 x i64> [[OR_I]]
+//
uint64x1_t test_vorr_u64(uint64x1_t a, uint64x1_t b) {
return vorr_u64(a, b);
}
-// CHECK-LABEL: @test_vorrq_s8(
-// CHECK: [[OR_I:%.*]] = or <16 x i8> %a, %b
-// CHECK: ret <16 x i8> [[OR_I]]
+// CHECK-LABEL: define <16 x i8> @test_vorrq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[OR_I:%.*]] = or <16 x i8> [[A]], [[B]]
+// CHECK-NEXT: ret <16 x i8> [[OR_I]]
+//
int8x16_t test_vorrq_s8(int8x16_t a, int8x16_t b) {
return vorrq_s8(a, b);
}
-// CHECK-LABEL: @test_vorrq_s16(
-// CHECK: [[OR_I:%.*]] = or <8 x i16> %a, %b
-// CHECK: ret <8 x i16> [[OR_I]]
+// CHECK-LABEL: define <8 x i16> @test_vorrq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[OR_I:%.*]] = or <8 x i16> [[A]], [[B]]
+// CHECK-NEXT: ret <8 x i16> [[OR_I]]
+//
int16x8_t test_vorrq_s16(int16x8_t a, int16x8_t b) {
return vorrq_s16(a, b);
}
-// CHECK-LABEL: @test_vorrq_s32(
-// CHECK: [[OR_I:%.*]] = or <4 x i32> %a, %b
-// CHECK: ret <4 x i32> [[OR_I]]
+// CHECK-LABEL: define <4 x i32> @test_vorrq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[OR_I:%.*]] = or <4 x i32> [[A]], [[B]]
+// CHECK-NEXT: ret <4 x i32> [[OR_I]]
+//
int32x4_t test_vorrq_s32(int32x4_t a, int32x4_t b) {
return vorrq_s32(a, b);
}
-// CHECK-LABEL: @test_vorrq_s64(
-// CHECK: [[OR_I:%.*]] = or <2 x i64> %a, %b
-// CHECK: ret <2 x i64> [[OR_I]]
+// CHECK-LABEL: define <2 x i64> @test_vorrq_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[OR_I:%.*]] = or <2 x i64> [[A]], [[B]]
+// CHECK-NEXT: ret <2 x i64> [[OR_I]]
+//
int64x2_t test_vorrq_s64(int64x2_t a, int64x2_t b) {
return vorrq_s64(a, b);
}
-// CHECK-LABEL: @test_vorrq_u8(
-// CHECK: [[OR_I:%.*]] = or <16 x i8> %a, %b
-// CHECK: ret <16 x i8> [[OR_I]]
+// CHECK-LABEL: define <16 x i8> @test_vorrq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[OR_I:%.*]] = or <16 x i8> [[A]], [[B]]
+// CHECK-NEXT: ret <16 x i8> [[OR_I]]
+//
uint8x16_t test_vorrq_u8(uint8x16_t a, uint8x16_t b) {
return vorrq_u8(a, b);
}
-// CHECK-LABEL: @test_vorrq_u16(
-// CHECK: [[OR_I:%.*]] = or <8 x i16> %a, %b
-// CHECK: ret <8 x i16> [[OR_I]]
+// CHECK-LABEL: define <8 x i16> @test_vorrq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[OR_I:%.*]] = or <8 x i16> [[A]], [[B]]
+// CHECK-NEXT: ret <8 x i16> [[OR_I]]
+//
uint16x8_t test_vorrq_u16(uint16x8_t a, uint16x8_t b) {
return vorrq_u16(a, b);
}
-// CHECK-LABEL: @test_vorrq_u32(
-// CHECK: [[OR_I:%.*]] = or <4 x i32> %a, %b
-// CHECK: ret <4 x i32> [[OR_I]]
+// CHECK-LABEL: define <4 x i32> @test_vorrq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[OR_I:%.*]] = or <4 x i32> [[A]], [[B]]
+// CHECK-NEXT: ret <4 x i32> [[OR_I]]
+//
uint32x4_t test_vorrq_u32(uint32x4_t a, uint32x4_t b) {
return vorrq_u32(a, b);
}
-// CHECK-LABEL: @test_vorrq_u64(
-// CHECK: [[OR_I:%.*]] = or <2 x i64> %a, %b
-// CHECK: ret <2 x i64> [[OR_I]]
+// CHECK-LABEL: define <2 x i64> @test_vorrq_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[OR_I:%.*]] = or <2 x i64> [[A]], [[B]]
+// CHECK-NEXT: ret <2 x i64> [[OR_I]]
+//
uint64x2_t test_vorrq_u64(uint64x2_t a, uint64x2_t b) {
return vorrq_u64(a, b);
}
-// CHECK-LABEL: @test_vpadal_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VPADAL_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpadals.v4i16.v8i8(<4 x i16> %a, <8 x i8> %b)
-// CHECK: ret <4 x i16> [[VPADAL_V1_I]]
+// CHECK-LABEL: define <4 x i16> @test_vpadal_s8(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VPADAL_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpadals.v4i16.v8i8(<4 x i16> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <4 x i16> [[VPADAL_V1_I]]
+//
int16x4_t test_vpadal_s8(int16x4_t a, int8x8_t b) {
return vpadal_s8(a, b);
}
-// CHECK-LABEL: @test_vpadal_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VPADAL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpadals.v2i32.v4i16(<2 x i32> %a, <4 x i16> %b)
-// CHECK: ret <2 x i32> [[VPADAL_V2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vpadal_s16(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VPADAL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpadals.v2i32.v4i16(<2 x i32> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: ret <2 x i32> [[VPADAL_V2_I]]
+//
int32x2_t test_vpadal_s16(int32x2_t a, int16x4_t b) {
return vpadal_s16(a, b);
}
-// CHECK-LABEL: @test_vpadal_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VPADAL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vpadals.v1i64.v2i32(<1 x i64> %a, <2 x i32> %b)
-// CHECK: ret <1 x i64> [[VPADAL_V2_I]]
+// CHECK-LABEL: define <1 x i64> @test_vpadal_s32(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VPADAL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vpadals.v1i64.v2i32(<1 x i64> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: ret <1 x i64> [[VPADAL_V2_I]]
+//
int64x1_t test_vpadal_s32(int64x1_t a, int32x2_t b) {
return vpadal_s32(a, b);
}
-// CHECK-LABEL: @test_vpadal_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VPADAL_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpadalu.v4i16.v8i8(<4 x i16> %a, <8 x i8> %b)
-// CHECK: ret <4 x i16> [[VPADAL_V1_I]]
+// CHECK-LABEL: define <4 x i16> @test_vpadal_u8(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VPADAL_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpadalu.v4i16.v8i8(<4 x i16> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <4 x i16> [[VPADAL_V1_I]]
+//
uint16x4_t test_vpadal_u8(uint16x4_t a, uint8x8_t b) {
return vpadal_u8(a, b);
}
-// CHECK-LABEL: @test_vpadal_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VPADAL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpadalu.v2i32.v4i16(<2 x i32> %a, <4 x i16> %b)
-// CHECK: ret <2 x i32> [[VPADAL_V2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vpadal_u16(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VPADAL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpadalu.v2i32.v4i16(<2 x i32> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: ret <2 x i32> [[VPADAL_V2_I]]
+//
uint32x2_t test_vpadal_u16(uint32x2_t a, uint16x4_t b) {
return vpadal_u16(a, b);
}
-// CHECK-LABEL: @test_vpadal_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VPADAL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vpadalu.v1i64.v2i32(<1 x i64> %a, <2 x i32> %b)
-// CHECK: ret <1 x i64> [[VPADAL_V2_I]]
+// CHECK-LABEL: define <1 x i64> @test_vpadal_u32(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VPADAL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vpadalu.v1i64.v2i32(<1 x i64> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: ret <1 x i64> [[VPADAL_V2_I]]
+//
uint64x1_t test_vpadal_u32(uint64x1_t a, uint32x2_t b) {
return vpadal_u32(a, b);
}
-// CHECK-LABEL: @test_vpadalq_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VPADALQ_V1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vpadals.v8i16.v16i8(<8 x i16> %a, <16 x i8> %b)
-// CHECK: ret <8 x i16> [[VPADALQ_V1_I]]
+// CHECK-LABEL: define <8 x i16> @test_vpadalq_s8(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VPADALQ_V1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vpadals.v8i16.v16i8(<8 x i16> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i16> [[VPADALQ_V1_I]]
+//
int16x8_t test_vpadalq_s8(int16x8_t a, int8x16_t b) {
return vpadalq_s8(a, b);
}
-// CHECK-LABEL: @test_vpadalq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VPADALQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vpadals.v4i32.v8i16(<4 x i32> %a, <8 x i16> %b)
-// CHECK: ret <4 x i32> [[VPADALQ_V2_I]]
+// CHECK-LABEL: define <4 x i32> @test_vpadalq_s16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VPADALQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vpadals.v4i32.v8i16(<4 x i32> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: ret <4 x i32> [[VPADALQ_V2_I]]
+//
int32x4_t test_vpadalq_s16(int32x4_t a, int16x8_t b) {
return vpadalq_s16(a, b);
}
-// CHECK-LABEL: @test_vpadalq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VPADALQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vpadals.v2i64.v4i32(<2 x i64> %a, <4 x i32> %b)
-// CHECK: ret <2 x i64> [[VPADALQ_V2_I]]
+// CHECK-LABEL: define <2 x i64> @test_vpadalq_s32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VPADALQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vpadals.v2i64.v4i32(<2 x i64> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: ret <2 x i64> [[VPADALQ_V2_I]]
+//
int64x2_t test_vpadalq_s32(int64x2_t a, int32x4_t b) {
return vpadalq_s32(a, b);
}
-// CHECK-LABEL: @test_vpadalq_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VPADALQ_V1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vpadalu.v8i16.v16i8(<8 x i16> %a, <16 x i8> %b)
-// CHECK: ret <8 x i16> [[VPADALQ_V1_I]]
+// CHECK-LABEL: define <8 x i16> @test_vpadalq_u8(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VPADALQ_V1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vpadalu.v8i16.v16i8(<8 x i16> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i16> [[VPADALQ_V1_I]]
+//
uint16x8_t test_vpadalq_u8(uint16x8_t a, uint8x16_t b) {
return vpadalq_u8(a, b);
}
-// CHECK-LABEL: @test_vpadalq_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VPADALQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vpadalu.v4i32.v8i16(<4 x i32> %a, <8 x i16> %b)
-// CHECK: ret <4 x i32> [[VPADALQ_V2_I]]
+// CHECK-LABEL: define <4 x i32> @test_vpadalq_u16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VPADALQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vpadalu.v4i32.v8i16(<4 x i32> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: ret <4 x i32> [[VPADALQ_V2_I]]
+//
uint32x4_t test_vpadalq_u16(uint32x4_t a, uint16x8_t b) {
return vpadalq_u16(a, b);
}
-// CHECK-LABEL: @test_vpadalq_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VPADALQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vpadalu.v2i64.v4i32(<2 x i64> %a, <4 x i32> %b)
-// CHECK: ret <2 x i64> [[VPADALQ_V2_I]]
+// CHECK-LABEL: define <2 x i64> @test_vpadalq_u32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VPADALQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vpadalu.v2i64.v4i32(<2 x i64> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: ret <2 x i64> [[VPADALQ_V2_I]]
+//
uint64x2_t test_vpadalq_u32(uint64x2_t a, uint32x4_t b) {
return vpadalq_u32(a, b);
}
-// CHECK-LABEL: @test_vpadd_s8(
-// CHECK: [[VPADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vpadd.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VPADD_V_I]]
+// CHECK-LABEL: define <8 x i8> @test_vpadd_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vpadd.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VPADD_V_I]]
+//
int8x8_t test_vpadd_s8(int8x8_t a, int8x8_t b) {
return vpadd_s8(a, b);
}
-// CHECK-LABEL: @test_vpadd_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VPADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpadd.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VPADD_V3_I:%.*]] = bitcast <4 x i16> [[VPADD_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VPADD_V2_I]]
+// CHECK-LABEL: define <4 x i16> @test_vpadd_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VPADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpadd.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: [[VPADD_V3_I:%.*]] = bitcast <4 x i16> [[VPADD_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <4 x i16> [[VPADD_V2_I]]
+//
int16x4_t test_vpadd_s16(int16x4_t a, int16x4_t b) {
return vpadd_s16(a, b);
}
-// CHECK-LABEL: @test_vpadd_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VPADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpadd.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VPADD_V3_I:%.*]] = bitcast <2 x i32> [[VPADD_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VPADD_V2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vpadd_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VPADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpadd.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: [[VPADD_V3_I:%.*]] = bitcast <2 x i32> [[VPADD_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <2 x i32> [[VPADD_V2_I]]
+//
int32x2_t test_vpadd_s32(int32x2_t a, int32x2_t b) {
return vpadd_s32(a, b);
}
-// CHECK-LABEL: @test_vpadd_u8(
-// CHECK: [[VPADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vpadd.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VPADD_V_I]]
+// CHECK-LABEL: define <8 x i8> @test_vpadd_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vpadd.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VPADD_V_I]]
+//
uint8x8_t test_vpadd_u8(uint8x8_t a, uint8x8_t b) {
return vpadd_u8(a, b);
}
-// CHECK-LABEL: @test_vpadd_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VPADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpadd.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VPADD_V3_I:%.*]] = bitcast <4 x i16> [[VPADD_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VPADD_V2_I]]
+// CHECK-LABEL: define <4 x i16> @test_vpadd_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VPADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpadd.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: [[VPADD_V3_I:%.*]] = bitcast <4 x i16> [[VPADD_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <4 x i16> [[VPADD_V2_I]]
+//
uint16x4_t test_vpadd_u16(uint16x4_t a, uint16x4_t b) {
return vpadd_u16(a, b);
}
-// CHECK-LABEL: @test_vpadd_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VPADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpadd.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VPADD_V3_I:%.*]] = bitcast <2 x i32> [[VPADD_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VPADD_V2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vpadd_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VPADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpadd.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: [[VPADD_V3_I:%.*]] = bitcast <2 x i32> [[VPADD_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <2 x i32> [[VPADD_V2_I]]
+//
uint32x2_t test_vpadd_u32(uint32x2_t a, uint32x2_t b) {
return vpadd_u32(a, b);
}
-// CHECK-LABEL: @test_vpadd_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
-// CHECK: [[VPADD_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vpadd.v2f32(<2 x float> %a, <2 x float> %b)
-// CHECK: [[VPADD_V3_I:%.*]] = bitcast <2 x float> [[VPADD_V2_I]] to <8 x i8>
-// CHECK: ret <2 x float> [[VPADD_V2_I]]
+// CHECK-LABEL: define <2 x float> @test_vpadd_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VPADD_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vpadd.v2f32(<2 x float> [[A]], <2 x float> [[B]])
+// CHECK-NEXT: [[VPADD_V3_I:%.*]] = bitcast <2 x float> [[VPADD_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <2 x float> [[VPADD_V2_I]]
+//
float32x2_t test_vpadd_f32(float32x2_t a, float32x2_t b) {
return vpadd_f32(a, b);
}
-// CHECK-LABEL: @test_vpaddl_s8(
-// CHECK: [[VPADDL_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8> %a)
-// CHECK: ret <4 x i16> [[VPADDL_I]]
+// CHECK-LABEL: define <4 x i16> @test_vpaddl_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPADDL_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8> [[A]])
+// CHECK-NEXT: ret <4 x i16> [[VPADDL_I]]
+//
int16x4_t test_vpaddl_s8(int8x8_t a) {
return vpaddl_s8(a);
}
-// CHECK-LABEL: @test_vpaddl_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VPADDL1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpaddls.v2i32.v4i16(<4 x i16> %a)
-// CHECK: ret <2 x i32> [[VPADDL1_I]]
+// CHECK-LABEL: define <2 x i32> @test_vpaddl_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VPADDL1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpaddls.v2i32.v4i16(<4 x i16> [[A]])
+// CHECK-NEXT: ret <2 x i32> [[VPADDL1_I]]
+//
int32x2_t test_vpaddl_s16(int16x4_t a) {
return vpaddl_s16(a);
}
-// CHECK-LABEL: @test_vpaddl_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VPADDL1_I:%.*]] = call <1 x i64> @llvm.arm.neon.vpaddls.v1i64.v2i32(<2 x i32> %a)
-// CHECK: ret <1 x i64> [[VPADDL1_I]]
+// CHECK-LABEL: define <1 x i64> @test_vpaddl_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VPADDL1_I:%.*]] = call <1 x i64> @llvm.arm.neon.vpaddls.v1i64.v2i32(<2 x i32> [[A]])
+// CHECK-NEXT: ret <1 x i64> [[VPADDL1_I]]
+//
int64x1_t test_vpaddl_s32(int32x2_t a) {
return vpaddl_s32(a);
}
-// CHECK-LABEL: @test_vpaddl_u8(
-// CHECK: [[VPADDL_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpaddlu.v4i16.v8i8(<8 x i8> %a)
-// CHECK: ret <4 x i16> [[VPADDL_I]]
+// CHECK-LABEL: define <4 x i16> @test_vpaddl_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPADDL_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpaddlu.v4i16.v8i8(<8 x i8> [[A]])
+// CHECK-NEXT: ret <4 x i16> [[VPADDL_I]]
+//
uint16x4_t test_vpaddl_u8(uint8x8_t a) {
return vpaddl_u8(a);
}
-// CHECK-LABEL: @test_vpaddl_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VPADDL1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16> %a)
-// CHECK: ret <2 x i32> [[VPADDL1_I]]
+// CHECK-LABEL: define <2 x i32> @test_vpaddl_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VPADDL1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16> [[A]])
+// CHECK-NEXT: ret <2 x i32> [[VPADDL1_I]]
+//
uint32x2_t test_vpaddl_u16(uint16x4_t a) {
return vpaddl_u16(a);
}
-// CHECK-LABEL: @test_vpaddl_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VPADDL1_I:%.*]] = call <1 x i64> @llvm.arm.neon.vpaddlu.v1i64.v2i32(<2 x i32> %a)
-// CHECK: ret <1 x i64> [[VPADDL1_I]]
+// CHECK-LABEL: define <1 x i64> @test_vpaddl_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VPADDL1_I:%.*]] = call <1 x i64> @llvm.arm.neon.vpaddlu.v1i64.v2i32(<2 x i32> [[A]])
+// CHECK-NEXT: ret <1 x i64> [[VPADDL1_I]]
+//
uint64x1_t test_vpaddl_u32(uint32x2_t a) {
return vpaddl_u32(a);
}
-// CHECK-LABEL: @test_vpaddlq_s8(
-// CHECK: [[VPADDL_I:%.*]] = call <8 x i16> @llvm.arm.neon.vpaddls.v8i16.v16i8(<16 x i8> %a)
-// CHECK: ret <8 x i16> [[VPADDL_I]]
+// CHECK-LABEL: define <8 x i16> @test_vpaddlq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPADDL_I:%.*]] = call <8 x i16> @llvm.arm.neon.vpaddls.v8i16.v16i8(<16 x i8> [[A]])
+// CHECK-NEXT: ret <8 x i16> [[VPADDL_I]]
+//
int16x8_t test_vpaddlq_s8(int8x16_t a) {
return vpaddlq_s8(a);
}
-// CHECK-LABEL: @test_vpaddlq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VPADDL1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vpaddls.v4i32.v8i16(<8 x i16> %a)
-// CHECK: ret <4 x i32> [[VPADDL1_I]]
+// CHECK-LABEL: define <4 x i32> @test_vpaddlq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VPADDL1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vpaddls.v4i32.v8i16(<8 x i16> [[A]])
+// CHECK-NEXT: ret <4 x i32> [[VPADDL1_I]]
+//
int32x4_t test_vpaddlq_s16(int16x8_t a) {
return vpaddlq_s16(a);
}
-// CHECK-LABEL: @test_vpaddlq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VPADDL1_I:%.*]] = call <2 x i64> @llvm.arm.neon.vpaddls.v2i64.v4i32(<4 x i32> %a)
-// CHECK: ret <2 x i64> [[VPADDL1_I]]
+// CHECK-LABEL: define <2 x i64> @test_vpaddlq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VPADDL1_I:%.*]] = call <2 x i64> @llvm.arm.neon.vpaddls.v2i64.v4i32(<4 x i32> [[A]])
+// CHECK-NEXT: ret <2 x i64> [[VPADDL1_I]]
+//
int64x2_t test_vpaddlq_s32(int32x4_t a) {
return vpaddlq_s32(a);
}
-// CHECK-LABEL: @test_vpaddlq_u8(
-// CHECK: [[VPADDL_I:%.*]] = call <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8> %a)
-// CHECK: ret <8 x i16> [[VPADDL_I]]
+// CHECK-LABEL: define <8 x i16> @test_vpaddlq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPADDL_I:%.*]] = call <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8> [[A]])
+// CHECK-NEXT: ret <8 x i16> [[VPADDL_I]]
+//
uint16x8_t test_vpaddlq_u8(uint8x16_t a) {
return vpaddlq_u8(a);
}
-// CHECK-LABEL: @test_vpaddlq_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VPADDL1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %a)
-// CHECK: ret <4 x i32> [[VPADDL1_I]]
+// CHECK-LABEL: define <4 x i32> @test_vpaddlq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VPADDL1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> [[A]])
+// CHECK-NEXT: ret <4 x i32> [[VPADDL1_I]]
+//
uint32x4_t test_vpaddlq_u16(uint16x8_t a) {
return vpaddlq_u16(a);
}
-// CHECK-LABEL: @test_vpaddlq_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VPADDL1_I:%.*]] = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %a)
-// CHECK: ret <2 x i64> [[VPADDL1_I]]
+// CHECK-LABEL: define <2 x i64> @test_vpaddlq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VPADDL1_I:%.*]] = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> [[A]])
+// CHECK-NEXT: ret <2 x i64> [[VPADDL1_I]]
+//
uint64x2_t test_vpaddlq_u32(uint32x4_t a) {
return vpaddlq_u32(a);
}
-// CHECK-LABEL: @test_vpmax_s8(
-// CHECK: [[VPMAX_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vpmaxs.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VPMAX_V_I]]
+// CHECK-LABEL: define <8 x i8> @test_vpmax_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPMAX_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vpmaxs.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VPMAX_V_I]]
+//
int8x8_t test_vpmax_s8(int8x8_t a, int8x8_t b) {
return vpmax_s8(a, b);
}
-// CHECK-LABEL: @test_vpmax_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VPMAX_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpmaxs.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VPMAX_V3_I:%.*]] = bitcast <4 x i16> [[VPMAX_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VPMAX_V2_I]]
+// CHECK-LABEL: define <4 x i16> @test_vpmax_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VPMAX_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpmaxs.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: [[VPMAX_V3_I:%.*]] = bitcast <4 x i16> [[VPMAX_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <4 x i16> [[VPMAX_V2_I]]
+//
int16x4_t test_vpmax_s16(int16x4_t a, int16x4_t b) {
return vpmax_s16(a, b);
}
-// CHECK-LABEL: @test_vpmax_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VPMAX_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpmaxs.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VPMAX_V3_I:%.*]] = bitcast <2 x i32> [[VPMAX_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VPMAX_V2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vpmax_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VPMAX_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpmaxs.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: [[VPMAX_V3_I:%.*]] = bitcast <2 x i32> [[VPMAX_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <2 x i32> [[VPMAX_V2_I]]
+//
int32x2_t test_vpmax_s32(int32x2_t a, int32x2_t b) {
return vpmax_s32(a, b);
}
-// CHECK-LABEL: @test_vpmax_u8(
-// CHECK: [[VPMAX_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vpmaxu.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VPMAX_V_I]]
+// CHECK-LABEL: define <8 x i8> @test_vpmax_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPMAX_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vpmaxu.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VPMAX_V_I]]
+//
uint8x8_t test_vpmax_u8(uint8x8_t a, uint8x8_t b) {
return vpmax_u8(a, b);
}
-// CHECK-LABEL: @test_vpmax_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VPMAX_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpmaxu.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VPMAX_V3_I:%.*]] = bitcast <4 x i16> [[VPMAX_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VPMAX_V2_I]]
+// CHECK-LABEL: define <4 x i16> @test_vpmax_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VPMAX_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpmaxu.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: [[VPMAX_V3_I:%.*]] = bitcast <4 x i16> [[VPMAX_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <4 x i16> [[VPMAX_V2_I]]
+//
uint16x4_t test_vpmax_u16(uint16x4_t a, uint16x4_t b) {
return vpmax_u16(a, b);
}
-// CHECK-LABEL: @test_vpmax_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VPMAX_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpmaxu.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VPMAX_V3_I:%.*]] = bitcast <2 x i32> [[VPMAX_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VPMAX_V2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vpmax_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VPMAX_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpmaxu.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: [[VPMAX_V3_I:%.*]] = bitcast <2 x i32> [[VPMAX_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <2 x i32> [[VPMAX_V2_I]]
+//
uint32x2_t test_vpmax_u32(uint32x2_t a, uint32x2_t b) {
return vpmax_u32(a, b);
}
-// CHECK-LABEL: @test_vpmax_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
-// CHECK: [[VPMAX_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vpmaxs.v2f32(<2 x float> %a, <2 x float> %b)
-// CHECK: [[VPMAX_V3_I:%.*]] = bitcast <2 x float> [[VPMAX_V2_I]] to <8 x i8>
-// CHECK: ret <2 x float> [[VPMAX_V2_I]]
+// CHECK-LABEL: define <2 x float> @test_vpmax_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VPMAX_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vpmaxs.v2f32(<2 x float> [[A]], <2 x float> [[B]])
+// CHECK-NEXT: [[VPMAX_V3_I:%.*]] = bitcast <2 x float> [[VPMAX_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <2 x float> [[VPMAX_V2_I]]
+//
float32x2_t test_vpmax_f32(float32x2_t a, float32x2_t b) {
return vpmax_f32(a, b);
}
-// CHECK-LABEL: @test_vpmin_s8(
-// CHECK: [[VPMIN_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vpmins.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VPMIN_V_I]]
+// CHECK-LABEL: define <8 x i8> @test_vpmin_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPMIN_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vpmins.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VPMIN_V_I]]
+//
int8x8_t test_vpmin_s8(int8x8_t a, int8x8_t b) {
return vpmin_s8(a, b);
}
-// CHECK-LABEL: @test_vpmin_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VPMIN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpmins.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VPMIN_V3_I:%.*]] = bitcast <4 x i16> [[VPMIN_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VPMIN_V2_I]]
+// CHECK-LABEL: define <4 x i16> @test_vpmin_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VPMIN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpmins.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: [[VPMIN_V3_I:%.*]] = bitcast <4 x i16> [[VPMIN_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <4 x i16> [[VPMIN_V2_I]]
+//
int16x4_t test_vpmin_s16(int16x4_t a, int16x4_t b) {
return vpmin_s16(a, b);
}
-// CHECK-LABEL: @test_vpmin_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VPMIN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpmins.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VPMIN_V3_I:%.*]] = bitcast <2 x i32> [[VPMIN_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VPMIN_V2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vpmin_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VPMIN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpmins.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: [[VPMIN_V3_I:%.*]] = bitcast <2 x i32> [[VPMIN_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <2 x i32> [[VPMIN_V2_I]]
+//
int32x2_t test_vpmin_s32(int32x2_t a, int32x2_t b) {
return vpmin_s32(a, b);
}
-// CHECK-LABEL: @test_vpmin_u8(
-// CHECK: [[VPMIN_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vpminu.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VPMIN_V_I]]
+// CHECK-LABEL: define <8 x i8> @test_vpmin_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPMIN_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vpminu.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VPMIN_V_I]]
+//
uint8x8_t test_vpmin_u8(uint8x8_t a, uint8x8_t b) {
return vpmin_u8(a, b);
}
-// CHECK-LABEL: @test_vpmin_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VPMIN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpminu.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VPMIN_V3_I:%.*]] = bitcast <4 x i16> [[VPMIN_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VPMIN_V2_I]]
+// CHECK-LABEL: define <4 x i16> @test_vpmin_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VPMIN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpminu.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: [[VPMIN_V3_I:%.*]] = bitcast <4 x i16> [[VPMIN_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <4 x i16> [[VPMIN_V2_I]]
+//
uint16x4_t test_vpmin_u16(uint16x4_t a, uint16x4_t b) {
return vpmin_u16(a, b);
}
-// CHECK-LABEL: @test_vpmin_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VPMIN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpminu.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VPMIN_V3_I:%.*]] = bitcast <2 x i32> [[VPMIN_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VPMIN_V2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vpmin_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VPMIN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpminu.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: [[VPMIN_V3_I:%.*]] = bitcast <2 x i32> [[VPMIN_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <2 x i32> [[VPMIN_V2_I]]
+//
uint32x2_t test_vpmin_u32(uint32x2_t a, uint32x2_t b) {
return vpmin_u32(a, b);
}
-// CHECK-LABEL: @test_vpmin_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
-// CHECK: [[VPMIN_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vpmins.v2f32(<2 x float> %a, <2 x float> %b)
-// CHECK: [[VPMIN_V3_I:%.*]] = bitcast <2 x float> [[VPMIN_V2_I]] to <8 x i8>
-// CHECK: ret <2 x float> [[VPMIN_V2_I]]
+// CHECK-LABEL: define <2 x float> @test_vpmin_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VPMIN_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vpmins.v2f32(<2 x float> [[A]], <2 x float> [[B]])
+// CHECK-NEXT: [[VPMIN_V3_I:%.*]] = bitcast <2 x float> [[VPMIN_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <2 x float> [[VPMIN_V2_I]]
+//
float32x2_t test_vpmin_f32(float32x2_t a, float32x2_t b) {
return vpmin_f32(a, b);
}
-// CHECK-LABEL: @test_vqabs_s8(
-// CHECK: [[VQABS_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqabs.v8i8(<8 x i8> %a)
-// CHECK: ret <8 x i8> [[VQABS_V_I]]
+// CHECK-LABEL: define <8 x i8> @test_vqabs_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQABS_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqabs.v8i8(<8 x i8> [[A]])
+// CHECK-NEXT: ret <8 x i8> [[VQABS_V_I]]
+//
int8x8_t test_vqabs_s8(int8x8_t a) {
return vqabs_s8(a);
}
-// CHECK-LABEL: @test_vqabs_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VQABS_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqabs.v4i16(<4 x i16> %a)
-// CHECK: [[VQABS_V2_I:%.*]] = bitcast <4 x i16> [[VQABS_V1_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VQABS_V1_I]]
+// CHECK-LABEL: define <4 x i16> @test_vqabs_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VQABS_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqabs.v4i16(<4 x i16> [[A]])
+// CHECK-NEXT: [[VQABS_V2_I:%.*]] = bitcast <4 x i16> [[VQABS_V1_I]] to <8 x i8>
+// CHECK-NEXT: ret <4 x i16> [[VQABS_V1_I]]
+//
int16x4_t test_vqabs_s16(int16x4_t a) {
return vqabs_s16(a);
}
-// CHECK-LABEL: @test_vqabs_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VQABS_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqabs.v2i32(<2 x i32> %a)
-// CHECK: [[VQABS_V2_I:%.*]] = bitcast <2 x i32> [[VQABS_V1_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VQABS_V1_I]]
+// CHECK-LABEL: define <2 x i32> @test_vqabs_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VQABS_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqabs.v2i32(<2 x i32> [[A]])
+// CHECK-NEXT: [[VQABS_V2_I:%.*]] = bitcast <2 x i32> [[VQABS_V1_I]] to <8 x i8>
+// CHECK-NEXT: ret <2 x i32> [[VQABS_V1_I]]
+//
int32x2_t test_vqabs_s32(int32x2_t a) {
return vqabs_s32(a);
}
-// CHECK-LABEL: @test_vqabsq_s8(
-// CHECK: [[VQABSQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqabs.v16i8(<16 x i8> %a)
-// CHECK: ret <16 x i8> [[VQABSQ_V_I]]
+// CHECK-LABEL: define <16 x i8> @test_vqabsq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQABSQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqabs.v16i8(<16 x i8> [[A]])
+// CHECK-NEXT: ret <16 x i8> [[VQABSQ_V_I]]
+//
int8x16_t test_vqabsq_s8(int8x16_t a) {
return vqabsq_s8(a);
}
-// CHECK-LABEL: @test_vqabsq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VQABSQ_V1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqabs.v8i16(<8 x i16> %a)
-// CHECK: [[VQABSQ_V2_I:%.*]] = bitcast <8 x i16> [[VQABSQ_V1_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VQABSQ_V1_I]]
+// CHECK-LABEL: define <8 x i16> @test_vqabsq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQABSQ_V1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqabs.v8i16(<8 x i16> [[A]])
+// CHECK-NEXT: [[VQABSQ_V2_I:%.*]] = bitcast <8 x i16> [[VQABSQ_V1_I]] to <16 x i8>
+// CHECK-NEXT: ret <8 x i16> [[VQABSQ_V1_I]]
+//
int16x8_t test_vqabsq_s16(int16x8_t a) {
return vqabsq_s16(a);
}
-// CHECK-LABEL: @test_vqabsq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VQABSQ_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqabs.v4i32(<4 x i32> %a)
-// CHECK: [[VQABSQ_V2_I:%.*]] = bitcast <4 x i32> [[VQABSQ_V1_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VQABSQ_V1_I]]
+// CHECK-LABEL: define <4 x i32> @test_vqabsq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQABSQ_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqabs.v4i32(<4 x i32> [[A]])
+// CHECK-NEXT: [[VQABSQ_V2_I:%.*]] = bitcast <4 x i32> [[VQABSQ_V1_I]] to <16 x i8>
+// CHECK-NEXT: ret <4 x i32> [[VQABSQ_V1_I]]
+//
int32x4_t test_vqabsq_s32(int32x4_t a) {
return vqabsq_s32(a);
}
-// CHECK-LABEL: @test_vqadd_s8(
-// CHECK: [[VQADD_V_I:%.*]] = call <8 x i8> @llvm.sadd.sat.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VQADD_V_I]]
+// CHECK-LABEL: define <8 x i8> @test_vqadd_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQADD_V_I:%.*]] = call <8 x i8> @llvm.sadd.sat.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VQADD_V_I]]
+//
int8x8_t test_vqadd_s8(int8x8_t a, int8x8_t b) {
return vqadd_s8(a, b);
}
-// CHECK-LABEL: @test_vqadd_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VQADD_V2_I:%.*]] = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VQADD_V3_I:%.*]] = bitcast <4 x i16> [[VQADD_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VQADD_V2_I]]
+// CHECK-LABEL: define <4 x i16> @test_vqadd_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VQADD_V2_I:%.*]] = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: [[VQADD_V3_I:%.*]] = bitcast <4 x i16> [[VQADD_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <4 x i16> [[VQADD_V2_I]]
+//
int16x4_t test_vqadd_s16(int16x4_t a, int16x4_t b) {
return vqadd_s16(a, b);
}
-// CHECK-LABEL: @test_vqadd_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VQADD_V2_I:%.*]] = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VQADD_V3_I:%.*]] = bitcast <2 x i32> [[VQADD_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VQADD_V2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vqadd_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VQADD_V2_I:%.*]] = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: [[VQADD_V3_I:%.*]] = bitcast <2 x i32> [[VQADD_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <2 x i32> [[VQADD_V2_I]]
+//
int32x2_t test_vqadd_s32(int32x2_t a, int32x2_t b) {
return vqadd_s32(a, b);
}
-// CHECK-LABEL: @test_vqadd_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VQADD_V2_I:%.*]] = call <1 x i64> @llvm.sadd.sat.v1i64(<1 x i64> %a, <1 x i64> %b)
-// CHECK: [[VQADD_V3_I:%.*]] = bitcast <1 x i64> [[VQADD_V2_I]] to <8 x i8>
-// CHECK: ret <1 x i64> [[VQADD_V2_I]]
+// CHECK-LABEL: define <1 x i64> @test_vqadd_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VQADD_V2_I:%.*]] = call <1 x i64> @llvm.sadd.sat.v1i64(<1 x i64> [[A]], <1 x i64> [[B]])
+// CHECK-NEXT: [[VQADD_V3_I:%.*]] = bitcast <1 x i64> [[VQADD_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <1 x i64> [[VQADD_V2_I]]
+//
int64x1_t test_vqadd_s64(int64x1_t a, int64x1_t b) {
return vqadd_s64(a, b);
}
-// CHECK-LABEL: @test_vqadd_u8(
-// CHECK: [[VQADD_V_I:%.*]] = call <8 x i8> @llvm.uadd.sat.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VQADD_V_I]]
+// CHECK-LABEL: define <8 x i8> @test_vqadd_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQADD_V_I:%.*]] = call <8 x i8> @llvm.uadd.sat.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VQADD_V_I]]
+//
uint8x8_t test_vqadd_u8(uint8x8_t a, uint8x8_t b) {
return vqadd_u8(a, b);
}
-// CHECK-LABEL: @test_vqadd_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VQADD_V2_I:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VQADD_V3_I:%.*]] = bitcast <4 x i16> [[VQADD_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VQADD_V2_I]]
+// CHECK-LABEL: define <4 x i16> @test_vqadd_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VQADD_V2_I:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: [[VQADD_V3_I:%.*]] = bitcast <4 x i16> [[VQADD_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <4 x i16> [[VQADD_V2_I]]
+//
uint16x4_t test_vqadd_u16(uint16x4_t a, uint16x4_t b) {
return vqadd_u16(a, b);
}
-// CHECK-LABEL: @test_vqadd_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VQADD_V2_I:%.*]] = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VQADD_V3_I:%.*]] = bitcast <2 x i32> [[VQADD_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VQADD_V2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vqadd_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VQADD_V2_I:%.*]] = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: [[VQADD_V3_I:%.*]] = bitcast <2 x i32> [[VQADD_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <2 x i32> [[VQADD_V2_I]]
+//
uint32x2_t test_vqadd_u32(uint32x2_t a, uint32x2_t b) {
return vqadd_u32(a, b);
}
-// CHECK-LABEL: @test_vqadd_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VQADD_V2_I:%.*]] = call <1 x i64> @llvm.uadd.sat.v1i64(<1 x i64> %a, <1 x i64> %b)
-// CHECK: [[VQADD_V3_I:%.*]] = bitcast <1 x i64> [[VQADD_V2_I]] to <8 x i8>
-// CHECK: ret <1 x i64> [[VQADD_V2_I]]
+// CHECK-LABEL: define <1 x i64> @test_vqadd_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VQADD_V2_I:%.*]] = call <1 x i64> @llvm.uadd.sat.v1i64(<1 x i64> [[A]], <1 x i64> [[B]])
+// CHECK-NEXT: [[VQADD_V3_I:%.*]] = bitcast <1 x i64> [[VQADD_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <1 x i64> [[VQADD_V2_I]]
+//
uint64x1_t test_vqadd_u64(uint64x1_t a, uint64x1_t b) {
return vqadd_u64(a, b);
}
-// CHECK-LABEL: @test_vqaddq_s8(
-// CHECK: [[VQADDQ_V_I:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <16 x i8> [[VQADDQ_V_I]]
+// CHECK-LABEL: define <16 x i8> @test_vqaddq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQADDQ_V_I:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <16 x i8> [[VQADDQ_V_I]]
+//
int8x16_t test_vqaddq_s8(int8x16_t a, int8x16_t b) {
return vqaddq_s8(a, b);
}
-// CHECK-LABEL: @test_vqaddq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VQADDQ_V2_I:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: [[VQADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VQADDQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VQADDQ_V2_I]]
+// CHECK-LABEL: define <8 x i16> @test_vqaddq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VQADDQ_V2_I:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: [[VQADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VQADDQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: ret <8 x i16> [[VQADDQ_V2_I]]
+//
int16x8_t test_vqaddq_s16(int16x8_t a, int16x8_t b) {
return vqaddq_s16(a, b);
}
-// CHECK-LABEL: @test_vqaddq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VQADDQ_V2_I:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VQADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VQADDQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VQADDQ_V2_I]]
+// CHECK-LABEL: define <4 x i32> @test_vqaddq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VQADDQ_V2_I:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: [[VQADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VQADDQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: ret <4 x i32> [[VQADDQ_V2_I]]
+//
int32x4_t test_vqaddq_s32(int32x4_t a, int32x4_t b) {
return vqaddq_s32(a, b);
}
-// CHECK-LABEL: @test_vqaddq_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VQADDQ_V2_I:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> %a, <2 x i64> %b)
-// CHECK: [[VQADDQ_V3_I:%.*]] = bitcast <2 x i64> [[VQADDQ_V2_I]] to <16 x i8>
-// CHECK: ret <2 x i64> [[VQADDQ_V2_I]]
+// CHECK-LABEL: define <2 x i64> @test_vqaddq_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VQADDQ_V2_I:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[A]], <2 x i64> [[B]])
+// CHECK-NEXT: [[VQADDQ_V3_I:%.*]] = bitcast <2 x i64> [[VQADDQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: ret <2 x i64> [[VQADDQ_V2_I]]
+//
int64x2_t test_vqaddq_s64(int64x2_t a, int64x2_t b) {
return vqaddq_s64(a, b);
}
-// CHECK-LABEL: @test_vqaddq_u8(
-// CHECK: [[VQADDQ_V_I:%.*]] = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <16 x i8> [[VQADDQ_V_I]]
+// CHECK-LABEL: define <16 x i8> @test_vqaddq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQADDQ_V_I:%.*]] = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <16 x i8> [[VQADDQ_V_I]]
+//
uint8x16_t test_vqaddq_u8(uint8x16_t a, uint8x16_t b) {
return vqaddq_u8(a, b);
}
-// CHECK-LABEL: @test_vqaddq_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VQADDQ_V2_I:%.*]] = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: [[VQADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VQADDQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VQADDQ_V2_I]]
+// CHECK-LABEL: define <8 x i16> @test_vqaddq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VQADDQ_V2_I:%.*]] = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: [[VQADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VQADDQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: ret <8 x i16> [[VQADDQ_V2_I]]
+//
uint16x8_t test_vqaddq_u16(uint16x8_t a, uint16x8_t b) {
return vqaddq_u16(a, b);
}
-// CHECK-LABEL: @test_vqaddq_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VQADDQ_V2_I:%.*]] = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VQADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VQADDQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VQADDQ_V2_I]]
+// CHECK-LABEL: define <4 x i32> @test_vqaddq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VQADDQ_V2_I:%.*]] = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: [[VQADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VQADDQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: ret <4 x i32> [[VQADDQ_V2_I]]
+//
uint32x4_t test_vqaddq_u32(uint32x4_t a, uint32x4_t b) {
return vqaddq_u32(a, b);
}
-// CHECK-LABEL: @test_vqaddq_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VQADDQ_V2_I:%.*]] = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> %a, <2 x i64> %b)
-// CHECK: [[VQADDQ_V3_I:%.*]] = bitcast <2 x i64> [[VQADDQ_V2_I]] to <16 x i8>
-// CHECK: ret <2 x i64> [[VQADDQ_V2_I]]
+// CHECK-LABEL: define <2 x i64> @test_vqaddq_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VQADDQ_V2_I:%.*]] = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> [[A]], <2 x i64> [[B]])
+// CHECK-NEXT: [[VQADDQ_V3_I:%.*]] = bitcast <2 x i64> [[VQADDQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: ret <2 x i64> [[VQADDQ_V2_I]]
+//
uint64x2_t test_vqaddq_u64(uint64x2_t a, uint64x2_t b) {
return vqaddq_u64(a, b);
}
-// CHECK-LABEL: @test_vqdmlal_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %c to <8 x i8>
-// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> %c)
-// CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]])
-// CHECK: ret <4 x i32> [[VQDMLAL_V3_I]]
+// CHECK-LABEL: define <4 x i32> @test_vqdmlal_s16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8>
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[C]])
+// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
+// CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]]
+//
int32x4_t test_vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
return vqdmlal_s16(a, b, c);
}
-// CHECK-LABEL: @test_vqdmlal_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %c to <8 x i8>
-// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> %c)
-// CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]])
-// CHECK: ret <2 x i64> [[VQDMLAL_V3_I]]
+// CHECK-LABEL: define <2 x i64> @test_vqdmlal_s32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8>
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[C]])
+// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
+// CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]]
+//
int64x2_t test_vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
return vqdmlal_s32(a, b, c);
}
-// CHECK-LABEL: @test_vqdmlal_lane_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
-// CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
-// CHECK: ret <4 x i32> [[VQDMLAL_V3_I]]
+// CHECK-LABEL: define <4 x i32> @test_vqdmlal_lane_s16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
+// CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]]
+//
int32x4_t test_vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
return vqdmlal_lane_s16(a, b, c, 3);
}
-// CHECK-LABEL: @test_vqdmlal_lane_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
-// CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
-// CHECK: ret <2 x i64> [[VQDMLAL_V3_I]]
+// CHECK-LABEL: define <2 x i64> @test_vqdmlal_lane_s32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
+// CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]]
+//
int64x2_t test_vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
return vqdmlal_lane_s32(a, b, c, 1);
}
-// CHECK-LABEL: @test_vqdmlal_n_s16(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 %c, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
-// CHECK: [[VQDMLAL5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> [[VECINIT3_I]])
-// CHECK: [[VQDMLAL_V6_I:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL5_I]])
-// CHECK: ret <4 x i32> [[VQDMLAL_V6_I]]
+// CHECK-LABEL: define <4 x i32> @test_vqdmlal_n_s16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], i16 noundef signext [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]])
+// CHECK-NEXT: [[VQDMLAL_V3_I_I:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I_I]])
+// CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I_I]]
+//
int32x4_t test_vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
return vqdmlal_n_s16(a, b, c);
}
-// CHECK-LABEL: @test_vqdmlal_n_s32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 %c, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
-// CHECK: [[VQDMLAL3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> [[VECINIT1_I]])
-// CHECK: [[VQDMLAL_V4_I:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL3_I]])
-// CHECK: ret <2 x i64> [[VQDMLAL_V4_I]]
+// CHECK-LABEL: define <2 x i64> @test_vqdmlal_n_s32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], i32 noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]])
+// CHECK-NEXT: [[VQDMLAL_V3_I_I:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I_I]])
+// CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I_I]]
+//
int64x2_t test_vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
return vqdmlal_n_s32(a, b, c);
}
-// CHECK-LABEL: @test_vqdmlsl_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %c to <8 x i8>
-// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> %c)
-// CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]])
-// CHECK: ret <4 x i32> [[VQDMLSL_V3_I]]
+// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_s16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8>
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[C]])
+// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
+// CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]]
+//
int32x4_t test_vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
return vqdmlsl_s16(a, b, c);
}
-// CHECK-LABEL: @test_vqdmlsl_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %c to <8 x i8>
-// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> %c)
-// CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]])
-// CHECK: ret <2 x i64> [[VQDMLSL_V3_I]]
+// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_s32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8>
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[C]])
+// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
+// CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]]
+//
int64x2_t test_vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
return vqdmlsl_s32(a, b, c);
}
-// CHECK-LABEL: @test_vqdmlsl_lane_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
-// CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
-// CHECK: ret <4 x i32> [[VQDMLSL_V3_I]]
+// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_lane_s16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
+// CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]]
+//
int32x4_t test_vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
return vqdmlsl_lane_s16(a, b, c, 3);
}
-// CHECK-LABEL: @test_vqdmlsl_lane_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
-// CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
-// CHECK: ret <2 x i64> [[VQDMLSL_V3_I]]
+// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_lane_s32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
+// CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]]
+//
int64x2_t test_vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
return vqdmlsl_lane_s32(a, b, c, 1);
}
-// CHECK-LABEL: @test_vqdmlsl_n_s16(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 %c, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
-// CHECK: [[VQDMLAL5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> [[VECINIT3_I]])
-// CHECK: [[VQDMLSL_V6_I:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL5_I]])
-// CHECK: ret <4 x i32> [[VQDMLSL_V6_I]]
+// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_n_s16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], i16 noundef signext [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]])
+// CHECK-NEXT: [[VQDMLSL_V3_I_I:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I_I]])
+// CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I_I]]
+//
int32x4_t test_vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
return vqdmlsl_n_s16(a, b, c);
}
-// CHECK-LABEL: @test_vqdmlsl_n_s32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 %c, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
-// CHECK: [[VQDMLAL3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> [[VECINIT1_I]])
-// CHECK: [[VQDMLSL_V4_I:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL3_I]])
-// CHECK: ret <2 x i64> [[VQDMLSL_V4_I]]
+// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_n_s32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], i32 noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]])
+// CHECK-NEXT: [[VQDMLSL_V3_I_I:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I_I]])
+// CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I_I]]
+//
int64x2_t test_vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
return vqdmlsl_n_s32(a, b, c);
}
-// CHECK-LABEL: @test_vqdmulh_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VQDMULH_V2_I]]
+// CHECK-LABEL: define <4 x i16> @test_vqdmulh_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <4 x i16> [[VQDMULH_V2_I]]
+//
int16x4_t test_vqdmulh_s16(int16x4_t a, int16x4_t b) {
return vqdmulh_s16(a, b);
}
-// CHECK-LABEL: @test_vqdmulh_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VQDMULH_V2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vqdmulh_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <2 x i32> [[VQDMULH_V2_I]]
+//
int32x2_t test_vqdmulh_s32(int32x2_t a, int32x2_t b) {
return vqdmulh_s32(a, b);
}
-// CHECK-LABEL: @test_vqdmulhq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VQDMULHQ_V2_I]]
+// CHECK-LABEL: define <8 x i16> @test_vqdmulhq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: ret <8 x i16> [[VQDMULHQ_V2_I]]
+//
int16x8_t test_vqdmulhq_s16(int16x8_t a, int16x8_t b) {
return vqdmulhq_s16(a, b);
}
-// CHECK-LABEL: @test_vqdmulhq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VQDMULHQ_V2_I]]
+// CHECK-LABEL: define <4 x i32> @test_vqdmulhq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: ret <4 x i32> [[VQDMULHQ_V2_I]]
+//
int32x4_t test_vqdmulhq_s32(int32x4_t a, int32x4_t b) {
return vqdmulhq_s32(a, b);
}
-// CHECK-LABEL: @test_vqdmulh_lane_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[LANE]])
-// CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VQDMULH_V2_I]]
+// CHECK-LABEL: define <4 x i16> @test_vqdmulh_lane_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <4 x i16> [[VQDMULH_V2_I]]
+//
int16x4_t test_vqdmulh_lane_s16(int16x4_t a, int16x4_t b) {
return vqdmulh_lane_s16(a, b, 3);
}
-// CHECK-LABEL: @test_vqdmulh_lane_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[LANE]])
-// CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VQDMULH_V2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vqdmulh_lane_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <2 x i32> [[VQDMULH_V2_I]]
+//
int32x2_t test_vqdmulh_lane_s32(int32x2_t a, int32x2_t b) {
return vqdmulh_lane_s32(a, b, 1);
}
-// CHECK-LABEL: @test_vqdmulhq_lane_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i16> [[LANE]] to <16 x i8>
-// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[LANE]])
-// CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VQDMULHQ_V2_I]]
+// CHECK-LABEL: define <8 x i16> @test_vqdmulhq_lane_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[LANE]] to <16 x i8>
+// CHECK-NEXT: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[LANE]])
+// CHECK-NEXT: [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: ret <8 x i16> [[VQDMULHQ_V2_I]]
+//
int16x8_t test_vqdmulhq_lane_s16(int16x8_t a, int16x4_t b) {
return vqdmulhq_lane_s16(a, b, 3);
}
-// CHECK-LABEL: @test_vqdmulhq_lane_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8>
-// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[LANE]])
-// CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VQDMULHQ_V2_I]]
+// CHECK-LABEL: define <4 x i32> @test_vqdmulhq_lane_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8>
+// CHECK-NEXT: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[LANE]])
+// CHECK-NEXT: [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: ret <4 x i32> [[VQDMULHQ_V2_I]]
+//
int32x4_t test_vqdmulhq_lane_s32(int32x4_t a, int32x2_t b) {
return vqdmulhq_lane_s32(a, b, 1);
}
-// CHECK-LABEL: @test_vqdmulh_n_s16(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 %b, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
-// CHECK: [[VQDMULH_V5_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> %a, <4 x i16> [[VECINIT3_I]])
-// CHECK: [[VQDMULH_V6_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V5_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VQDMULH_V5_I]]
+// CHECK-LABEL: define <4 x i16> @test_vqdmulh_n_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], i16 noundef signext [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK-NEXT: [[VQDMULH_V2_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[VECINIT3_I]])
+// CHECK-NEXT: [[VQDMULH_V3_I_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I_I]] to <8 x i8>
+// CHECK-NEXT: ret <4 x i16> [[VQDMULH_V2_I_I]]
+//
int16x4_t test_vqdmulh_n_s16(int16x4_t a, int16_t b) {
return vqdmulh_n_s16(a, b);
}
-// CHECK-LABEL: @test_vqdmulh_n_s32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 %b, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
-// CHECK: [[VQDMULH_V3_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> %a, <2 x i32> [[VECINIT1_I]])
-// CHECK: [[VQDMULH_V4_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V3_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VQDMULH_V3_I]]
+// CHECK-LABEL: define <2 x i32> @test_vqdmulh_n_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK-NEXT: [[VQDMULH_V2_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[VECINIT1_I]])
+// CHECK-NEXT: [[VQDMULH_V3_I_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I_I]] to <8 x i8>
+// CHECK-NEXT: ret <2 x i32> [[VQDMULH_V2_I_I]]
+//
int32x2_t test_vqdmulh_n_s32(int32x2_t a, int32_t b) {
return vqdmulh_n_s32(a, b);
}
-// CHECK-LABEL: @test_vqdmulhq_n_s16(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 %b, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3
-// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4
-// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5
-// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6
-// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to <16 x i8>
-// CHECK: [[VQDMULHQ_V9_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> %a, <8 x i16> [[VECINIT7_I]])
-// CHECK: [[VQDMULHQ_V10_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V9_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VQDMULHQ_V9_I]]
+// CHECK-LABEL: define <8 x i16> @test_vqdmulhq_n_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], i16 noundef signext [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[B]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[B]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
+// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[B]], i32 4
+// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[B]], i32 5
+// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[B]], i32 6
+// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[B]], i32 7
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to <16 x i8>
+// CHECK-NEXT: [[VQDMULHQ_V2_I_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[VECINIT7_I]])
+// CHECK-NEXT: [[VQDMULHQ_V3_I_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I_I]] to <16 x i8>
+// CHECK-NEXT: ret <8 x i16> [[VQDMULHQ_V2_I_I]]
+//
int16x8_t test_vqdmulhq_n_s16(int16x8_t a, int16_t b) {
return vqdmulhq_n_s16(a, b);
}
-// CHECK-LABEL: @test_vqdmulhq_n_s32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 %b, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT3_I]] to <16 x i8>
-// CHECK: [[VQDMULHQ_V5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> %a, <4 x i32> [[VECINIT3_I]])
-// CHECK: [[VQDMULHQ_V6_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V5_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VQDMULHQ_V5_I]]
+// CHECK-LABEL: define <4 x i32> @test_vqdmulhq_n_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[B]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[B]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[B]], i32 2
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[B]], i32 3
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT3_I]] to <16 x i8>
+// CHECK-NEXT: [[VQDMULHQ_V2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[VECINIT3_I]])
+// CHECK-NEXT: [[VQDMULHQ_V3_I_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I_I]] to <16 x i8>
+// CHECK-NEXT: ret <4 x i32> [[VQDMULHQ_V2_I_I]]
+//
int32x4_t test_vqdmulhq_n_s32(int32x4_t a, int32_t b) {
return vqdmulhq_n_s32(a, b);
}
-// CHECK-LABEL: @test_vqdmull_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VQDMULL_V2_I]]
+// CHECK-LABEL: define <4 x i32> @test_vqdmull_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
+// CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]]
+//
int32x4_t test_vqdmull_s16(int16x4_t a, int16x4_t b) {
return vqdmull_s16(a, b);
}
-// CHECK-LABEL: @test_vqdmull_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
-// CHECK: ret <2 x i64> [[VQDMULL_V2_I]]
+// CHECK-LABEL: define <2 x i64> @test_vqdmull_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
+// CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]]
+//
int64x2_t test_vqdmull_s32(int32x2_t a, int32x2_t b) {
return vqdmull_s32(a, b);
}
-// CHECK-LABEL: @test_vqdmull_lane_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
-// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VQDMULL_V2_I]]
+// CHECK-LABEL: define <4 x i32> @test_vqdmull_lane_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
+// CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]]
+//
int32x4_t test_vqdmull_lane_s16(int16x4_t a, int16x4_t b) {
return vqdmull_lane_s16(a, b, 3);
}
-// CHECK-LABEL: @test_vqdmull_lane_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
-// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
-// CHECK: ret <2 x i64> [[VQDMULL_V2_I]]
+// CHECK-LABEL: define <2 x i64> @test_vqdmull_lane_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
+// CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]]
+//
int64x2_t test_vqdmull_lane_s32(int32x2_t a, int32x2_t b) {
return vqdmull_lane_s32(a, b, 1);
}
-// CHECK-LABEL: @test_vqdmull_n_s16(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 %b, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
-// CHECK: [[VQDMULL_V5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %a, <4 x i16> [[VECINIT3_I]])
-// CHECK: [[VQDMULL_V6_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V5_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VQDMULL_V5_I]]
+// CHECK-LABEL: define <4 x i32> @test_vqdmull_n_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], i16 noundef signext [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK-NEXT: [[VQDMULL_V2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[VECINIT3_I]])
+// CHECK-NEXT: [[VQDMULL_V3_I_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I_I]] to <16 x i8>
+// CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I_I]]
+//
int32x4_t test_vqdmull_n_s16(int16x4_t a, int16_t b) {
return vqdmull_n_s16(a, b);
}
-// CHECK-LABEL: @test_vqdmull_n_s32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 %b, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
-// CHECK: [[VQDMULL_V3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %a, <2 x i32> [[VECINIT1_I]])
-// CHECK: [[VQDMULL_V4_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V3_I]] to <16 x i8>
-// CHECK: ret <2 x i64> [[VQDMULL_V3_I]]
+// CHECK-LABEL: define <2 x i64> @test_vqdmull_n_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK-NEXT: [[VQDMULL_V2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[VECINIT1_I]])
+// CHECK-NEXT: [[VQDMULL_V3_I_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I_I]] to <16 x i8>
+// CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I_I]]
+//
int64x2_t test_vqdmull_n_s32(int32x2_t a, int32_t b) {
return vqdmull_n_s32(a, b);
}
-// CHECK-LABEL: @test_vqmovn_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VQMOVN_V1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqmovns.v8i8(<8 x i16> %a)
-// CHECK: ret <8 x i8> [[VQMOVN_V1_I]]
+// CHECK-LABEL: define <8 x i8> @test_vqmovn_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQMOVN_V1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqmovns.v8i8(<8 x i16> [[A]])
+// CHECK-NEXT: ret <8 x i8> [[VQMOVN_V1_I]]
+//
int8x8_t test_vqmovn_s16(int16x8_t a) {
return vqmovn_s16(a);
}
-// CHECK-LABEL: @test_vqmovn_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VQMOVN_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqmovns.v4i16(<4 x i32> %a)
-// CHECK: [[VQMOVN_V2_I:%.*]] = bitcast <4 x i16> [[VQMOVN_V1_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VQMOVN_V1_I]]
+// CHECK-LABEL: define <4 x i16> @test_vqmovn_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQMOVN_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqmovns.v4i16(<4 x i32> [[A]])
+// CHECK-NEXT: [[VQMOVN_V2_I:%.*]] = bitcast <4 x i16> [[VQMOVN_V1_I]] to <8 x i8>
+// CHECK-NEXT: ret <4 x i16> [[VQMOVN_V1_I]]
+//
int16x4_t test_vqmovn_s32(int32x4_t a) {
return vqmovn_s32(a);
}
-// CHECK-LABEL: @test_vqmovn_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VQMOVN_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqmovns.v2i32(<2 x i64> %a)
-// CHECK: [[VQMOVN_V2_I:%.*]] = bitcast <2 x i32> [[VQMOVN_V1_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VQMOVN_V1_I]]
+// CHECK-LABEL: define <2 x i32> @test_vqmovn_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQMOVN_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqmovns.v2i32(<2 x i64> [[A]])
+// CHECK-NEXT: [[VQMOVN_V2_I:%.*]] = bitcast <2 x i32> [[VQMOVN_V1_I]] to <8 x i8>
+// CHECK-NEXT: ret <2 x i32> [[VQMOVN_V1_I]]
+//
int32x2_t test_vqmovn_s64(int64x2_t a) {
return vqmovn_s64(a);
}
-// CHECK-LABEL: @test_vqmovn_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VQMOVN_V1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqmovnu.v8i8(<8 x i16> %a)
-// CHECK: ret <8 x i8> [[VQMOVN_V1_I]]
+// CHECK-LABEL: define <8 x i8> @test_vqmovn_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQMOVN_V1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqmovnu.v8i8(<8 x i16> [[A]])
+// CHECK-NEXT: ret <8 x i8> [[VQMOVN_V1_I]]
+//
uint8x8_t test_vqmovn_u16(uint16x8_t a) {
return vqmovn_u16(a);
}
-// CHECK-LABEL: @test_vqmovn_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VQMOVN_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqmovnu.v4i16(<4 x i32> %a)
-// CHECK: [[VQMOVN_V2_I:%.*]] = bitcast <4 x i16> [[VQMOVN_V1_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VQMOVN_V1_I]]
+// CHECK-LABEL: define <4 x i16> @test_vqmovn_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQMOVN_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqmovnu.v4i16(<4 x i32> [[A]])
+// CHECK-NEXT: [[VQMOVN_V2_I:%.*]] = bitcast <4 x i16> [[VQMOVN_V1_I]] to <8 x i8>
+// CHECK-NEXT: ret <4 x i16> [[VQMOVN_V1_I]]
+//
uint16x4_t test_vqmovn_u32(uint32x4_t a) {
return vqmovn_u32(a);
}
-// CHECK-LABEL: @test_vqmovn_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VQMOVN_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqmovnu.v2i32(<2 x i64> %a)
-// CHECK: [[VQMOVN_V2_I:%.*]] = bitcast <2 x i32> [[VQMOVN_V1_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VQMOVN_V1_I]]
+// CHECK-LABEL: define <2 x i32> @test_vqmovn_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQMOVN_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqmovnu.v2i32(<2 x i64> [[A]])
+// CHECK-NEXT: [[VQMOVN_V2_I:%.*]] = bitcast <2 x i32> [[VQMOVN_V1_I]] to <8 x i8>
+// CHECK-NEXT: ret <2 x i32> [[VQMOVN_V1_I]]
+//
uint32x2_t test_vqmovn_u64(uint64x2_t a) {
return vqmovn_u64(a);
}
-// CHECK-LABEL: @test_vqmovun_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VQMOVUN_V1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqmovnsu.v8i8(<8 x i16> %a)
-// CHECK: ret <8 x i8> [[VQMOVUN_V1_I]]
+// CHECK-LABEL: define <8 x i8> @test_vqmovun_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQMOVUN_V1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqmovnsu.v8i8(<8 x i16> [[A]])
+// CHECK-NEXT: ret <8 x i8> [[VQMOVUN_V1_I]]
+//
uint8x8_t test_vqmovun_s16(int16x8_t a) {
return vqmovun_s16(a);
}
-// CHECK-LABEL: @test_vqmovun_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VQMOVUN_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqmovnsu.v4i16(<4 x i32> %a)
-// CHECK: [[VQMOVUN_V2_I:%.*]] = bitcast <4 x i16> [[VQMOVUN_V1_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VQMOVUN_V1_I]]
+// CHECK-LABEL: define <4 x i16> @test_vqmovun_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQMOVUN_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqmovnsu.v4i16(<4 x i32> [[A]])
+// CHECK-NEXT: [[VQMOVUN_V2_I:%.*]] = bitcast <4 x i16> [[VQMOVUN_V1_I]] to <8 x i8>
+// CHECK-NEXT: ret <4 x i16> [[VQMOVUN_V1_I]]
+//
uint16x4_t test_vqmovun_s32(int32x4_t a) {
return vqmovun_s32(a);
}
-// CHECK-LABEL: @test_vqmovun_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VQMOVUN_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqmovnsu.v2i32(<2 x i64> %a)
-// CHECK: [[VQMOVUN_V2_I:%.*]] = bitcast <2 x i32> [[VQMOVUN_V1_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VQMOVUN_V1_I]]
+// CHECK-LABEL: define <2 x i32> @test_vqmovun_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQMOVUN_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqmovnsu.v2i32(<2 x i64> [[A]])
+// CHECK-NEXT: [[VQMOVUN_V2_I:%.*]] = bitcast <2 x i32> [[VQMOVUN_V1_I]] to <8 x i8>
+// CHECK-NEXT: ret <2 x i32> [[VQMOVUN_V1_I]]
+//
uint32x2_t test_vqmovun_s64(int64x2_t a) {
return vqmovun_s64(a);
}
-// CHECK-LABEL: @test_vqneg_s8(
-// CHECK: [[VQNEG_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqneg.v8i8(<8 x i8> %a)
-// CHECK: ret <8 x i8> [[VQNEG_V_I]]
+// CHECK-LABEL: define <8 x i8> @test_vqneg_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQNEG_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqneg.v8i8(<8 x i8> [[A]])
+// CHECK-NEXT: ret <8 x i8> [[VQNEG_V_I]]
+//
int8x8_t test_vqneg_s8(int8x8_t a) {
return vqneg_s8(a);
}
-// CHECK-LABEL: @test_vqneg_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VQNEG_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqneg.v4i16(<4 x i16> %a)
-// CHECK: [[VQNEG_V2_I:%.*]] = bitcast <4 x i16> [[VQNEG_V1_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VQNEG_V1_I]]
+// CHECK-LABEL: define <4 x i16> @test_vqneg_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VQNEG_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqneg.v4i16(<4 x i16> [[A]])
+// CHECK-NEXT: [[VQNEG_V2_I:%.*]] = bitcast <4 x i16> [[VQNEG_V1_I]] to <8 x i8>
+// CHECK-NEXT: ret <4 x i16> [[VQNEG_V1_I]]
+//
int16x4_t test_vqneg_s16(int16x4_t a) {
return vqneg_s16(a);
}
-// CHECK-LABEL: @test_vqneg_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VQNEG_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqneg.v2i32(<2 x i32> %a)
-// CHECK: [[VQNEG_V2_I:%.*]] = bitcast <2 x i32> [[VQNEG_V1_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VQNEG_V1_I]]
+// CHECK-LABEL: define <2 x i32> @test_vqneg_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VQNEG_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqneg.v2i32(<2 x i32> [[A]])
+// CHECK-NEXT: [[VQNEG_V2_I:%.*]] = bitcast <2 x i32> [[VQNEG_V1_I]] to <8 x i8>
+// CHECK-NEXT: ret <2 x i32> [[VQNEG_V1_I]]
+//
int32x2_t test_vqneg_s32(int32x2_t a) {
return vqneg_s32(a);
}
-// CHECK-LABEL: @test_vqnegq_s8(
-// CHECK: [[VQNEGQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqneg.v16i8(<16 x i8> %a)
-// CHECK: ret <16 x i8> [[VQNEGQ_V_I]]
+// CHECK-LABEL: define <16 x i8> @test_vqnegq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQNEGQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqneg.v16i8(<16 x i8> [[A]])
+// CHECK-NEXT: ret <16 x i8> [[VQNEGQ_V_I]]
+//
int8x16_t test_vqnegq_s8(int8x16_t a) {
return vqnegq_s8(a);
}
-// CHECK-LABEL: @test_vqnegq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VQNEGQ_V1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqneg.v8i16(<8 x i16> %a)
-// CHECK: [[VQNEGQ_V2_I:%.*]] = bitcast <8 x i16> [[VQNEGQ_V1_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VQNEGQ_V1_I]]
+// CHECK-LABEL: define <8 x i16> @test_vqnegq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQNEGQ_V1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqneg.v8i16(<8 x i16> [[A]])
+// CHECK-NEXT: [[VQNEGQ_V2_I:%.*]] = bitcast <8 x i16> [[VQNEGQ_V1_I]] to <16 x i8>
+// CHECK-NEXT: ret <8 x i16> [[VQNEGQ_V1_I]]
+//
int16x8_t test_vqnegq_s16(int16x8_t a) {
return vqnegq_s16(a);
}
-// CHECK-LABEL: @test_vqnegq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VQNEGQ_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqneg.v4i32(<4 x i32> %a)
-// CHECK: [[VQNEGQ_V2_I:%.*]] = bitcast <4 x i32> [[VQNEGQ_V1_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VQNEGQ_V1_I]]
+// CHECK-LABEL: define <4 x i32> @test_vqnegq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQNEGQ_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqneg.v4i32(<4 x i32> [[A]])
+// CHECK-NEXT: [[VQNEGQ_V2_I:%.*]] = bitcast <4 x i32> [[VQNEGQ_V1_I]] to <16 x i8>
+// CHECK-NEXT: ret <4 x i32> [[VQNEGQ_V1_I]]
+//
int32x4_t test_vqnegq_s32(int32x4_t a) {
return vqnegq_s32(a);
}
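The vqneg tests cover saturating negation, which has a single saturating case. A scalar sketch (illustrative helper name, assuming two's-complement lanes as in the IR above):

#include <stdint.h>

// Per-lane model of vqneg_s32: ordinary negation, except that
// INT32_MIN has no positive counterpart and saturates to INT32_MAX.
static int32_t qneg_s32_lane(int32_t x) {
  return x == INT32_MIN ? INT32_MAX : -x;
}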
-// CHECK-LABEL: @test_vqrdmulh_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VQRDMULH_V2_I]]
+// CHECK-LABEL: define <4 x i16> @test_vqrdmulh_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <4 x i16> [[VQRDMULH_V2_I]]
+//
int16x4_t test_vqrdmulh_s16(int16x4_t a, int16x4_t b) {
return vqrdmulh_s16(a, b);
}
-// CHECK-LABEL: @test_vqrdmulh_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VQRDMULH_V2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vqrdmulh_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <2 x i32> [[VQRDMULH_V2_I]]
+//
int32x2_t test_vqrdmulh_s32(int32x2_t a, int32x2_t b) {
return vqrdmulh_s32(a, b);
}
-// CHECK-LABEL: @test_vqrdmulhq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VQRDMULHQ_V2_I]]
+// CHECK-LABEL: define <8 x i16> @test_vqrdmulhq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: ret <8 x i16> [[VQRDMULHQ_V2_I]]
+//
int16x8_t test_vqrdmulhq_s16(int16x8_t a, int16x8_t b) {
return vqrdmulhq_s16(a, b);
}
-// CHECK-LABEL: @test_vqrdmulhq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VQRDMULHQ_V2_I]]
+// CHECK-LABEL: define <4 x i32> @test_vqrdmulhq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: ret <4 x i32> [[VQRDMULHQ_V2_I]]
+//
int32x4_t test_vqrdmulhq_s32(int32x4_t a, int32x4_t b) {
return vqrdmulhq_s32(a, b);
}
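vqrdmulh is the saturating rounding doubling multiply returning the high half, a Q15/Q31 fixed-point building block. A scalar sketch of the 16-bit lane semantics, assuming the standard Arm definition (double the product, add the rounding constant, take the high half, saturate); the helper name is illustrative:

#include <stdint.h>

// Per-lane model of vqrdmulh_s16. The wide accumulator avoids the
// int32 overflow that 2 * INT16_MIN * INT16_MIN would cause.
static int16_t qrdmulh_s16_lane(int16_t a, int16_t b) {
  int64_t p = 2 * (int64_t)a * (int64_t)b + (1 << 15);
  int64_t r = p >> 16;
  if (r > INT16_MAX) // only INT16_MIN * INT16_MIN lands here
    return INT16_MAX;
  if (r < INT16_MIN)
    return INT16_MIN;
  return (int16_t)r;
}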
-// CHECK-LABEL: @test_vqrdmulh_lane_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[LANE]])
-// CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VQRDMULH_V2_I]]
+// CHECK-LABEL: define <4 x i16> @test_vqrdmulh_lane_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <4 x i16> [[VQRDMULH_V2_I]]
+//
int16x4_t test_vqrdmulh_lane_s16(int16x4_t a, int16x4_t b) {
return vqrdmulh_lane_s16(a, b, 3);
}
-// CHECK-LABEL: @test_vqrdmulh_lane_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[LANE]])
-// CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VQRDMULH_V2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vqrdmulh_lane_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <2 x i32> [[VQRDMULH_V2_I]]
+//
int32x2_t test_vqrdmulh_lane_s32(int32x2_t a, int32x2_t b) {
return vqrdmulh_lane_s32(a, b, 1);
}
-// CHECK-LABEL: @test_vqrdmulhq_lane_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i16> [[LANE]] to <16 x i8>
-// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[LANE]])
-// CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VQRDMULHQ_V2_I]]
+// CHECK-LABEL: define <8 x i16> @test_vqrdmulhq_lane_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[LANE]] to <16 x i8>
+// CHECK-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[LANE]])
+// CHECK-NEXT: [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: ret <8 x i16> [[VQRDMULHQ_V2_I]]
+//
int16x8_t test_vqrdmulhq_lane_s16(int16x8_t a, int16x4_t b) {
return vqrdmulhq_lane_s16(a, b, 3);
}
-// CHECK-LABEL: @test_vqrdmulhq_lane_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8>
-// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[LANE]])
-// CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VQRDMULHQ_V2_I]]
+// CHECK-LABEL: define <4 x i32> @test_vqrdmulhq_lane_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8>
+// CHECK-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[LANE]])
+// CHECK-NEXT: [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: ret <4 x i32> [[VQRDMULHQ_V2_I]]
+//
int32x4_t test_vqrdmulhq_lane_s32(int32x4_t a, int32x2_t b) {
return vqrdmulhq_lane_s32(a, b, 1);
}
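The _lane_ variants differ from the plain ones only in the broadcast: the shufflevector with a constant all-equal mask in the new CHECK lines duplicates one element of b before the same vqrdmulh intrinsic is called. In source terms that is equivalent to combining the plain intrinsic with a lane dup, e.g. (sketch, using the standard arm_neon.h helpers):

#include <arm_neon.h>

// vqrdmulh_lane_s16(a, b, 3) expressed as broadcast-then-multiply.
static int16x4_t qrdmulh_lane3(int16x4_t a, int16x4_t b) {
  return vqrdmulh_s16(a, vdup_lane_s16(b, 3));
}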
-// CHECK-LABEL: @test_vqrdmulh_n_s16(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 %b, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
-// CHECK: [[VQRDMULH_V5_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %a, <4 x i16> [[VECINIT3_I]])
-// CHECK: [[VQRDMULH_V6_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V5_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VQRDMULH_V5_I]]
+// CHECK-LABEL: define <4 x i16> @test_vqrdmulh_n_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], i16 noundef signext [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK-NEXT: [[VQRDMULH_V2_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[VECINIT3_I]])
+// CHECK-NEXT: [[VQRDMULH_V3_I_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I_I]] to <8 x i8>
+// CHECK-NEXT: ret <4 x i16> [[VQRDMULH_V2_I_I]]
+//
int16x4_t test_vqrdmulh_n_s16(int16x4_t a, int16_t b) {
return vqrdmulh_n_s16(a, b);
}
-// CHECK-LABEL: @test_vqrdmulh_n_s32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 %b, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
-// CHECK: [[VQRDMULH_V3_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %a, <2 x i32> [[VECINIT1_I]])
-// CHECK: [[VQRDMULH_V4_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V3_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VQRDMULH_V3_I]]
+// CHECK-LABEL: define <2 x i32> @test_vqrdmulh_n_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK-NEXT: [[VQRDMULH_V2_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[VECINIT1_I]])
+// CHECK-NEXT: [[VQRDMULH_V3_I_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I_I]] to <8 x i8>
+// CHECK-NEXT: ret <2 x i32> [[VQRDMULH_V2_I_I]]
+//
int32x2_t test_vqrdmulh_n_s32(int32x2_t a, int32_t b) {
return vqrdmulh_n_s32(a, b);
}
-// CHECK-LABEL: @test_vqrdmulhq_n_s16(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 %b, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3
-// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4
-// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5
-// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6
-// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to <16 x i8>
-// CHECK: [[VQRDMULHQ_V9_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %a, <8 x i16> [[VECINIT7_I]])
-// CHECK: [[VQRDMULHQ_V10_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V9_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VQRDMULHQ_V9_I]]
+// CHECK-LABEL: define <8 x i16> @test_vqrdmulhq_n_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], i16 noundef signext [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[B]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[B]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
+// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[B]], i32 4
+// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[B]], i32 5
+// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[B]], i32 6
+// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[B]], i32 7
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to <16 x i8>
+// CHECK-NEXT: [[VQRDMULHQ_V2_I_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[VECINIT7_I]])
+// CHECK-NEXT: [[VQRDMULHQ_V3_I_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I_I]] to <16 x i8>
+// CHECK-NEXT: ret <8 x i16> [[VQRDMULHQ_V2_I_I]]
+//
int16x8_t test_vqrdmulhq_n_s16(int16x8_t a, int16_t b) {
return vqrdmulhq_n_s16(a, b);
}
-// CHECK-LABEL: @test_vqrdmulhq_n_s32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 %b, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT3_I]] to <16 x i8>
-// CHECK: [[VQRDMULHQ_V5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %a, <4 x i32> [[VECINIT3_I]])
-// CHECK: [[VQRDMULHQ_V6_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V5_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VQRDMULHQ_V5_I]]
+// CHECK-LABEL: define <4 x i32> @test_vqrdmulhq_n_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[B]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[B]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[B]], i32 2
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[B]], i32 3
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT3_I]] to <16 x i8>
+// CHECK-NEXT: [[VQRDMULHQ_V2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[VECINIT3_I]])
+// CHECK-NEXT: [[VQRDMULHQ_V3_I_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I_I]] to <16 x i8>
+// CHECK-NEXT: ret <4 x i32> [[VQRDMULHQ_V2_I_I]]
+//
int32x4_t test_vqrdmulhq_n_s32(int32x4_t a, int32_t b) {
return vqrdmulhq_n_s32(a, b);
}
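Likewise the _n_ variants splat a scalar operand, which is the insertelement chain in the CHECK lines, and then call the plain intrinsic. The renamed values (VQRDMULH_V2_I_I instead of VQRDMULH_V5_I) presumably reflect the _n_ wrapper being inlined on top of the plain one, hence the doubled _I suffix; that renaming accounts for most of the churn in these hunks. Equivalent source formulation (sketch):

#include <arm_neon.h>

// vqrdmulh_n_s16(a, b) expressed as splat-then-multiply.
static int16x4_t qrdmulh_splat(int16x4_t a, int16_t b) {
  return vqrdmulh_s16(a, vdup_n_s16(b));
}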
-// CHECK-LABEL: @test_vqrshl_s8(
-// CHECK: [[VQRSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqrshifts.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VQRSHL_V_I]]
+// CHECK-LABEL: define <8 x i8> @test_vqrshl_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqrshifts.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VQRSHL_V_I]]
+//
int8x8_t test_vqrshl_s8(int8x8_t a, int8x8_t b) {
return vqrshl_s8(a, b);
}
-// CHECK-LABEL: @test_vqrshl_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VQRSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrshifts.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VQRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQRSHL_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VQRSHL_V2_I]]
+// CHECK-LABEL: define <4 x i16> @test_vqrshl_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VQRSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrshifts.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: [[VQRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQRSHL_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <4 x i16> [[VQRSHL_V2_I]]
+//
int16x4_t test_vqrshl_s16(int16x4_t a, int16x4_t b) {
return vqrshl_s16(a, b);
}
-// CHECK-LABEL: @test_vqrshl_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VQRSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrshifts.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VQRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQRSHL_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VQRSHL_V2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vqrshl_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VQRSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrshifts.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: [[VQRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQRSHL_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <2 x i32> [[VQRSHL_V2_I]]
+//
int32x2_t test_vqrshl_s32(int32x2_t a, int32x2_t b) {
return vqrshl_s32(a, b);
}
-// CHECK-LABEL: @test_vqrshl_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VQRSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqrshifts.v1i64(<1 x i64> %a, <1 x i64> %b)
-// CHECK: [[VQRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQRSHL_V2_I]] to <8 x i8>
-// CHECK: ret <1 x i64> [[VQRSHL_V2_I]]
+// CHECK-LABEL: define <1 x i64> @test_vqrshl_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VQRSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqrshifts.v1i64(<1 x i64> [[A]], <1 x i64> [[B]])
+// CHECK-NEXT: [[VQRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQRSHL_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <1 x i64> [[VQRSHL_V2_I]]
+//
int64x1_t test_vqrshl_s64(int64x1_t a, int64x1_t b) {
return vqrshl_s64(a, b);
}
-// CHECK-LABEL: @test_vqrshl_u8(
-// CHECK: [[VQRSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqrshiftu.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VQRSHL_V_I]]
+// CHECK-LABEL: define <8 x i8> @test_vqrshl_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqrshiftu.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VQRSHL_V_I]]
+//
uint8x8_t test_vqrshl_u8(uint8x8_t a, int8x8_t b) {
return vqrshl_u8(a, b);
}
-// CHECK-LABEL: @test_vqrshl_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VQRSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrshiftu.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VQRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQRSHL_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VQRSHL_V2_I]]
+// CHECK-LABEL: define <4 x i16> @test_vqrshl_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VQRSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrshiftu.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: [[VQRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQRSHL_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <4 x i16> [[VQRSHL_V2_I]]
+//
uint16x4_t test_vqrshl_u16(uint16x4_t a, int16x4_t b) {
return vqrshl_u16(a, b);
}
-// CHECK-LABEL: @test_vqrshl_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VQRSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrshiftu.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VQRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQRSHL_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VQRSHL_V2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vqrshl_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VQRSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrshiftu.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: [[VQRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQRSHL_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <2 x i32> [[VQRSHL_V2_I]]
+//
uint32x2_t test_vqrshl_u32(uint32x2_t a, int32x2_t b) {
return vqrshl_u32(a, b);
}
-// CHECK-LABEL: @test_vqrshl_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VQRSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqrshiftu.v1i64(<1 x i64> %a, <1 x i64> %b)
-// CHECK: [[VQRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQRSHL_V2_I]] to <8 x i8>
-// CHECK: ret <1 x i64> [[VQRSHL_V2_I]]
+// CHECK-LABEL: define <1 x i64> @test_vqrshl_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VQRSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqrshiftu.v1i64(<1 x i64> [[A]], <1 x i64> [[B]])
+// CHECK-NEXT: [[VQRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQRSHL_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <1 x i64> [[VQRSHL_V2_I]]
+//
uint64x1_t test_vqrshl_u64(uint64x1_t a, int64x1_t b) {
return vqrshl_u64(a, b);
}
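vqrshl shifts each lane of a by the signed count in the corresponding lane of b: non-negative counts are a saturating left shift, negative counts a rounding right shift. A scalar sketch for the signed 8-bit case, valid for |shift| < 8 (larger counts have their own clamping rules not modelled here); the helper name is illustrative:

#include <stdint.h>

// Per-lane model of vqrshl_s8 for small shift counts.
static int8_t qrshl_s8_lane(int8_t x, int8_t shift) {
  if (shift >= 0) {
    int32_t r = (int32_t)x << shift;
    if (r > INT8_MAX)
      return INT8_MAX;
    if (r < INT8_MIN)
      return INT8_MIN;
    return (int8_t)r;
  }
  int32_t n = -shift;
  // Rounding right shift: add half the discarded weight, then shift.
  return (int8_t)(((int32_t)x + (1 << (n - 1))) >> n);
}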
-// CHECK-LABEL: @test_vqrshlq_s8(
-// CHECK: [[VQRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqrshifts.v16i8(<16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <16 x i8> [[VQRSHLQ_V_I]]
+// CHECK-LABEL: define <16 x i8> @test_vqrshlq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqrshifts.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <16 x i8> [[VQRSHLQ_V_I]]
+//
int8x16_t test_vqrshlq_s8(int8x16_t a, int8x16_t b) {
return vqrshlq_s8(a, b);
}
-// CHECK-LABEL: @test_vqrshlq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrshifts.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: [[VQRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VQRSHLQ_V2_I]]
+// CHECK-LABEL: define <8 x i16> @test_vqrshlq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VQRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrshifts.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: [[VQRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRSHLQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: ret <8 x i16> [[VQRSHLQ_V2_I]]
+//
int16x8_t test_vqrshlq_s16(int16x8_t a, int16x8_t b) {
return vqrshlq_s16(a, b);
}
-// CHECK-LABEL: @test_vqrshlq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrshifts.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VQRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VQRSHLQ_V2_I]]
+// CHECK-LABEL: define <4 x i32> @test_vqrshlq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VQRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrshifts.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: [[VQRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRSHLQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: ret <4 x i32> [[VQRSHLQ_V2_I]]
+//
int32x4_t test_vqrshlq_s32(int32x4_t a, int32x4_t b) {
return vqrshlq_s32(a, b);
}
-// CHECK-LABEL: @test_vqrshlq_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqrshifts.v2i64(<2 x i64> %a, <2 x i64> %b)
-// CHECK: [[VQRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQRSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <2 x i64> [[VQRSHLQ_V2_I]]
+// CHECK-LABEL: define <2 x i64> @test_vqrshlq_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VQRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqrshifts.v2i64(<2 x i64> [[A]], <2 x i64> [[B]])
+// CHECK-NEXT: [[VQRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQRSHLQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: ret <2 x i64> [[VQRSHLQ_V2_I]]
+//
int64x2_t test_vqrshlq_s64(int64x2_t a, int64x2_t b) {
return vqrshlq_s64(a, b);
}
-// CHECK-LABEL: @test_vqrshlq_u8(
-// CHECK: [[VQRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqrshiftu.v16i8(<16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <16 x i8> [[VQRSHLQ_V_I]]
+// CHECK-LABEL: define <16 x i8> @test_vqrshlq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqrshiftu.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <16 x i8> [[VQRSHLQ_V_I]]
+//
uint8x16_t test_vqrshlq_u8(uint8x16_t a, int8x16_t b) {
return vqrshlq_u8(a, b);
}
-// CHECK-LABEL: @test_vqrshlq_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrshiftu.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: [[VQRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VQRSHLQ_V2_I]]
+// CHECK-LABEL: define <8 x i16> @test_vqrshlq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VQRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrshiftu.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: [[VQRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRSHLQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: ret <8 x i16> [[VQRSHLQ_V2_I]]
+//
uint16x8_t test_vqrshlq_u16(uint16x8_t a, int16x8_t b) {
return vqrshlq_u16(a, b);
}
-// CHECK-LABEL: @test_vqrshlq_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrshiftu.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VQRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VQRSHLQ_V2_I]]
+// CHECK-LABEL: define <4 x i32> @test_vqrshlq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VQRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrshiftu.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: [[VQRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRSHLQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: ret <4 x i32> [[VQRSHLQ_V2_I]]
+//
uint32x4_t test_vqrshlq_u32(uint32x4_t a, int32x4_t b) {
return vqrshlq_u32(a, b);
}
-// CHECK-LABEL: @test_vqrshlq_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqrshiftu.v2i64(<2 x i64> %a, <2 x i64> %b)
-// CHECK: [[VQRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQRSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <2 x i64> [[VQRSHLQ_V2_I]]
+// CHECK-LABEL: define <2 x i64> @test_vqrshlq_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VQRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqrshiftu.v2i64(<2 x i64> [[A]], <2 x i64> [[B]])
+// CHECK-NEXT: [[VQRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQRSHLQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: ret <2 x i64> [[VQRSHLQ_V2_I]]
+//
uint64x2_t test_vqrshlq_u64(uint64x2_t a, int64x2_t b) {
return vqrshlq_u64(a, b);
}
-// CHECK-LABEL: @test_vqrshrn_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VQRSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqrshiftns.v8i8(<8 x i16> [[VQRSHRN_N]], <8 x i16> splat (i16 -1))
-// CHECK: ret <8 x i8> [[VQRSHRN_N1]]
+// CHECK-LABEL: define <8 x i8> @test_vqrshrn_n_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VQRSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqrshiftns.v8i8(<8 x i16> [[VQRSHRN_N]], <8 x i16> splat (i16 -1))
+// CHECK-NEXT: ret <8 x i8> [[VQRSHRN_N1]]
+//
int8x8_t test_vqrshrn_n_s16(int16x8_t a) {
return vqrshrn_n_s16(a, 1);
}
-// CHECK-LABEL: @test_vqrshrn_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQRSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqrshiftns.v4i16(<4 x i32> [[VQRSHRN_N]], <4 x i32> splat (i32 -1))
-// CHECK: ret <4 x i16> [[VQRSHRN_N1]]
+// CHECK-LABEL: define <4 x i16> @test_vqrshrn_n_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VQRSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqrshiftns.v4i16(<4 x i32> [[VQRSHRN_N]], <4 x i32> splat (i32 -1))
+// CHECK-NEXT: ret <4 x i16> [[VQRSHRN_N1]]
+//
int16x4_t test_vqrshrn_n_s32(int32x4_t a) {
return vqrshrn_n_s32(a, 1);
}
-// CHECK-LABEL: @test_vqrshrn_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VQRSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqrshiftns.v2i32(<2 x i64> [[VQRSHRN_N]], <2 x i64> splat (i64 -1))
-// CHECK: ret <2 x i32> [[VQRSHRN_N1]]
+// CHECK-LABEL: define <2 x i32> @test_vqrshrn_n_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[VQRSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqrshiftns.v2i32(<2 x i64> [[VQRSHRN_N]], <2 x i64> splat (i64 -1))
+// CHECK-NEXT: ret <2 x i32> [[VQRSHRN_N1]]
+//
int32x2_t test_vqrshrn_n_s64(int64x2_t a) {
return vqrshrn_n_s64(a, 1);
}
-// CHECK-LABEL: @test_vqrshrn_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VQRSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqrshiftnu.v8i8(<8 x i16> [[VQRSHRN_N]], <8 x i16> splat (i16 -1))
-// CHECK: ret <8 x i8> [[VQRSHRN_N1]]
+// CHECK-LABEL: define <8 x i8> @test_vqrshrn_n_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VQRSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqrshiftnu.v8i8(<8 x i16> [[VQRSHRN_N]], <8 x i16> splat (i16 -1))
+// CHECK-NEXT: ret <8 x i8> [[VQRSHRN_N1]]
+//
uint8x8_t test_vqrshrn_n_u16(uint16x8_t a) {
return vqrshrn_n_u16(a, 1);
}
-// CHECK-LABEL: @test_vqrshrn_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQRSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqrshiftnu.v4i16(<4 x i32> [[VQRSHRN_N]], <4 x i32> splat (i32 -1))
-// CHECK: ret <4 x i16> [[VQRSHRN_N1]]
+// CHECK-LABEL: define <4 x i16> @test_vqrshrn_n_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VQRSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqrshiftnu.v4i16(<4 x i32> [[VQRSHRN_N]], <4 x i32> splat (i32 -1))
+// CHECK-NEXT: ret <4 x i16> [[VQRSHRN_N1]]
+//
uint16x4_t test_vqrshrn_n_u32(uint32x4_t a) {
return vqrshrn_n_u32(a, 1);
}
-// CHECK-LABEL: @test_vqrshrn_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VQRSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqrshiftnu.v2i32(<2 x i64> [[VQRSHRN_N]], <2 x i64> splat (i64 -1))
-// CHECK: ret <2 x i32> [[VQRSHRN_N1]]
+// CHECK-LABEL: define <2 x i32> @test_vqrshrn_n_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[VQRSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqrshiftnu.v2i32(<2 x i64> [[VQRSHRN_N]], <2 x i64> splat (i64 -1))
+// CHECK-NEXT: ret <2 x i32> [[VQRSHRN_N1]]
+//
uint32x2_t test_vqrshrn_n_u64(uint64x2_t a) {
return vqrshrn_n_u64(a, 1);
}
-// CHECK-LABEL: @test_vqrshrun_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VQRSHRUN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqrshiftnsu.v8i8(<8 x i16> [[VQRSHRUN_N]], <8 x i16> splat (i16 -1))
-// CHECK: ret <8 x i8> [[VQRSHRUN_N1]]
+// CHECK-LABEL: define <8 x i8> @test_vqrshrun_n_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VQRSHRUN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqrshiftnsu.v8i8(<8 x i16> [[VQRSHRUN_N]], <8 x i16> splat (i16 -1))
+// CHECK-NEXT: ret <8 x i8> [[VQRSHRUN_N1]]
+//
uint8x8_t test_vqrshrun_n_s16(int16x8_t a) {
return vqrshrun_n_s16(a, 1);
}
-// CHECK-LABEL: @test_vqrshrun_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQRSHRUN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqrshiftnsu.v4i16(<4 x i32> [[VQRSHRUN_N]], <4 x i32> splat (i32 -1))
-// CHECK: ret <4 x i16> [[VQRSHRUN_N1]]
+// CHECK-LABEL: define <4 x i16> @test_vqrshrun_n_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VQRSHRUN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqrshiftnsu.v4i16(<4 x i32> [[VQRSHRUN_N]], <4 x i32> splat (i32 -1))
+// CHECK-NEXT: ret <4 x i16> [[VQRSHRUN_N1]]
+//
uint16x4_t test_vqrshrun_n_s32(int32x4_t a) {
return vqrshrun_n_s32(a, 1);
}
-// CHECK-LABEL: @test_vqrshrun_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VQRSHRUN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqrshiftnsu.v2i32(<2 x i64> [[VQRSHRUN_N]], <2 x i64> splat (i64 -1))
-// CHECK: ret <2 x i32> [[VQRSHRUN_N1]]
+// CHECK-LABEL: define <2 x i32> @test_vqrshrun_n_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[VQRSHRUN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqrshiftnsu.v2i32(<2 x i64> [[VQRSHRUN_N]], <2 x i64> splat (i64 -1))
+// CHECK-NEXT: ret <2 x i32> [[VQRSHRUN_N1]]
+//
uint32x2_t test_vqrshrun_n_s64(int64x2_t a) {
return vqrshrun_n_s64(a, 1);
}
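Note how the shift-by-immediate narrowing forms encode the count: vqrshrn_n_s16(a, 1) becomes a vqrshiftns call whose second operand is splat (i16 -1), i.e. the negative splat expresses a right shift by 1. The unsigned-result variant vqrshrun additionally clamps negatives to zero. A scalar sketch for 1 <= n <= 8 (illustrative helper name):

#include <stdint.h>

// Per-lane model of vqrshrun_n_s16: rounding right shift by n, then
// saturate to the unsigned 8-bit range.
static uint8_t qrshrun_n_s16_lane(int16_t x, int n) {
  int32_t r = ((int32_t)x + (1 << (n - 1))) >> n;
  if (r < 0)
    return 0;
  if (r > 0xFF)
    return 0xFF;
  return (uint8_t)r;
}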
-// CHECK-LABEL: @test_vqshl_s8(
-// CHECK: [[VQSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqshifts.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VQSHL_V_I]]
+// CHECK-LABEL: define <8 x i8> @test_vqshl_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqshifts.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VQSHL_V_I]]
+//
int8x8_t test_vqshl_s8(int8x8_t a, int8x8_t b) {
return vqshl_s8(a, b);
}
-// CHECK-LABEL: @test_vqshl_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VQSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqshifts.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VQSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQSHL_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VQSHL_V2_I]]
+// CHECK-LABEL: define <4 x i16> @test_vqshl_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VQSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqshifts.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: [[VQSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQSHL_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <4 x i16> [[VQSHL_V2_I]]
+//
int16x4_t test_vqshl_s16(int16x4_t a, int16x4_t b) {
return vqshl_s16(a, b);
}
-// CHECK-LABEL: @test_vqshl_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VQSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqshifts.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VQSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQSHL_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VQSHL_V2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vqshl_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VQSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqshifts.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: [[VQSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQSHL_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <2 x i32> [[VQSHL_V2_I]]
+//
int32x2_t test_vqshl_s32(int32x2_t a, int32x2_t b) {
return vqshl_s32(a, b);
}
-// CHECK-LABEL: @test_vqshl_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VQSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqshifts.v1i64(<1 x i64> %a, <1 x i64> %b)
-// CHECK: [[VQSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQSHL_V2_I]] to <8 x i8>
-// CHECK: ret <1 x i64> [[VQSHL_V2_I]]
+// CHECK-LABEL: define <1 x i64> @test_vqshl_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VQSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqshifts.v1i64(<1 x i64> [[A]], <1 x i64> [[B]])
+// CHECK-NEXT: [[VQSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQSHL_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <1 x i64> [[VQSHL_V2_I]]
+//
int64x1_t test_vqshl_s64(int64x1_t a, int64x1_t b) {
return vqshl_s64(a, b);
}
-// CHECK-LABEL: @test_vqshl_u8(
-// CHECK: [[VQSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqshiftu.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VQSHL_V_I]]
+// CHECK-LABEL: define <8 x i8> @test_vqshl_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqshiftu.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VQSHL_V_I]]
+//
uint8x8_t test_vqshl_u8(uint8x8_t a, int8x8_t b) {
return vqshl_u8(a, b);
}
-// CHECK-LABEL: @test_vqshl_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VQSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftu.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VQSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQSHL_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VQSHL_V2_I]]
+// CHECK-LABEL: define <4 x i16> @test_vqshl_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VQSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftu.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: [[VQSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQSHL_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <4 x i16> [[VQSHL_V2_I]]
+//
uint16x4_t test_vqshl_u16(uint16x4_t a, int16x4_t b) {
return vqshl_u16(a, b);
}
-// CHECK-LABEL: @test_vqshl_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VQSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftu.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VQSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQSHL_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VQSHL_V2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vqshl_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VQSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftu.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: [[VQSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQSHL_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <2 x i32> [[VQSHL_V2_I]]
+//
uint32x2_t test_vqshl_u32(uint32x2_t a, int32x2_t b) {
return vqshl_u32(a, b);
}
-// CHECK-LABEL: @test_vqshl_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VQSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqshiftu.v1i64(<1 x i64> %a, <1 x i64> %b)
-// CHECK: [[VQSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQSHL_V2_I]] to <8 x i8>
-// CHECK: ret <1 x i64> [[VQSHL_V2_I]]
+// CHECK-LABEL: define <1 x i64> @test_vqshl_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VQSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqshiftu.v1i64(<1 x i64> [[A]], <1 x i64> [[B]])
+// CHECK-NEXT: [[VQSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQSHL_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <1 x i64> [[VQSHL_V2_I]]
+//
uint64x1_t test_vqshl_u64(uint64x1_t a, int64x1_t b) {
return vqshl_u64(a, b);
}
-// CHECK-LABEL: @test_vqshlq_s8(
-// CHECK: [[VQSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqshifts.v16i8(<16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <16 x i8> [[VQSHLQ_V_I]]
+// CHECK-LABEL: define <16 x i8> @test_vqshlq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqshifts.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <16 x i8> [[VQSHLQ_V_I]]
+//
int8x16_t test_vqshlq_s8(int8x16_t a, int8x16_t b) {
return vqshlq_s8(a, b);
}
-// CHECK-LABEL: @test_vqshlq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VQSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqshifts.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: [[VQSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VQSHLQ_V2_I]]
+// CHECK-LABEL: define <8 x i16> @test_vqshlq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VQSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqshifts.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: [[VQSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSHLQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: ret <8 x i16> [[VQSHLQ_V2_I]]
+//
int16x8_t test_vqshlq_s16(int16x8_t a, int16x8_t b) {
return vqshlq_s16(a, b);
}
-// CHECK-LABEL: @test_vqshlq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VQSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqshifts.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VQSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VQSHLQ_V2_I]]
+// CHECK-LABEL: define <4 x i32> @test_vqshlq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VQSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqshifts.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: [[VQSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSHLQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: ret <4 x i32> [[VQSHLQ_V2_I]]
+//
int32x4_t test_vqshlq_s32(int32x4_t a, int32x4_t b) {
return vqshlq_s32(a, b);
}
-// CHECK-LABEL: @test_vqshlq_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VQSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqshifts.v2i64(<2 x i64> %a, <2 x i64> %b)
-// CHECK: [[VQSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <2 x i64> [[VQSHLQ_V2_I]]
+// CHECK-LABEL: define <2 x i64> @test_vqshlq_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VQSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqshifts.v2i64(<2 x i64> [[A]], <2 x i64> [[B]])
+// CHECK-NEXT: [[VQSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSHLQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: ret <2 x i64> [[VQSHLQ_V2_I]]
+//
int64x2_t test_vqshlq_s64(int64x2_t a, int64x2_t b) {
return vqshlq_s64(a, b);
}
-// CHECK-LABEL: @test_vqshlq_u8(
-// CHECK: [[VQSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqshiftu.v16i8(<16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <16 x i8> [[VQSHLQ_V_I]]
+// CHECK-LABEL: define <16 x i8> @test_vqshlq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqshiftu.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <16 x i8> [[VQSHLQ_V_I]]
+//
uint8x16_t test_vqshlq_u8(uint8x16_t a, int8x16_t b) {
return vqshlq_u8(a, b);
}
-// CHECK-LABEL: @test_vqshlq_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VQSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqshiftu.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: [[VQSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VQSHLQ_V2_I]]
+// CHECK-LABEL: define <8 x i16> @test_vqshlq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VQSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqshiftu.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: [[VQSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSHLQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: ret <8 x i16> [[VQSHLQ_V2_I]]
+//
uint16x8_t test_vqshlq_u16(uint16x8_t a, int16x8_t b) {
return vqshlq_u16(a, b);
}
-// CHECK-LABEL: @test_vqshlq_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VQSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqshiftu.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VQSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VQSHLQ_V2_I]]
+// CHECK-LABEL: define <4 x i32> @test_vqshlq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VQSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqshiftu.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: [[VQSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSHLQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: ret <4 x i32> [[VQSHLQ_V2_I]]
+//
uint32x4_t test_vqshlq_u32(uint32x4_t a, int32x4_t b) {
return vqshlq_u32(a, b);
}
-// CHECK-LABEL: @test_vqshlq_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VQSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqshiftu.v2i64(<2 x i64> %a, <2 x i64> %b)
-// CHECK: [[VQSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <2 x i64> [[VQSHLQ_V2_I]]
+// CHECK-LABEL: define <2 x i64> @test_vqshlq_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VQSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqshiftu.v2i64(<2 x i64> [[A]], <2 x i64> [[B]])
+// CHECK-NEXT: [[VQSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSHLQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: ret <2 x i64> [[VQSHLQ_V2_I]]
+//
uint64x2_t test_vqshlq_u64(uint64x2_t a, int64x2_t b) {
return vqshlq_u64(a, b);
}
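// The vqshl tests above exercise saturating shifts by a per-lane register
// amount. A minimal usage sketch (hypothetical helper, assuming <arm_neon.h>
// as included by this file): overflow saturates instead of wrapping.
static inline int8_t example_vqshl_saturates(void) {
  int8x8_t v = vdup_n_s8(100);   // every lane holds 100
  int8x8_t sh = vdup_n_s8(2);    // per-lane shift amount
  int8x8_t r = vqshl_s8(v, sh);  // 100 << 2 = 400 > 127, so each lane saturates to 127
  return vget_lane_s8(r, 0);     // 127
}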
-// CHECK-LABEL: @test_vqshlu_n_s8(
-// CHECK: [[VQSHLU_N:%.*]] = call <8 x i8> @llvm.arm.neon.vqshiftsu.v8i8(<8 x i8> %a, <8 x i8> splat (i8 1))
-// CHECK: ret <8 x i8> [[VQSHLU_N]]
+// CHECK-LABEL: define <8 x i8> @test_vqshlu_n_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHLU_N:%.*]] = call <8 x i8> @llvm.arm.neon.vqshiftsu.v8i8(<8 x i8> [[A]], <8 x i8> splat (i8 1))
+// CHECK-NEXT: ret <8 x i8> [[VQSHLU_N]]
+//
uint8x8_t test_vqshlu_n_s8(int8x8_t a) {
return vqshlu_n_s8(a, 1);
}
-// CHECK-LABEL: @test_vqshlu_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VQSHLU_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VQSHLU_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftsu.v4i16(<4 x i16> [[VQSHLU_N]], <4 x i16> splat (i16 1))
-// CHECK: ret <4 x i16> [[VQSHLU_N1]]
+// CHECK-LABEL: define <4 x i16> @test_vqshlu_n_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VQSHLU_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VQSHLU_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftsu.v4i16(<4 x i16> [[VQSHLU_N]], <4 x i16> splat (i16 1))
+// CHECK-NEXT: ret <4 x i16> [[VQSHLU_N1]]
+//
uint16x4_t test_vqshlu_n_s16(int16x4_t a) {
return vqshlu_n_s16(a, 1);
}
-// CHECK-LABEL: @test_vqshlu_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VQSHLU_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VQSHLU_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftsu.v2i32(<2 x i32> [[VQSHLU_N]], <2 x i32> splat (i32 1))
-// CHECK: ret <2 x i32> [[VQSHLU_N1]]
+// CHECK-LABEL: define <2 x i32> @test_vqshlu_n_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VQSHLU_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VQSHLU_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftsu.v2i32(<2 x i32> [[VQSHLU_N]], <2 x i32> splat (i32 1))
+// CHECK-NEXT: ret <2 x i32> [[VQSHLU_N1]]
+//
uint32x2_t test_vqshlu_n_s32(int32x2_t a) {
return vqshlu_n_s32(a, 1);
}
-// CHECK-LABEL: @test_vqshlu_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[VQSHLU_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VQSHLU_N1:%.*]] = call <1 x i64> @llvm.arm.neon.vqshiftsu.v1i64(<1 x i64> [[VQSHLU_N]], <1 x i64> splat (i64 1))
-// CHECK: ret <1 x i64> [[VQSHLU_N1]]
+// CHECK-LABEL: define <1 x i64> @test_vqshlu_n_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VQSHLU_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: [[VQSHLU_N1:%.*]] = call <1 x i64> @llvm.arm.neon.vqshiftsu.v1i64(<1 x i64> [[VQSHLU_N]], <1 x i64> splat (i64 1))
+// CHECK-NEXT: ret <1 x i64> [[VQSHLU_N1]]
+//
uint64x1_t test_vqshlu_n_s64(int64x1_t a) {
return vqshlu_n_s64(a, 1);
}
-// CHECK-LABEL: @test_vqshluq_n_s8(
-// CHECK: [[VQSHLU_N:%.*]] = call <16 x i8> @llvm.arm.neon.vqshiftsu.v16i8(<16 x i8> %a, <16 x i8> splat (i8 1))
-// CHECK: ret <16 x i8> [[VQSHLU_N]]
+// CHECK-LABEL: define <16 x i8> @test_vqshluq_n_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHLU_N:%.*]] = call <16 x i8> @llvm.arm.neon.vqshiftsu.v16i8(<16 x i8> [[A]], <16 x i8> splat (i8 1))
+// CHECK-NEXT: ret <16 x i8> [[VQSHLU_N]]
+//
uint8x16_t test_vqshluq_n_s8(int8x16_t a) {
return vqshluq_n_s8(a, 1);
}
-// CHECK-LABEL: @test_vqshluq_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VQSHLU_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VQSHLU_N1:%.*]] = call <8 x i16> @llvm.arm.neon.vqshiftsu.v8i16(<8 x i16> [[VQSHLU_N]], <8 x i16> splat (i16 1))
-// CHECK: ret <8 x i16> [[VQSHLU_N1]]
+// CHECK-LABEL: define <8 x i16> @test_vqshluq_n_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQSHLU_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VQSHLU_N1:%.*]] = call <8 x i16> @llvm.arm.neon.vqshiftsu.v8i16(<8 x i16> [[VQSHLU_N]], <8 x i16> splat (i16 1))
+// CHECK-NEXT: ret <8 x i16> [[VQSHLU_N1]]
+//
uint16x8_t test_vqshluq_n_s16(int16x8_t a) {
return vqshluq_n_s16(a, 1);
}
-// CHECK-LABEL: @test_vqshluq_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VQSHLU_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQSHLU_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vqshiftsu.v4i32(<4 x i32> [[VQSHLU_N]], <4 x i32> splat (i32 1))
-// CHECK: ret <4 x i32> [[VQSHLU_N1]]
+// CHECK-LABEL: define <4 x i32> @test_vqshluq_n_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQSHLU_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VQSHLU_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vqshiftsu.v4i32(<4 x i32> [[VQSHLU_N]], <4 x i32> splat (i32 1))
+// CHECK-NEXT: ret <4 x i32> [[VQSHLU_N1]]
+//
uint32x4_t test_vqshluq_n_s32(int32x4_t a) {
return vqshluq_n_s32(a, 1);
}
-// CHECK-LABEL: @test_vqshluq_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VQSHLU_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VQSHLU_N1:%.*]] = call <2 x i64> @llvm.arm.neon.vqshiftsu.v2i64(<2 x i64> [[VQSHLU_N]], <2 x i64> splat (i64 1))
-// CHECK: ret <2 x i64> [[VQSHLU_N1]]
+// CHECK-LABEL: define <2 x i64> @test_vqshluq_n_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQSHLU_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[VQSHLU_N1:%.*]] = call <2 x i64> @llvm.arm.neon.vqshiftsu.v2i64(<2 x i64> [[VQSHLU_N]], <2 x i64> splat (i64 1))
+// CHECK-NEXT: ret <2 x i64> [[VQSHLU_N1]]
+//
uint64x2_t test_vqshluq_n_s64(int64x2_t a) {
return vqshluq_n_s64(a, 1);
}
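// vqshlu_n shifts signed lanes left by an immediate but saturates to the
// unsigned range. A minimal sketch (hypothetical helper): negative inputs
// clamp to 0.
static inline uint8_t example_vqshlu_clamps_negative(void) {
  int8x8_t v = vdup_n_s8(-5);
  uint8x8_t r = vqshlu_n_s8(v, 1);  // -5 << 1 is negative; unsigned saturation yields 0
  return vget_lane_u8(r, 0);        // 0
}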
-// CHECK-LABEL: @test_vqshl_n_s8(
-// CHECK: [[VQSHL_N:%.*]] = call <8 x i8> @llvm.arm.neon.vqshifts.v8i8(<8 x i8> %a, <8 x i8> splat (i8 1))
-// CHECK: ret <8 x i8> [[VQSHL_N]]
+// CHECK-LABEL: define <8 x i8> @test_vqshl_n_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHL_N:%.*]] = call <8 x i8> @llvm.arm.neon.vqshifts.v8i8(<8 x i8> [[A]], <8 x i8> splat (i8 1))
+// CHECK-NEXT: ret <8 x i8> [[VQSHL_N]]
+//
int8x8_t test_vqshl_n_s8(int8x8_t a) {
return vqshl_n_s8(a, 1);
}
-// CHECK-LABEL: @test_vqshl_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VQSHL_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshifts.v4i16(<4 x i16> [[VQSHL_N]], <4 x i16> splat (i16 1))
-// CHECK: ret <4 x i16> [[VQSHL_N1]]
+// CHECK-LABEL: define <4 x i16> @test_vqshl_n_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshifts.v4i16(<4 x i16> [[VQSHL_N]], <4 x i16> splat (i16 1))
+// CHECK-NEXT: ret <4 x i16> [[VQSHL_N1]]
+//
int16x4_t test_vqshl_n_s16(int16x4_t a) {
return vqshl_n_s16(a, 1);
}
-// CHECK-LABEL: @test_vqshl_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VQSHL_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshifts.v2i32(<2 x i32> [[VQSHL_N]], <2 x i32> splat (i32 1))
-// CHECK: ret <2 x i32> [[VQSHL_N1]]
+// CHECK-LABEL: define <2 x i32> @test_vqshl_n_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshifts.v2i32(<2 x i32> [[VQSHL_N]], <2 x i32> splat (i32 1))
+// CHECK-NEXT: ret <2 x i32> [[VQSHL_N1]]
+//
int32x2_t test_vqshl_n_s32(int32x2_t a) {
return vqshl_n_s32(a, 1);
}
-// CHECK-LABEL: @test_vqshl_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VQSHL_N1:%.*]] = call <1 x i64> @llvm.arm.neon.vqshifts.v1i64(<1 x i64> [[VQSHL_N]], <1 x i64> splat (i64 1))
-// CHECK: ret <1 x i64> [[VQSHL_N1]]
+// CHECK-LABEL: define <1 x i64> @test_vqshl_n_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <1 x i64> @llvm.arm.neon.vqshifts.v1i64(<1 x i64> [[VQSHL_N]], <1 x i64> splat (i64 1))
+// CHECK-NEXT: ret <1 x i64> [[VQSHL_N1]]
+//
int64x1_t test_vqshl_n_s64(int64x1_t a) {
return vqshl_n_s64(a, 1);
}
-// CHECK-LABEL: @test_vqshl_n_u8(
-// CHECK: [[VQSHL_N:%.*]] = call <8 x i8> @llvm.arm.neon.vqshiftu.v8i8(<8 x i8> %a, <8 x i8> splat (i8 1))
-// CHECK: ret <8 x i8> [[VQSHL_N]]
+// CHECK-LABEL: define <8 x i8> @test_vqshl_n_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHL_N:%.*]] = call <8 x i8> @llvm.arm.neon.vqshiftu.v8i8(<8 x i8> [[A]], <8 x i8> splat (i8 1))
+// CHECK-NEXT: ret <8 x i8> [[VQSHL_N]]
+//
uint8x8_t test_vqshl_n_u8(uint8x8_t a) {
return vqshl_n_u8(a, 1);
}
-// CHECK-LABEL: @test_vqshl_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VQSHL_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftu.v4i16(<4 x i16> [[VQSHL_N]], <4 x i16> splat (i16 1))
-// CHECK: ret <4 x i16> [[VQSHL_N1]]
+// CHECK-LABEL: define <4 x i16> @test_vqshl_n_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftu.v4i16(<4 x i16> [[VQSHL_N]], <4 x i16> splat (i16 1))
+// CHECK-NEXT: ret <4 x i16> [[VQSHL_N1]]
+//
uint16x4_t test_vqshl_n_u16(uint16x4_t a) {
return vqshl_n_u16(a, 1);
}
-// CHECK-LABEL: @test_vqshl_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VQSHL_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftu.v2i32(<2 x i32> [[VQSHL_N]], <2 x i32> splat (i32 1))
-// CHECK: ret <2 x i32> [[VQSHL_N1]]
+// CHECK-LABEL: define <2 x i32> @test_vqshl_n_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftu.v2i32(<2 x i32> [[VQSHL_N]], <2 x i32> splat (i32 1))
+// CHECK-NEXT: ret <2 x i32> [[VQSHL_N1]]
+//
uint32x2_t test_vqshl_n_u32(uint32x2_t a) {
return vqshl_n_u32(a, 1);
}
-// CHECK-LABEL: @test_vqshl_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VQSHL_N1:%.*]] = call <1 x i64> @llvm.arm.neon.vqshiftu.v1i64(<1 x i64> [[VQSHL_N]], <1 x i64> splat (i64 1))
-// CHECK: ret <1 x i64> [[VQSHL_N1]]
+// CHECK-LABEL: define <1 x i64> @test_vqshl_n_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <1 x i64> @llvm.arm.neon.vqshiftu.v1i64(<1 x i64> [[VQSHL_N]], <1 x i64> splat (i64 1))
+// CHECK-NEXT: ret <1 x i64> [[VQSHL_N1]]
+//
uint64x1_t test_vqshl_n_u64(uint64x1_t a) {
return vqshl_n_u64(a, 1);
}
-// CHECK-LABEL: @test_vqshlq_n_s8(
-// CHECK: [[VQSHL_N:%.*]] = call <16 x i8> @llvm.arm.neon.vqshifts.v16i8(<16 x i8> %a, <16 x i8> splat (i8 1))
-// CHECK: ret <16 x i8> [[VQSHL_N]]
+// CHECK-LABEL: define <16 x i8> @test_vqshlq_n_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHL_N:%.*]] = call <16 x i8> @llvm.arm.neon.vqshifts.v16i8(<16 x i8> [[A]], <16 x i8> splat (i8 1))
+// CHECK-NEXT: ret <16 x i8> [[VQSHL_N]]
+//
int8x16_t test_vqshlq_n_s8(int8x16_t a) {
return vqshlq_n_s8(a, 1);
}
-// CHECK-LABEL: @test_vqshlq_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VQSHL_N1:%.*]] = call <8 x i16> @llvm.arm.neon.vqshifts.v8i16(<8 x i16> [[VQSHL_N]], <8 x i16> splat (i16 1))
-// CHECK: ret <8 x i16> [[VQSHL_N1]]
+// CHECK-LABEL: define <8 x i16> @test_vqshlq_n_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <8 x i16> @llvm.arm.neon.vqshifts.v8i16(<8 x i16> [[VQSHL_N]], <8 x i16> splat (i16 1))
+// CHECK-NEXT: ret <8 x i16> [[VQSHL_N1]]
+//
int16x8_t test_vqshlq_n_s16(int16x8_t a) {
return vqshlq_n_s16(a, 1);
}
-// CHECK-LABEL: @test_vqshlq_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQSHL_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vqshifts.v4i32(<4 x i32> [[VQSHL_N]], <4 x i32> splat (i32 1))
-// CHECK: ret <4 x i32> [[VQSHL_N1]]
+// CHECK-LABEL: define <4 x i32> @test_vqshlq_n_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vqshifts.v4i32(<4 x i32> [[VQSHL_N]], <4 x i32> splat (i32 1))
+// CHECK-NEXT: ret <4 x i32> [[VQSHL_N1]]
+//
int32x4_t test_vqshlq_n_s32(int32x4_t a) {
return vqshlq_n_s32(a, 1);
}
-// CHECK-LABEL: @test_vqshlq_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VQSHL_N1:%.*]] = call <2 x i64> @llvm.arm.neon.vqshifts.v2i64(<2 x i64> [[VQSHL_N]], <2 x i64> splat (i64 1))
-// CHECK: ret <2 x i64> [[VQSHL_N1]]
+// CHECK-LABEL: define <2 x i64> @test_vqshlq_n_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <2 x i64> @llvm.arm.neon.vqshifts.v2i64(<2 x i64> [[VQSHL_N]], <2 x i64> splat (i64 1))
+// CHECK-NEXT: ret <2 x i64> [[VQSHL_N1]]
+//
int64x2_t test_vqshlq_n_s64(int64x2_t a) {
return vqshlq_n_s64(a, 1);
}
-// CHECK-LABEL: @test_vqshlq_n_u8(
-// CHECK: [[VQSHL_N:%.*]] = call <16 x i8> @llvm.arm.neon.vqshiftu.v16i8(<16 x i8> %a, <16 x i8> splat (i8 1))
-// CHECK: ret <16 x i8> [[VQSHL_N]]
+// CHECK-LABEL: define <16 x i8> @test_vqshlq_n_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHL_N:%.*]] = call <16 x i8> @llvm.arm.neon.vqshiftu.v16i8(<16 x i8> [[A]], <16 x i8> splat (i8 1))
+// CHECK-NEXT: ret <16 x i8> [[VQSHL_N]]
+//
uint8x16_t test_vqshlq_n_u8(uint8x16_t a) {
return vqshlq_n_u8(a, 1);
}
-// CHECK-LABEL: @test_vqshlq_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VQSHL_N1:%.*]] = call <8 x i16> @llvm.arm.neon.vqshiftu.v8i16(<8 x i16> [[VQSHL_N]], <8 x i16> splat (i16 1))
-// CHECK: ret <8 x i16> [[VQSHL_N1]]
+// CHECK-LABEL: define <8 x i16> @test_vqshlq_n_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <8 x i16> @llvm.arm.neon.vqshiftu.v8i16(<8 x i16> [[VQSHL_N]], <8 x i16> splat (i16 1))
+// CHECK-NEXT: ret <8 x i16> [[VQSHL_N1]]
+//
uint16x8_t test_vqshlq_n_u16(uint16x8_t a) {
return vqshlq_n_u16(a, 1);
}
-// CHECK-LABEL: @test_vqshlq_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQSHL_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vqshiftu.v4i32(<4 x i32> [[VQSHL_N]], <4 x i32> splat (i32 1))
-// CHECK: ret <4 x i32> [[VQSHL_N1]]
+// CHECK-LABEL: define <4 x i32> @test_vqshlq_n_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vqshiftu.v4i32(<4 x i32> [[VQSHL_N]], <4 x i32> splat (i32 1))
+// CHECK-NEXT: ret <4 x i32> [[VQSHL_N1]]
+//
uint32x4_t test_vqshlq_n_u32(uint32x4_t a) {
return vqshlq_n_u32(a, 1);
}
-// CHECK-LABEL: @test_vqshlq_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VQSHL_N1:%.*]] = call <2 x i64> @llvm.arm.neon.vqshiftu.v2i64(<2 x i64> [[VQSHL_N]], <2 x i64> splat (i64 1))
-// CHECK: ret <2 x i64> [[VQSHL_N1]]
+// CHECK-LABEL: define <2 x i64> @test_vqshlq_n_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <2 x i64> @llvm.arm.neon.vqshiftu.v2i64(<2 x i64> [[VQSHL_N]], <2 x i64> splat (i64 1))
+// CHECK-NEXT: ret <2 x i64> [[VQSHL_N1]]
+//
uint64x2_t test_vqshlq_n_u64(uint64x2_t a) {
return vqshlq_n_u64(a, 1);
}
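// The _n forms above take a compile-time shift immediate; unsigned variants
// saturate at the lane type's maximum. A minimal sketch (hypothetical helper):
static inline uint8_t example_vqshl_n_ceiling(void) {
  uint8x8_t v = vdup_n_u8(200);
  uint8x8_t r = vqshl_n_u8(v, 1);  // 200 << 1 = 400 > 255, saturates to 255
  return vget_lane_u8(r, 0);       // 255
}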
-// CHECK-LABEL: @test_vqshrn_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VQSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqshiftns.v8i8(<8 x i16> [[VQSHRN_N]], <8 x i16> splat (i16 -1))
-// CHECK: ret <8 x i8> [[VQSHRN_N1]]
+// CHECK-LABEL: define <8 x i8> @test_vqshrn_n_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VQSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqshiftns.v8i8(<8 x i16> [[VQSHRN_N]], <8 x i16> splat (i16 -1))
+// CHECK-NEXT: ret <8 x i8> [[VQSHRN_N1]]
+//
int8x8_t test_vqshrn_n_s16(int16x8_t a) {
return vqshrn_n_s16(a, 1);
}
-// CHECK-LABEL: @test_vqshrn_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftns.v4i16(<4 x i32> [[VQSHRN_N]], <4 x i32> splat (i32 -1))
-// CHECK: ret <4 x i16> [[VQSHRN_N1]]
+// CHECK-LABEL: define <4 x i16> @test_vqshrn_n_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VQSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftns.v4i16(<4 x i32> [[VQSHRN_N]], <4 x i32> splat (i32 -1))
+// CHECK-NEXT: ret <4 x i16> [[VQSHRN_N1]]
+//
int16x4_t test_vqshrn_n_s32(int32x4_t a) {
return vqshrn_n_s32(a, 1);
}
-// CHECK-LABEL: @test_vqshrn_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VQSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftns.v2i32(<2 x i64> [[VQSHRN_N]], <2 x i64> splat (i64 -1))
-// CHECK: ret <2 x i32> [[VQSHRN_N1]]
+// CHECK-LABEL: define <2 x i32> @test_vqshrn_n_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[VQSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftns.v2i32(<2 x i64> [[VQSHRN_N]], <2 x i64> splat (i64 -1))
+// CHECK-NEXT: ret <2 x i32> [[VQSHRN_N1]]
+//
int32x2_t test_vqshrn_n_s64(int64x2_t a) {
return vqshrn_n_s64(a, 1);
}
-// CHECK-LABEL: @test_vqshrn_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VQSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqshiftnu.v8i8(<8 x i16> [[VQSHRN_N]], <8 x i16> splat (i16 -1))
-// CHECK: ret <8 x i8> [[VQSHRN_N1]]
+// CHECK-LABEL: define <8 x i8> @test_vqshrn_n_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VQSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqshiftnu.v8i8(<8 x i16> [[VQSHRN_N]], <8 x i16> splat (i16 -1))
+// CHECK-NEXT: ret <8 x i8> [[VQSHRN_N1]]
+//
uint8x8_t test_vqshrn_n_u16(uint16x8_t a) {
return vqshrn_n_u16(a, 1);
}
-// CHECK-LABEL: @test_vqshrn_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftnu.v4i16(<4 x i32> [[VQSHRN_N]], <4 x i32> splat (i32 -1))
-// CHECK: ret <4 x i16> [[VQSHRN_N1]]
+// CHECK-LABEL: define <4 x i16> @test_vqshrn_n_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VQSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftnu.v4i16(<4 x i32> [[VQSHRN_N]], <4 x i32> splat (i32 -1))
+// CHECK-NEXT: ret <4 x i16> [[VQSHRN_N1]]
+//
uint16x4_t test_vqshrn_n_u32(uint32x4_t a) {
return vqshrn_n_u32(a, 1);
}
-// CHECK-LABEL: @test_vqshrn_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VQSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftnu.v2i32(<2 x i64> [[VQSHRN_N]], <2 x i64> splat (i64 -1))
-// CHECK: ret <2 x i32> [[VQSHRN_N1]]
+// CHECK-LABEL: define <2 x i32> @test_vqshrn_n_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[VQSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftnu.v2i32(<2 x i64> [[VQSHRN_N]], <2 x i64> splat (i64 -1))
+// CHECK-NEXT: ret <2 x i32> [[VQSHRN_N1]]
+//
uint32x2_t test_vqshrn_n_u64(uint64x2_t a) {
return vqshrn_n_u64(a, 1);
}
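// vqshrn_n shifts right by an immediate and narrows each lane to half width
// with saturation. A minimal sketch (hypothetical helper):
static inline int8_t example_vqshrn_narrows(void) {
  int16x8_t v = vdupq_n_s16(1000);
  int8x8_t r = vqshrn_n_s16(v, 2);  // 1000 >> 2 = 250 > 127, saturates to 127 on narrowing
  return vget_lane_s8(r, 0);        // 127
}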
-// CHECK-LABEL: @test_vqshrun_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VQSHRUN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqshiftnsu.v8i8(<8 x i16> [[VQSHRUN_N]], <8 x i16> splat (i16 -1))
-// CHECK: ret <8 x i8> [[VQSHRUN_N1]]
+// CHECK-LABEL: define <8 x i8> @test_vqshrun_n_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VQSHRUN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqshiftnsu.v8i8(<8 x i16> [[VQSHRUN_N]], <8 x i16> splat (i16 -1))
+// CHECK-NEXT: ret <8 x i8> [[VQSHRUN_N1]]
+//
uint8x8_t test_vqshrun_n_s16(int16x8_t a) {
return vqshrun_n_s16(a, 1);
}
-// CHECK-LABEL: @test_vqshrun_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQSHRUN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftnsu.v4i16(<4 x i32> [[VQSHRUN_N]], <4 x i32> splat (i32 -1))
-// CHECK: ret <4 x i16> [[VQSHRUN_N1]]
+// CHECK-LABEL: define <4 x i16> @test_vqshrun_n_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VQSHRUN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftnsu.v4i16(<4 x i32> [[VQSHRUN_N]], <4 x i32> splat (i32 -1))
+// CHECK-NEXT: ret <4 x i16> [[VQSHRUN_N1]]
+//
uint16x4_t test_vqshrun_n_s32(int32x4_t a) {
return vqshrun_n_s32(a, 1);
}
-// CHECK-LABEL: @test_vqshrun_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VQSHRUN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftnsu.v2i32(<2 x i64> [[VQSHRUN_N]], <2 x i64> splat (i64 -1))
-// CHECK: ret <2 x i32> [[VQSHRUN_N1]]
+// CHECK-LABEL: define <2 x i32> @test_vqshrun_n_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[VQSHRUN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftnsu.v2i32(<2 x i64> [[VQSHRUN_N]], <2 x i64> splat (i64 -1))
+// CHECK-NEXT: ret <2 x i32> [[VQSHRUN_N1]]
+//
uint32x2_t test_vqshrun_n_s64(int64x2_t a) {
return vqshrun_n_s64(a, 1);
}
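// vqshrun_n is the signed-to-unsigned narrowing variant. A minimal sketch
// (hypothetical helper): negative lanes clamp to 0.
static inline uint8_t example_vqshrun_clamps(void) {
  int16x8_t v = vdupq_n_s16(-100);
  uint8x8_t r = vqshrun_n_s16(v, 1);  // -100 >> 1 is negative; unsigned result is 0
  return vget_lane_u8(r, 0);          // 0
}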
-// CHECK-LABEL: @test_vqsub_s8(
-// CHECK: [[VQSUB_V_I:%.*]] = call <8 x i8> @llvm.ssub.sat.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VQSUB_V_I]]
+// CHECK-LABEL: define <8 x i8> @test_vqsub_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSUB_V_I:%.*]] = call <8 x i8> @llvm.ssub.sat.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VQSUB_V_I]]
+//
int8x8_t test_vqsub_s8(int8x8_t a, int8x8_t b) {
return vqsub_s8(a, b);
}
-// CHECK-LABEL: @test_vqsub_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VQSUB_V2_I:%.*]] = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VQSUB_V3_I:%.*]] = bitcast <4 x i16> [[VQSUB_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VQSUB_V2_I]]
+// CHECK-LABEL: define <4 x i16> @test_vqsub_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VQSUB_V2_I:%.*]] = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: [[VQSUB_V3_I:%.*]] = bitcast <4 x i16> [[VQSUB_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <4 x i16> [[VQSUB_V2_I]]
+//
int16x4_t test_vqsub_s16(int16x4_t a, int16x4_t b) {
return vqsub_s16(a, b);
}
-// CHECK-LABEL: @test_vqsub_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VQSUB_V2_I:%.*]] = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VQSUB_V3_I:%.*]] = bitcast <2 x i32> [[VQSUB_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VQSUB_V2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vqsub_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VQSUB_V2_I:%.*]] = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: [[VQSUB_V3_I:%.*]] = bitcast <2 x i32> [[VQSUB_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <2 x i32> [[VQSUB_V2_I]]
+//
int32x2_t test_vqsub_s32(int32x2_t a, int32x2_t b) {
return vqsub_s32(a, b);
}
-// CHECK-LABEL: @test_vqsub_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VQSUB_V2_I:%.*]] = call <1 x i64> @llvm.ssub.sat.v1i64(<1 x i64> %a, <1 x i64> %b)
-// CHECK: [[VQSUB_V3_I:%.*]] = bitcast <1 x i64> [[VQSUB_V2_I]] to <8 x i8>
-// CHECK: ret <1 x i64> [[VQSUB_V2_I]]
+// CHECK-LABEL: define <1 x i64> @test_vqsub_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VQSUB_V2_I:%.*]] = call <1 x i64> @llvm.ssub.sat.v1i64(<1 x i64> [[A]], <1 x i64> [[B]])
+// CHECK-NEXT: [[VQSUB_V3_I:%.*]] = bitcast <1 x i64> [[VQSUB_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <1 x i64> [[VQSUB_V2_I]]
+//
int64x1_t test_vqsub_s64(int64x1_t a, int64x1_t b) {
return vqsub_s64(a, b);
}
-// CHECK-LABEL: @test_vqsub_u8(
-// CHECK: [[VQSUB_V_I:%.*]] = call <8 x i8> @llvm.usub.sat.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VQSUB_V_I]]
+// CHECK-LABEL: define <8 x i8> @test_vqsub_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSUB_V_I:%.*]] = call <8 x i8> @llvm.usub.sat.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VQSUB_V_I]]
+//
uint8x8_t test_vqsub_u8(uint8x8_t a, uint8x8_t b) {
return vqsub_u8(a, b);
}
-// CHECK-LABEL: @test_vqsub_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VQSUB_V2_I:%.*]] = call <4 x i16> @llvm.usub.sat.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VQSUB_V3_I:%.*]] = bitcast <4 x i16> [[VQSUB_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VQSUB_V2_I]]
+// CHECK-LABEL: define <4 x i16> @test_vqsub_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VQSUB_V2_I:%.*]] = call <4 x i16> @llvm.usub.sat.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: [[VQSUB_V3_I:%.*]] = bitcast <4 x i16> [[VQSUB_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <4 x i16> [[VQSUB_V2_I]]
+//
uint16x4_t test_vqsub_u16(uint16x4_t a, uint16x4_t b) {
return vqsub_u16(a, b);
}
-// CHECK-LABEL: @test_vqsub_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VQSUB_V2_I:%.*]] = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VQSUB_V3_I:%.*]] = bitcast <2 x i32> [[VQSUB_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VQSUB_V2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vqsub_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VQSUB_V2_I:%.*]] = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: [[VQSUB_V3_I:%.*]] = bitcast <2 x i32> [[VQSUB_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <2 x i32> [[VQSUB_V2_I]]
+//
uint32x2_t test_vqsub_u32(uint32x2_t a, uint32x2_t b) {
return vqsub_u32(a, b);
}
-// CHECK-LABEL: @test_vqsub_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VQSUB_V2_I:%.*]] = call <1 x i64> @llvm.usub.sat.v1i64(<1 x i64> %a, <1 x i64> %b)
-// CHECK: [[VQSUB_V3_I:%.*]] = bitcast <1 x i64> [[VQSUB_V2_I]] to <8 x i8>
-// CHECK: ret <1 x i64> [[VQSUB_V2_I]]
+// CHECK-LABEL: define <1 x i64> @test_vqsub_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VQSUB_V2_I:%.*]] = call <1 x i64> @llvm.usub.sat.v1i64(<1 x i64> [[A]], <1 x i64> [[B]])
+// CHECK-NEXT: [[VQSUB_V3_I:%.*]] = bitcast <1 x i64> [[VQSUB_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <1 x i64> [[VQSUB_V2_I]]
+//
uint64x1_t test_vqsub_u64(uint64x1_t a, uint64x1_t b) {
return vqsub_u64(a, b);
}
-// CHECK-LABEL: @test_vqsubq_s8(
-// CHECK: [[VQSUBQ_V_I:%.*]] = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <16 x i8> [[VQSUBQ_V_I]]
+// CHECK-LABEL: define <16 x i8> @test_vqsubq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSUBQ_V_I:%.*]] = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <16 x i8> [[VQSUBQ_V_I]]
+//
int8x16_t test_vqsubq_s8(int8x16_t a, int8x16_t b) {
return vqsubq_s8(a, b);
}
-// CHECK-LABEL: @test_vqsubq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VQSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: [[VQSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSUBQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VQSUBQ_V2_I]]
+// CHECK-LABEL: define <8 x i16> @test_vqsubq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VQSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: [[VQSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSUBQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: ret <8 x i16> [[VQSUBQ_V2_I]]
+//
int16x8_t test_vqsubq_s16(int16x8_t a, int16x8_t b) {
return vqsubq_s16(a, b);
}
-// CHECK-LABEL: @test_vqsubq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VQSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VQSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSUBQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VQSUBQ_V2_I]]
+// CHECK-LABEL: define <4 x i32> @test_vqsubq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VQSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: [[VQSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSUBQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: ret <4 x i32> [[VQSUBQ_V2_I]]
+//
int32x4_t test_vqsubq_s32(int32x4_t a, int32x4_t b) {
return vqsubq_s32(a, b);
}
-// CHECK-LABEL: @test_vqsubq_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VQSUBQ_V2_I:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> %a, <2 x i64> %b)
-// CHECK: [[VQSUBQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSUBQ_V2_I]] to <16 x i8>
-// CHECK: ret <2 x i64> [[VQSUBQ_V2_I]]
+// CHECK-LABEL: define <2 x i64> @test_vqsubq_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VQSUBQ_V2_I:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[A]], <2 x i64> [[B]])
+// CHECK-NEXT: [[VQSUBQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSUBQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: ret <2 x i64> [[VQSUBQ_V2_I]]
+//
int64x2_t test_vqsubq_s64(int64x2_t a, int64x2_t b) {
return vqsubq_s64(a, b);
}
-// CHECK-LABEL: @test_vqsubq_u8(
-// CHECK: [[VQSUBQ_V_I:%.*]] = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <16 x i8> [[VQSUBQ_V_I]]
+// CHECK-LABEL: define <16 x i8> @test_vqsubq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSUBQ_V_I:%.*]] = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <16 x i8> [[VQSUBQ_V_I]]
+//
uint8x16_t test_vqsubq_u8(uint8x16_t a, uint8x16_t b) {
return vqsubq_u8(a, b);
}
-// CHECK-LABEL: @test_vqsubq_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VQSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: [[VQSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSUBQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VQSUBQ_V2_I]]
+// CHECK-LABEL: define <8 x i16> @test_vqsubq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VQSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: [[VQSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSUBQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: ret <8 x i16> [[VQSUBQ_V2_I]]
+//
uint16x8_t test_vqsubq_u16(uint16x8_t a, uint16x8_t b) {
return vqsubq_u16(a, b);
}
-// CHECK-LABEL: @test_vqsubq_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VQSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VQSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSUBQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VQSUBQ_V2_I]]
+// CHECK-LABEL: define <4 x i32> @test_vqsubq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VQSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: [[VQSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSUBQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: ret <4 x i32> [[VQSUBQ_V2_I]]
+//
uint32x4_t test_vqsubq_u32(uint32x4_t a, uint32x4_t b) {
return vqsubq_u32(a, b);
}
-// CHECK-LABEL: @test_vqsubq_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VQSUBQ_V2_I:%.*]] = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> %a, <2 x i64> %b)
-// CHECK: [[VQSUBQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSUBQ_V2_I]] to <16 x i8>
-// CHECK: ret <2 x i64> [[VQSUBQ_V2_I]]
+// CHECK-LABEL: define <2 x i64> @test_vqsubq_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VQSUBQ_V2_I:%.*]] = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> [[A]], <2 x i64> [[B]])
+// CHECK-NEXT: [[VQSUBQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSUBQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: ret <2 x i64> [[VQSUBQ_V2_I]]
+//
uint64x2_t test_vqsubq_u64(uint64x2_t a, uint64x2_t b) {
return vqsubq_u64(a, b);
}
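// The checks above show vqsub lowering to the generic @llvm.ssub.sat /
// @llvm.usub.sat intrinsics (unlike the arm.neon.* calls used elsewhere in
// this file). A minimal sketch of the saturating behaviour (hypothetical
// helper):
static inline uint8_t example_vqsub_clamps(void) {
  uint8x8_t a = vdup_n_u8(10);
  uint8x8_t b = vdup_n_u8(20);
  uint8x8_t r = vqsub_u8(a, b);  // 10 - 20 would underflow; saturates to 0
  return vget_lane_u8(r, 0);     // 0
}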
-// CHECK-LABEL: @test_vraddhn_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VRADDHN_V2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vraddhn.v8i8(<8 x i16> %a, <8 x i16> %b)
-// CHECK: ret <8 x i8> [[VRADDHN_V2_I]]
+// CHECK-LABEL: define <8 x i8> @test_vraddhn_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VRADDHN_V2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vraddhn.v8i8(<8 x i16> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VRADDHN_V2_I]]
+//
int8x8_t test_vraddhn_s16(int16x8_t a, int16x8_t b) {
return vraddhn_s16(a, b);
}
-// CHECK-LABEL: @test_vraddhn_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VRADDHN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vraddhn.v4i16(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VRADDHN_V3_I:%.*]] = bitcast <4 x i16> [[VRADDHN_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VRADDHN_V2_I]]
+// CHECK-LABEL: define <4 x i16> @test_vraddhn_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VRADDHN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vraddhn.v4i16(<4 x i32> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: [[VRADDHN_V3_I:%.*]] = bitcast <4 x i16> [[VRADDHN_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <4 x i16> [[VRADDHN_V2_I]]
+//
int16x4_t test_vraddhn_s32(int32x4_t a, int32x4_t b) {
return vraddhn_s32(a, b);
}
-// CHECK-LABEL: @test_vraddhn_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VRADDHN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vraddhn.v2i32(<2 x i64> %a, <2 x i64> %b)
-// CHECK: [[VRADDHN_V3_I:%.*]] = bitcast <2 x i32> [[VRADDHN_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VRADDHN_V2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vraddhn_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VRADDHN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vraddhn.v2i32(<2 x i64> [[A]], <2 x i64> [[B]])
+// CHECK-NEXT: [[VRADDHN_V3_I:%.*]] = bitcast <2 x i32> [[VRADDHN_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <2 x i32> [[VRADDHN_V2_I]]
+//
int32x2_t test_vraddhn_s64(int64x2_t a, int64x2_t b) {
return vraddhn_s64(a, b);
}
-// CHECK-LABEL: @test_vraddhn_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VRADDHN_V2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vraddhn.v8i8(<8 x i16> %a, <8 x i16> %b)
-// CHECK: ret <8 x i8> [[VRADDHN_V2_I]]
+// CHECK-LABEL: define <8 x i8> @test_vraddhn_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VRADDHN_V2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vraddhn.v8i8(<8 x i16> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VRADDHN_V2_I]]
+//
uint8x8_t test_vraddhn_u16(uint16x8_t a, uint16x8_t b) {
return vraddhn_u16(a, b);
}
-// CHECK-LABEL: @test_vraddhn_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VRADDHN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vraddhn.v4i16(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VRADDHN_V3_I:%.*]] = bitcast <4 x i16> [[VRADDHN_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VRADDHN_V2_I]]
+// CHECK-LABEL: define <4 x i16> @test_vraddhn_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VRADDHN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vraddhn.v4i16(<4 x i32> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: [[VRADDHN_V3_I:%.*]] = bitcast <4 x i16> [[VRADDHN_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <4 x i16> [[VRADDHN_V2_I]]
+//
uint16x4_t test_vraddhn_u32(uint32x4_t a, uint32x4_t b) {
return vraddhn_u32(a, b);
}
-// CHECK-LABEL: @test_vraddhn_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VRADDHN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vraddhn.v2i32(<2 x i64> %a, <2 x i64> %b)
-// CHECK: [[VRADDHN_V3_I:%.*]] = bitcast <2 x i32> [[VRADDHN_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VRADDHN_V2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vraddhn_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VRADDHN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vraddhn.v2i32(<2 x i64> [[A]], <2 x i64> [[B]])
+// CHECK-NEXT: [[VRADDHN_V3_I:%.*]] = bitcast <2 x i32> [[VRADDHN_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <2 x i32> [[VRADDHN_V2_I]]
+//
uint32x2_t test_vraddhn_u64(uint64x2_t a, uint64x2_t b) {
return vraddhn_u64(a, b);
}
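// vraddhn adds, rounds, and returns the high half of each lane. A minimal
// sketch (hypothetical helper): the rounding constant is half an output ULP,
// i.e. 1 << 7 for the s16 -> s8 case.
static inline int8_t example_vraddhn_rounds(void) {
  int16x8_t a = vdupq_n_s16(200);
  int16x8_t b = vdupq_n_s16(184);  // a + b = 384
  int8x8_t r = vraddhn_s16(a, b);  // (384 + 128) >> 8 = 2; non-rounding vaddhn would give 1
  return vget_lane_s8(r, 0);       // 2
}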
-// CHECK-LABEL: @test_vrecpe_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[VRECPE_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrecpe.v2f32(<2 x float> %a)
-// CHECK: ret <2 x float> [[VRECPE_V1_I]]
+// CHECK-LABEL: define <2 x float> @test_vrecpe_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VRECPE_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrecpe.v2f32(<2 x float> [[A]])
+// CHECK-NEXT: ret <2 x float> [[VRECPE_V1_I]]
+//
float32x2_t test_vrecpe_f32(float32x2_t a) {
return vrecpe_f32(a);
}
-// CHECK-LABEL: @test_vrecpe_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VRECPE_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrecpe.v2i32(<2 x i32> %a)
-// CHECK: ret <2 x i32> [[VRECPE_V1_I]]
+// CHECK-LABEL: define <2 x i32> @test_vrecpe_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VRECPE_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrecpe.v2i32(<2 x i32> [[A]])
+// CHECK-NEXT: ret <2 x i32> [[VRECPE_V1_I]]
+//
uint32x2_t test_vrecpe_u32(uint32x2_t a) {
return vrecpe_u32(a);
}
-// CHECK-LABEL: @test_vrecpeq_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[VRECPEQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float> %a)
-// CHECK: ret <4 x float> [[VRECPEQ_V1_I]]
+// CHECK-LABEL: define <4 x float> @test_vrecpeq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VRECPEQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float> [[A]])
+// CHECK-NEXT: ret <4 x float> [[VRECPEQ_V1_I]]
+//
float32x4_t test_vrecpeq_f32(float32x4_t a) {
return vrecpeq_f32(a);
}
-// CHECK-LABEL: @test_vrecpeq_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VRECPEQ_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrecpe.v4i32(<4 x i32> %a)
-// CHECK: ret <4 x i32> [[VRECPEQ_V1_I]]
+// CHECK-LABEL: define <4 x i32> @test_vrecpeq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VRECPEQ_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrecpe.v4i32(<4 x i32> [[A]])
+// CHECK-NEXT: ret <4 x i32> [[VRECPEQ_V1_I]]
+//
uint32x4_t test_vrecpeq_u32(uint32x4_t a) {
return vrecpeq_u32(a);
}
-// CHECK-LABEL: @test_vrecps_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
-// CHECK: [[VRECPS_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vrecps.v2f32(<2 x float> %a, <2 x float> %b)
-// CHECK: [[VRECPS_V3_I:%.*]] = bitcast <2 x float> [[VRECPS_V2_I]] to <8 x i8>
-// CHECK: ret <2 x float> [[VRECPS_V2_I]]
+// CHECK-LABEL: define <2 x float> @test_vrecps_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VRECPS_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vrecps.v2f32(<2 x float> [[A]], <2 x float> [[B]])
+// CHECK-NEXT: [[VRECPS_V3_I:%.*]] = bitcast <2 x float> [[VRECPS_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <2 x float> [[VRECPS_V2_I]]
+//
float32x2_t test_vrecps_f32(float32x2_t a, float32x2_t b) {
return vrecps_f32(a, b);
}
-// CHECK-LABEL: @test_vrecpsq_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
-// CHECK: [[VRECPSQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float> %a, <4 x float> %b)
-// CHECK: [[VRECPSQ_V3_I:%.*]] = bitcast <4 x float> [[VRECPSQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x float> [[VRECPSQ_V2_I]]
+// CHECK-LABEL: define <4 x float> @test_vrecpsq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VRECPSQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float> [[A]], <4 x float> [[B]])
+// CHECK-NEXT: [[VRECPSQ_V3_I:%.*]] = bitcast <4 x float> [[VRECPSQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: ret <4 x float> [[VRECPSQ_V2_I]]
+//
float32x4_t test_vrecpsq_f32(float32x4_t a, float32x4_t b) {
return vrecpsq_f32(a, b);
}
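// vrecpe gives a low-precision estimate of 1/x and vrecps computes the
// Newton-Raphson correction 2.0 - a*b, so one refinement step is
// x1 = x0 * vrecps(d, x0). A minimal sketch (hypothetical helper):
static inline float example_reciprocal(float d) {
  float32x2_t vd = vdup_n_f32(d);
  float32x2_t x0 = vrecpe_f32(vd);                    // initial estimate of 1/d
  float32x2_t x1 = vmul_f32(x0, vrecps_f32(vd, x0));  // one NR step sharpens the estimate
  return vget_lane_f32(x1, 0);
}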
-// CHECK-LABEL: @test_vreinterpret_s8_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
int8x8_t test_vreinterpret_s8_s16(int16x4_t a) {
return vreinterpret_s8_s16(a);
}
-// CHECK-LABEL: @test_vreinterpret_s8_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
int8x8_t test_vreinterpret_s8_s32(int32x2_t a) {
return vreinterpret_s8_s32(a);
}
-// CHECK-LABEL: @test_vreinterpret_s8_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
int8x8_t test_vreinterpret_s8_s64(int64x1_t a) {
return vreinterpret_s8_s64(a);
}
-// CHECK-LABEL: @test_vreinterpret_s8_u8(
-// CHECK: ret <8 x i8> %a
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <8 x i8> [[A]]
+//
int8x8_t test_vreinterpret_s8_u8(uint8x8_t a) {
return vreinterpret_s8_u8(a);
}
-// CHECK-LABEL: @test_vreinterpret_s8_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
int8x8_t test_vreinterpret_s8_u16(uint16x4_t a) {
return vreinterpret_s8_u16(a);
}
-// CHECK-LABEL: @test_vreinterpret_s8_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
int8x8_t test_vreinterpret_s8_u32(uint32x2_t a) {
return vreinterpret_s8_u32(a);
}
-// CHECK-LABEL: @test_vreinterpret_s8_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
int8x8_t test_vreinterpret_s8_u64(uint64x1_t a) {
return vreinterpret_s8_u64(a);
}
-// CHECK-LABEL: @test_vreinterpret_s8_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
int8x8_t test_vreinterpret_s8_f16(float16x4_t a) {
return vreinterpret_s8_f16(a);
}
-// CHECK-LABEL: @test_vreinterpret_s8_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
int8x8_t test_vreinterpret_s8_f32(float32x2_t a) {
return vreinterpret_s8_f32(a);
}
-// CHECK-LABEL: @test_vreinterpret_s8_p8(
-// CHECK: ret <8 x i8> %a
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <8 x i8> [[A]]
+//
int8x8_t test_vreinterpret_s8_p8(poly8x8_t a) {
return vreinterpret_s8_p8(a);
}
-// CHECK-LABEL: @test_vreinterpret_s8_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
int8x8_t test_vreinterpret_s8_p16(poly16x4_t a) {
return vreinterpret_s8_p16(a);
}
-// CHECK-LABEL: @test_vreinterpret_s16_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
int16x4_t test_vreinterpret_s16_s8(int8x8_t a) {
return vreinterpret_s16_s8(a);
}
-// CHECK-LABEL: @test_vreinterpret_s16_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
int16x4_t test_vreinterpret_s16_s32(int32x2_t a) {
return vreinterpret_s16_s32(a);
}
-// CHECK-LABEL: @test_vreinterpret_s16_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
int16x4_t test_vreinterpret_s16_s64(int64x1_t a) {
return vreinterpret_s16_s64(a);
}
-// CHECK-LABEL: @test_vreinterpret_s16_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
int16x4_t test_vreinterpret_s16_u8(uint8x8_t a) {
return vreinterpret_s16_u8(a);
}
-// CHECK-LABEL: @test_vreinterpret_s16_u16(
-// CHECK: ret <4 x i16> %a
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <4 x i16> [[A]]
+//
int16x4_t test_vreinterpret_s16_u16(uint16x4_t a) {
return vreinterpret_s16_u16(a);
}
-// CHECK-LABEL: @test_vreinterpret_s16_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
int16x4_t test_vreinterpret_s16_u32(uint32x2_t a) {
return vreinterpret_s16_u32(a);
}
-// CHECK-LABEL: @test_vreinterpret_s16_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
int16x4_t test_vreinterpret_s16_u64(uint64x1_t a) {
return vreinterpret_s16_u64(a);
}
-// CHECK-LABEL: @test_vreinterpret_s16_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
int16x4_t test_vreinterpret_s16_f16(float16x4_t a) {
return vreinterpret_s16_f16(a);
}
-// CHECK-LABEL: @test_vreinterpret_s16_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
int16x4_t test_vreinterpret_s16_f32(float32x2_t a) {
return vreinterpret_s16_f32(a);
}
-// CHECK-LABEL: @test_vreinterpret_s16_p8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
int16x4_t test_vreinterpret_s16_p8(poly8x8_t a) {
return vreinterpret_s16_p8(a);
}
-// CHECK-LABEL: @test_vreinterpret_s16_p16(
-// CHECK: ret <4 x i16> %a
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <4 x i16> [[A]]
+//
int16x4_t test_vreinterpret_s16_p16(poly16x4_t a) {
return vreinterpret_s16_p16(a);
}
-// CHECK-LABEL: @test_vreinterpret_s32_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
int32x2_t test_vreinterpret_s32_s8(int8x8_t a) {
return vreinterpret_s32_s8(a);
}
-// CHECK-LABEL: @test_vreinterpret_s32_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
int32x2_t test_vreinterpret_s32_s16(int16x4_t a) {
return vreinterpret_s32_s16(a);
}
-// CHECK-LABEL: @test_vreinterpret_s32_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
int32x2_t test_vreinterpret_s32_s64(int64x1_t a) {
return vreinterpret_s32_s64(a);
}
-// CHECK-LABEL: @test_vreinterpret_s32_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
int32x2_t test_vreinterpret_s32_u8(uint8x8_t a) {
return vreinterpret_s32_u8(a);
}
-// CHECK-LABEL: @test_vreinterpret_s32_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
int32x2_t test_vreinterpret_s32_u16(uint16x4_t a) {
return vreinterpret_s32_u16(a);
}
-// CHECK-LABEL: @test_vreinterpret_s32_u32(
-// CHECK: ret <2 x i32> %a
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <2 x i32> [[A]]
+//
int32x2_t test_vreinterpret_s32_u32(uint32x2_t a) {
return vreinterpret_s32_u32(a);
}
-// CHECK-LABEL: @test_vreinterpret_s32_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
int32x2_t test_vreinterpret_s32_u64(uint64x1_t a) {
return vreinterpret_s32_u64(a);
}
-// CHECK-LABEL: @test_vreinterpret_s32_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
int32x2_t test_vreinterpret_s32_f16(float16x4_t a) {
return vreinterpret_s32_f16(a);
}
-// CHECK-LABEL: @test_vreinterpret_s32_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
int32x2_t test_vreinterpret_s32_f32(float32x2_t a) {
return vreinterpret_s32_f32(a);
}
-// CHECK-LABEL: @test_vreinterpret_s32_p8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
int32x2_t test_vreinterpret_s32_p8(poly8x8_t a) {
return vreinterpret_s32_p8(a);
}
-// CHECK-LABEL: @test_vreinterpret_s32_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
int32x2_t test_vreinterpret_s32_p16(poly16x4_t a) {
return vreinterpret_s32_p16(a);
}
-// CHECK-LABEL: @test_vreinterpret_s64_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
int64x1_t test_vreinterpret_s64_s8(int8x8_t a) {
return vreinterpret_s64_s8(a);
}
-// CHECK-LABEL: @test_vreinterpret_s64_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
int64x1_t test_vreinterpret_s64_s16(int16x4_t a) {
return vreinterpret_s64_s16(a);
}
-// CHECK-LABEL: @test_vreinterpret_s64_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
int64x1_t test_vreinterpret_s64_s32(int32x2_t a) {
return vreinterpret_s64_s32(a);
}
-// CHECK-LABEL: @test_vreinterpret_s64_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
int64x1_t test_vreinterpret_s64_u8(uint8x8_t a) {
return vreinterpret_s64_u8(a);
}
-// CHECK-LABEL: @test_vreinterpret_s64_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
int64x1_t test_vreinterpret_s64_u16(uint16x4_t a) {
return vreinterpret_s64_u16(a);
}
-// CHECK-LABEL: @test_vreinterpret_s64_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
int64x1_t test_vreinterpret_s64_u32(uint32x2_t a) {
return vreinterpret_s64_u32(a);
}
-// CHECK-LABEL: @test_vreinterpret_s64_u64(
-// CHECK: ret <1 x i64> %a
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <1 x i64> [[A]]
+//
int64x1_t test_vreinterpret_s64_u64(uint64x1_t a) {
return vreinterpret_s64_u64(a);
}
-// CHECK-LABEL: @test_vreinterpret_s64_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
int64x1_t test_vreinterpret_s64_f16(float16x4_t a) {
return vreinterpret_s64_f16(a);
}
-// CHECK-LABEL: @test_vreinterpret_s64_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
int64x1_t test_vreinterpret_s64_f32(float32x2_t a) {
return vreinterpret_s64_f32(a);
}
-// CHECK-LABEL: @test_vreinterpret_s64_p8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
int64x1_t test_vreinterpret_s64_p8(poly8x8_t a) {
return vreinterpret_s64_p8(a);
}
-// CHECK-LABEL: @test_vreinterpret_s64_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
int64x1_t test_vreinterpret_s64_p16(poly16x4_t a) {
return vreinterpret_s64_p16(a);
}
-// CHECK-LABEL: @test_vreinterpret_u8_s8(
-// CHECK: ret <8 x i8> %a
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <8 x i8> [[A]]
+//
uint8x8_t test_vreinterpret_u8_s8(int8x8_t a) {
return vreinterpret_u8_s8(a);
}
-// CHECK-LABEL: @test_vreinterpret_u8_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
uint8x8_t test_vreinterpret_u8_s16(int16x4_t a) {
return vreinterpret_u8_s16(a);
}
-// CHECK-LABEL: @test_vreinterpret_u8_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
uint8x8_t test_vreinterpret_u8_s32(int32x2_t a) {
return vreinterpret_u8_s32(a);
}
-// CHECK-LABEL: @test_vreinterpret_u8_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
uint8x8_t test_vreinterpret_u8_s64(int64x1_t a) {
return vreinterpret_u8_s64(a);
}
-// CHECK-LABEL: @test_vreinterpret_u8_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
uint8x8_t test_vreinterpret_u8_u16(uint16x4_t a) {
return vreinterpret_u8_u16(a);
}
-// CHECK-LABEL: @test_vreinterpret_u8_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
uint8x8_t test_vreinterpret_u8_u32(uint32x2_t a) {
return vreinterpret_u8_u32(a);
}
-// CHECK-LABEL: @test_vreinterpret_u8_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
uint8x8_t test_vreinterpret_u8_u64(uint64x1_t a) {
return vreinterpret_u8_u64(a);
}
-// CHECK-LABEL: @test_vreinterpret_u8_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
uint8x8_t test_vreinterpret_u8_f16(float16x4_t a) {
return vreinterpret_u8_f16(a);
}
-// CHECK-LABEL: @test_vreinterpret_u8_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
uint8x8_t test_vreinterpret_u8_f32(float32x2_t a) {
return vreinterpret_u8_f32(a);
}
-// CHECK-LABEL: @test_vreinterpret_u8_p8(
-// CHECK: ret <8 x i8> %a
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <8 x i8> [[A]]
+//
uint8x8_t test_vreinterpret_u8_p8(poly8x8_t a) {
return vreinterpret_u8_p8(a);
}
-// CHECK-LABEL: @test_vreinterpret_u8_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
uint8x8_t test_vreinterpret_u8_p16(poly16x4_t a) {
return vreinterpret_u8_p16(a);
}
-// CHECK-LABEL: @test_vreinterpret_u16_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
uint16x4_t test_vreinterpret_u16_s8(int8x8_t a) {
return vreinterpret_u16_s8(a);
}
-// CHECK-LABEL: @test_vreinterpret_u16_s16(
-// CHECK: ret <4 x i16> %a
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <4 x i16> [[A]]
+//
uint16x4_t test_vreinterpret_u16_s16(int16x4_t a) {
return vreinterpret_u16_s16(a);
}
-// CHECK-LABEL: @test_vreinterpret_u16_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
uint16x4_t test_vreinterpret_u16_s32(int32x2_t a) {
return vreinterpret_u16_s32(a);
}
-// CHECK-LABEL: @test_vreinterpret_u16_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
uint16x4_t test_vreinterpret_u16_s64(int64x1_t a) {
return vreinterpret_u16_s64(a);
}
-// CHECK-LABEL: @test_vreinterpret_u16_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
uint16x4_t test_vreinterpret_u16_u8(uint8x8_t a) {
return vreinterpret_u16_u8(a);
}
-// CHECK-LABEL: @test_vreinterpret_u16_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
uint16x4_t test_vreinterpret_u16_u32(uint32x2_t a) {
return vreinterpret_u16_u32(a);
}
-// CHECK-LABEL: @test_vreinterpret_u16_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
uint16x4_t test_vreinterpret_u16_u64(uint64x1_t a) {
return vreinterpret_u16_u64(a);
}
-// CHECK-LABEL: @test_vreinterpret_u16_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
uint16x4_t test_vreinterpret_u16_f16(float16x4_t a) {
return vreinterpret_u16_f16(a);
}
-// CHECK-LABEL: @test_vreinterpret_u16_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
uint16x4_t test_vreinterpret_u16_f32(float32x2_t a) {
return vreinterpret_u16_f32(a);
}
-// CHECK-LABEL: @test_vreinterpret_u16_p8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
uint16x4_t test_vreinterpret_u16_p8(poly8x8_t a) {
return vreinterpret_u16_p8(a);
}
-// CHECK-LABEL: @test_vreinterpret_u16_p16(
-// CHECK: ret <4 x i16> %a
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <4 x i16> [[A]]
+//
uint16x4_t test_vreinterpret_u16_p16(poly16x4_t a) {
return vreinterpret_u16_p16(a);
}
-// CHECK-LABEL: @test_vreinterpret_u32_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
uint32x2_t test_vreinterpret_u32_s8(int8x8_t a) {
return vreinterpret_u32_s8(a);
}
-// CHECK-LABEL: @test_vreinterpret_u32_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
uint32x2_t test_vreinterpret_u32_s16(int16x4_t a) {
return vreinterpret_u32_s16(a);
}
-// CHECK-LABEL: @test_vreinterpret_u32_s32(
-// CHECK: ret <2 x i32> %a
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <2 x i32> [[A]]
+//
uint32x2_t test_vreinterpret_u32_s32(int32x2_t a) {
return vreinterpret_u32_s32(a);
}
-// CHECK-LABEL: @test_vreinterpret_u32_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
uint32x2_t test_vreinterpret_u32_s64(int64x1_t a) {
return vreinterpret_u32_s64(a);
}
-// CHECK-LABEL: @test_vreinterpret_u32_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
uint32x2_t test_vreinterpret_u32_u8(uint8x8_t a) {
return vreinterpret_u32_u8(a);
}
-// CHECK-LABEL: @test_vreinterpret_u32_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
uint32x2_t test_vreinterpret_u32_u16(uint16x4_t a) {
return vreinterpret_u32_u16(a);
}
-// CHECK-LABEL: @test_vreinterpret_u32_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
uint32x2_t test_vreinterpret_u32_u64(uint64x1_t a) {
return vreinterpret_u32_u64(a);
}
-// CHECK-LABEL: @test_vreinterpret_u32_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
uint32x2_t test_vreinterpret_u32_f16(float16x4_t a) {
return vreinterpret_u32_f16(a);
}
-// CHECK-LABEL: @test_vreinterpret_u32_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
uint32x2_t test_vreinterpret_u32_f32(float32x2_t a) {
return vreinterpret_u32_f32(a);
}
-// CHECK-LABEL: @test_vreinterpret_u32_p8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
uint32x2_t test_vreinterpret_u32_p8(poly8x8_t a) {
return vreinterpret_u32_p8(a);
}
-// CHECK-LABEL: @test_vreinterpret_u32_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
uint32x2_t test_vreinterpret_u32_p16(poly16x4_t a) {
return vreinterpret_u32_p16(a);
}
-// CHECK-LABEL: @test_vreinterpret_u64_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
uint64x1_t test_vreinterpret_u64_s8(int8x8_t a) {
return vreinterpret_u64_s8(a);
}
-// CHECK-LABEL: @test_vreinterpret_u64_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
uint64x1_t test_vreinterpret_u64_s16(int16x4_t a) {
return vreinterpret_u64_s16(a);
}
-// CHECK-LABEL: @test_vreinterpret_u64_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
uint64x1_t test_vreinterpret_u64_s32(int32x2_t a) {
return vreinterpret_u64_s32(a);
}
-// CHECK-LABEL: @test_vreinterpret_u64_s64(
-// CHECK: ret <1 x i64> %a
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <1 x i64> [[A]]
+//
uint64x1_t test_vreinterpret_u64_s64(int64x1_t a) {
return vreinterpret_u64_s64(a);
}
-// CHECK-LABEL: @test_vreinterpret_u64_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
uint64x1_t test_vreinterpret_u64_u8(uint8x8_t a) {
return vreinterpret_u64_u8(a);
}
-// CHECK-LABEL: @test_vreinterpret_u64_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
uint64x1_t test_vreinterpret_u64_u16(uint16x4_t a) {
return vreinterpret_u64_u16(a);
}
-// CHECK-LABEL: @test_vreinterpret_u64_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
uint64x1_t test_vreinterpret_u64_u32(uint32x2_t a) {
return vreinterpret_u64_u32(a);
}
-// CHECK-LABEL: @test_vreinterpret_u64_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
uint64x1_t test_vreinterpret_u64_f16(float16x4_t a) {
return vreinterpret_u64_f16(a);
}
-// CHECK-LABEL: @test_vreinterpret_u64_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
uint64x1_t test_vreinterpret_u64_f32(float32x2_t a) {
return vreinterpret_u64_f32(a);
}
-// CHECK-LABEL: @test_vreinterpret_u64_p8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
uint64x1_t test_vreinterpret_u64_p8(poly8x8_t a) {
return vreinterpret_u64_p8(a);
}
-// CHECK-LABEL: @test_vreinterpret_u64_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
uint64x1_t test_vreinterpret_u64_p16(poly16x4_t a) {
return vreinterpret_u64_p16(a);
}
-// CHECK-LABEL: @test_vreinterpret_f16_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x half>
-// CHECK: ret <4 x half> [[TMP0]]
+// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x half>
+// CHECK-NEXT: ret <4 x half> [[TMP0]]
+//
float16x4_t test_vreinterpret_f16_s8(int8x8_t a) {
return vreinterpret_f16_s8(a);
}
-// CHECK-LABEL: @test_vreinterpret_f16_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <4 x half>
-// CHECK: ret <4 x half> [[TMP0]]
+// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <4 x half>
+// CHECK-NEXT: ret <4 x half> [[TMP0]]
+//
float16x4_t test_vreinterpret_f16_s16(int16x4_t a) {
return vreinterpret_f16_s16(a);
}
-// CHECK-LABEL: @test_vreinterpret_f16_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x half>
-// CHECK: ret <4 x half> [[TMP0]]
+// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <4 x half>
+// CHECK-NEXT: ret <4 x half> [[TMP0]]
+//
float16x4_t test_vreinterpret_f16_s32(int32x2_t a) {
return vreinterpret_f16_s32(a);
}
-// CHECK-LABEL: @test_vreinterpret_f16_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x half>
-// CHECK: ret <4 x half> [[TMP0]]
+// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x half>
+// CHECK-NEXT: ret <4 x half> [[TMP0]]
+//
float16x4_t test_vreinterpret_f16_s64(int64x1_t a) {
return vreinterpret_f16_s64(a);
}
-// CHECK-LABEL: @test_vreinterpret_f16_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x half>
-// CHECK: ret <4 x half> [[TMP0]]
+// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x half>
+// CHECK-NEXT: ret <4 x half> [[TMP0]]
+//
float16x4_t test_vreinterpret_f16_u8(uint8x8_t a) {
return vreinterpret_f16_u8(a);
}
-// CHECK-LABEL: @test_vreinterpret_f16_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <4 x half>
-// CHECK: ret <4 x half> [[TMP0]]
+// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <4 x half>
+// CHECK-NEXT: ret <4 x half> [[TMP0]]
+//
float16x4_t test_vreinterpret_f16_u16(uint16x4_t a) {
return vreinterpret_f16_u16(a);
}
-// CHECK-LABEL: @test_vreinterpret_f16_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x half>
-// CHECK: ret <4 x half> [[TMP0]]
+// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <4 x half>
+// CHECK-NEXT: ret <4 x half> [[TMP0]]
+//
float16x4_t test_vreinterpret_f16_u32(uint32x2_t a) {
return vreinterpret_f16_u32(a);
}
-// CHECK-LABEL: @test_vreinterpret_f16_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x half>
-// CHECK: ret <4 x half> [[TMP0]]
+// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x half>
+// CHECK-NEXT: ret <4 x half> [[TMP0]]
+//
float16x4_t test_vreinterpret_f16_u64(uint64x1_t a) {
return vreinterpret_f16_u64(a);
}
-// CHECK-LABEL: @test_vreinterpret_f16_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <4 x half>
-// CHECK: ret <4 x half> [[TMP0]]
+// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <4 x half>
+// CHECK-NEXT: ret <4 x half> [[TMP0]]
+//
float16x4_t test_vreinterpret_f16_f32(float32x2_t a) {
return vreinterpret_f16_f32(a);
}
-// CHECK-LABEL: @test_vreinterpret_f16_p8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x half>
-// CHECK: ret <4 x half> [[TMP0]]
+// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x half>
+// CHECK-NEXT: ret <4 x half> [[TMP0]]
+//
float16x4_t test_vreinterpret_f16_p8(poly8x8_t a) {
return vreinterpret_f16_p8(a);
}
-// CHECK-LABEL: @test_vreinterpret_f16_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <4 x half>
-// CHECK: ret <4 x half> [[TMP0]]
+// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <4 x half>
+// CHECK-NEXT: ret <4 x half> [[TMP0]]
+//
float16x4_t test_vreinterpret_f16_p16(poly16x4_t a) {
return vreinterpret_f16_p16(a);
}
-// CHECK-LABEL: @test_vreinterpret_f32_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x float>
-// CHECK: ret <2 x float> [[TMP0]]
+// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[TMP0]]
+//
float32x2_t test_vreinterpret_f32_s8(int8x8_t a) {
return vreinterpret_f32_s8(a);
}
-// CHECK-LABEL: @test_vreinterpret_f32_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x float>
-// CHECK: ret <2 x float> [[TMP0]]
+// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[TMP0]]
+//
float32x2_t test_vreinterpret_f32_s16(int16x4_t a) {
return vreinterpret_f32_s16(a);
}
-// CHECK-LABEL: @test_vreinterpret_f32_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <2 x float>
-// CHECK: ret <2 x float> [[TMP0]]
+// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[TMP0]]
+//
float32x2_t test_vreinterpret_f32_s32(int32x2_t a) {
return vreinterpret_f32_s32(a);
}
-// CHECK-LABEL: @test_vreinterpret_f32_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x float>
-// CHECK: ret <2 x float> [[TMP0]]
+// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[TMP0]]
+//
float32x2_t test_vreinterpret_f32_s64(int64x1_t a) {
return vreinterpret_f32_s64(a);
}
-// CHECK-LABEL: @test_vreinterpret_f32_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x float>
-// CHECK: ret <2 x float> [[TMP0]]
+// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[TMP0]]
+//
float32x2_t test_vreinterpret_f32_u8(uint8x8_t a) {
return vreinterpret_f32_u8(a);
}
-// CHECK-LABEL: @test_vreinterpret_f32_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x float>
-// CHECK: ret <2 x float> [[TMP0]]
+// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[TMP0]]
+//
float32x2_t test_vreinterpret_f32_u16(uint16x4_t a) {
return vreinterpret_f32_u16(a);
}
-// CHECK-LABEL: @test_vreinterpret_f32_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <2 x float>
-// CHECK: ret <2 x float> [[TMP0]]
+// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[TMP0]]
+//
float32x2_t test_vreinterpret_f32_u32(uint32x2_t a) {
return vreinterpret_f32_u32(a);
}
-// CHECK-LABEL: @test_vreinterpret_f32_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x float>
-// CHECK: ret <2 x float> [[TMP0]]
+// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[TMP0]]
+//
float32x2_t test_vreinterpret_f32_u64(uint64x1_t a) {
return vreinterpret_f32_u64(a);
}
-// CHECK-LABEL: @test_vreinterpret_f32_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <2 x float>
-// CHECK: ret <2 x float> [[TMP0]]
+// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[TMP0]]
+//
float32x2_t test_vreinterpret_f32_f16(float16x4_t a) {
return vreinterpret_f32_f16(a);
}
-// CHECK-LABEL: @test_vreinterpret_f32_p8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x float>
-// CHECK: ret <2 x float> [[TMP0]]
+// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[TMP0]]
+//
float32x2_t test_vreinterpret_f32_p8(poly8x8_t a) {
return vreinterpret_f32_p8(a);
}
-// CHECK-LABEL: @test_vreinterpret_f32_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x float>
-// CHECK: ret <2 x float> [[TMP0]]
+// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[TMP0]]
+//
float32x2_t test_vreinterpret_f32_p16(poly16x4_t a) {
return vreinterpret_f32_p16(a);
}
-// CHECK-LABEL: @test_vreinterpret_p8_s8(
-// CHECK: ret <8 x i8> %a
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <8 x i8> [[A]]
+//
poly8x8_t test_vreinterpret_p8_s8(int8x8_t a) {
return vreinterpret_p8_s8(a);
}
-// CHECK-LABEL: @test_vreinterpret_p8_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
poly8x8_t test_vreinterpret_p8_s16(int16x4_t a) {
return vreinterpret_p8_s16(a);
}
-// CHECK-LABEL: @test_vreinterpret_p8_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
poly8x8_t test_vreinterpret_p8_s32(int32x2_t a) {
return vreinterpret_p8_s32(a);
}
-// CHECK-LABEL: @test_vreinterpret_p8_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
poly8x8_t test_vreinterpret_p8_s64(int64x1_t a) {
return vreinterpret_p8_s64(a);
}
-// CHECK-LABEL: @test_vreinterpret_p8_u8(
-// CHECK: ret <8 x i8> %a
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <8 x i8> [[A]]
+//
poly8x8_t test_vreinterpret_p8_u8(uint8x8_t a) {
return vreinterpret_p8_u8(a);
}
-// CHECK-LABEL: @test_vreinterpret_p8_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
poly8x8_t test_vreinterpret_p8_u16(uint16x4_t a) {
return vreinterpret_p8_u16(a);
}
-// CHECK-LABEL: @test_vreinterpret_p8_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
poly8x8_t test_vreinterpret_p8_u32(uint32x2_t a) {
return vreinterpret_p8_u32(a);
}
-// CHECK-LABEL: @test_vreinterpret_p8_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
poly8x8_t test_vreinterpret_p8_u64(uint64x1_t a) {
return vreinterpret_p8_u64(a);
}
-// CHECK-LABEL: @test_vreinterpret_p8_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
poly8x8_t test_vreinterpret_p8_f16(float16x4_t a) {
return vreinterpret_p8_f16(a);
}
-// CHECK-LABEL: @test_vreinterpret_p8_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
poly8x8_t test_vreinterpret_p8_f32(float32x2_t a) {
return vreinterpret_p8_f32(a);
}
-// CHECK-LABEL: @test_vreinterpret_p8_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
poly8x8_t test_vreinterpret_p8_p16(poly16x4_t a) {
return vreinterpret_p8_p16(a);
}
-// CHECK-LABEL: @test_vreinterpret_p16_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
poly16x4_t test_vreinterpret_p16_s8(int8x8_t a) {
return vreinterpret_p16_s8(a);
}
-// CHECK-LABEL: @test_vreinterpret_p16_s16(
-// CHECK: ret <4 x i16> %a
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <4 x i16> [[A]]
+//
poly16x4_t test_vreinterpret_p16_s16(int16x4_t a) {
return vreinterpret_p16_s16(a);
}
-// CHECK-LABEL: @test_vreinterpret_p16_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
poly16x4_t test_vreinterpret_p16_s32(int32x2_t a) {
return vreinterpret_p16_s32(a);
}
-// CHECK-LABEL: @test_vreinterpret_p16_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
poly16x4_t test_vreinterpret_p16_s64(int64x1_t a) {
return vreinterpret_p16_s64(a);
}
-// CHECK-LABEL: @test_vreinterpret_p16_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
poly16x4_t test_vreinterpret_p16_u8(uint8x8_t a) {
return vreinterpret_p16_u8(a);
}
-// CHECK-LABEL: @test_vreinterpret_p16_u16(
-// CHECK: ret <4 x i16> %a
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <4 x i16> [[A]]
+//
poly16x4_t test_vreinterpret_p16_u16(uint16x4_t a) {
return vreinterpret_p16_u16(a);
}
-// CHECK-LABEL: @test_vreinterpret_p16_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
poly16x4_t test_vreinterpret_p16_u32(uint32x2_t a) {
return vreinterpret_p16_u32(a);
}
-// CHECK-LABEL: @test_vreinterpret_p16_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
poly16x4_t test_vreinterpret_p16_u64(uint64x1_t a) {
return vreinterpret_p16_u64(a);
}
-// CHECK-LABEL: @test_vreinterpret_p16_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
poly16x4_t test_vreinterpret_p16_f16(float16x4_t a) {
return vreinterpret_p16_f16(a);
}
-// CHECK-LABEL: @test_vreinterpret_p16_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
poly16x4_t test_vreinterpret_p16_f32(float32x2_t a) {
return vreinterpret_p16_f32(a);
}
-// CHECK-LABEL: @test_vreinterpret_p16_p8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
poly16x4_t test_vreinterpret_p16_p8(poly8x8_t a) {
return vreinterpret_p16_p8(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s8_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
int8x16_t test_vreinterpretq_s8_s16(int16x8_t a) {
return vreinterpretq_s8_s16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s8_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
int8x16_t test_vreinterpretq_s8_s32(int32x4_t a) {
return vreinterpretq_s8_s32(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s8_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
int8x16_t test_vreinterpretq_s8_s64(int64x2_t a) {
return vreinterpretq_s8_s64(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s8_u8(
-// CHECK: ret <16 x i8> %a
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <16 x i8> [[A]]
+//
int8x16_t test_vreinterpretq_s8_u8(uint8x16_t a) {
return vreinterpretq_s8_u8(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s8_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
int8x16_t test_vreinterpretq_s8_u16(uint16x8_t a) {
return vreinterpretq_s8_u16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s8_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
int8x16_t test_vreinterpretq_s8_u32(uint32x4_t a) {
return vreinterpretq_s8_u32(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s8_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
int8x16_t test_vreinterpretq_s8_u64(uint64x2_t a) {
return vreinterpretq_s8_u64(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s8_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
int8x16_t test_vreinterpretq_s8_f16(float16x8_t a) {
return vreinterpretq_s8_f16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s8_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
int8x16_t test_vreinterpretq_s8_f32(float32x4_t a) {
return vreinterpretq_s8_f32(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s8_p8(
-// CHECK: ret <16 x i8> %a
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <16 x i8> [[A]]
+//
int8x16_t test_vreinterpretq_s8_p8(poly8x16_t a) {
return vreinterpretq_s8_p8(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s8_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
int8x16_t test_vreinterpretq_s8_p16(poly16x8_t a) {
return vreinterpretq_s8_p16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s16_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
int16x8_t test_vreinterpretq_s16_s8(int8x16_t a) {
return vreinterpretq_s16_s8(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s16_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
int16x8_t test_vreinterpretq_s16_s32(int32x4_t a) {
return vreinterpretq_s16_s32(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s16_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
int16x8_t test_vreinterpretq_s16_s64(int64x2_t a) {
return vreinterpretq_s16_s64(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s16_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
int16x8_t test_vreinterpretq_s16_u8(uint8x16_t a) {
return vreinterpretq_s16_u8(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s16_u16(
-// CHECK: ret <8 x i16> %a
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <8 x i16> [[A]]
+//
int16x8_t test_vreinterpretq_s16_u16(uint16x8_t a) {
return vreinterpretq_s16_u16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s16_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
int16x8_t test_vreinterpretq_s16_u32(uint32x4_t a) {
return vreinterpretq_s16_u32(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s16_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
int16x8_t test_vreinterpretq_s16_u64(uint64x2_t a) {
return vreinterpretq_s16_u64(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s16_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
int16x8_t test_vreinterpretq_s16_f16(float16x8_t a) {
return vreinterpretq_s16_f16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s16_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
int16x8_t test_vreinterpretq_s16_f32(float32x4_t a) {
return vreinterpretq_s16_f32(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s16_p8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
int16x8_t test_vreinterpretq_s16_p8(poly8x16_t a) {
return vreinterpretq_s16_p8(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s16_p16(
-// CHECK: ret <8 x i16> %a
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <8 x i16> [[A]]
+//
int16x8_t test_vreinterpretq_s16_p16(poly16x8_t a) {
return vreinterpretq_s16_p16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s32_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
int32x4_t test_vreinterpretq_s32_s8(int8x16_t a) {
return vreinterpretq_s32_s8(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s32_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
int32x4_t test_vreinterpretq_s32_s16(int16x8_t a) {
return vreinterpretq_s32_s16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s32_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
int32x4_t test_vreinterpretq_s32_s64(int64x2_t a) {
return vreinterpretq_s32_s64(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s32_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
int32x4_t test_vreinterpretq_s32_u8(uint8x16_t a) {
return vreinterpretq_s32_u8(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s32_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
int32x4_t test_vreinterpretq_s32_u16(uint16x8_t a) {
return vreinterpretq_s32_u16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s32_u32(
-// CHECK: ret <4 x i32> %a
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <4 x i32> [[A]]
+//
int32x4_t test_vreinterpretq_s32_u32(uint32x4_t a) {
return vreinterpretq_s32_u32(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s32_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
int32x4_t test_vreinterpretq_s32_u64(uint64x2_t a) {
return vreinterpretq_s32_u64(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s32_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
int32x4_t test_vreinterpretq_s32_f16(float16x8_t a) {
return vreinterpretq_s32_f16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s32_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
int32x4_t test_vreinterpretq_s32_f32(float32x4_t a) {
return vreinterpretq_s32_f32(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s32_p8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
int32x4_t test_vreinterpretq_s32_p8(poly8x16_t a) {
return vreinterpretq_s32_p8(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s32_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
int32x4_t test_vreinterpretq_s32_p16(poly16x8_t a) {
return vreinterpretq_s32_p16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s64_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
int64x2_t test_vreinterpretq_s64_s8(int8x16_t a) {
return vreinterpretq_s64_s8(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s64_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
int64x2_t test_vreinterpretq_s64_s16(int16x8_t a) {
return vreinterpretq_s64_s16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s64_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
int64x2_t test_vreinterpretq_s64_s32(int32x4_t a) {
return vreinterpretq_s64_s32(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s64_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
int64x2_t test_vreinterpretq_s64_u8(uint8x16_t a) {
return vreinterpretq_s64_u8(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s64_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
int64x2_t test_vreinterpretq_s64_u16(uint16x8_t a) {
return vreinterpretq_s64_u16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s64_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
int64x2_t test_vreinterpretq_s64_u32(uint32x4_t a) {
return vreinterpretq_s64_u32(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s64_u64(
-// CHECK: ret <2 x i64> %a
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <2 x i64> [[A]]
+//
int64x2_t test_vreinterpretq_s64_u64(uint64x2_t a) {
return vreinterpretq_s64_u64(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s64_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
int64x2_t test_vreinterpretq_s64_f16(float16x8_t a) {
return vreinterpretq_s64_f16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s64_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
int64x2_t test_vreinterpretq_s64_f32(float32x4_t a) {
return vreinterpretq_s64_f32(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s64_p8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
int64x2_t test_vreinterpretq_s64_p8(poly8x16_t a) {
return vreinterpretq_s64_p8(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s64_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
int64x2_t test_vreinterpretq_s64_p16(poly16x8_t a) {
return vreinterpretq_s64_p16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u8_s8(
-// CHECK: ret <16 x i8> %a
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <16 x i8> [[A]]
+//
uint8x16_t test_vreinterpretq_u8_s8(int8x16_t a) {
return vreinterpretq_u8_s8(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u8_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
uint8x16_t test_vreinterpretq_u8_s16(int16x8_t a) {
return vreinterpretq_u8_s16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u8_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
uint8x16_t test_vreinterpretq_u8_s32(int32x4_t a) {
return vreinterpretq_u8_s32(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u8_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
uint8x16_t test_vreinterpretq_u8_s64(int64x2_t a) {
return vreinterpretq_u8_s64(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u8_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
uint8x16_t test_vreinterpretq_u8_u16(uint16x8_t a) {
return vreinterpretq_u8_u16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u8_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
uint8x16_t test_vreinterpretq_u8_u32(uint32x4_t a) {
return vreinterpretq_u8_u32(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u8_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
uint8x16_t test_vreinterpretq_u8_u64(uint64x2_t a) {
return vreinterpretq_u8_u64(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u8_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
uint8x16_t test_vreinterpretq_u8_f16(float16x8_t a) {
return vreinterpretq_u8_f16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u8_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
uint8x16_t test_vreinterpretq_u8_f32(float32x4_t a) {
return vreinterpretq_u8_f32(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u8_p8(
-// CHECK: ret <16 x i8> %a
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <16 x i8> [[A]]
+//
uint8x16_t test_vreinterpretq_u8_p8(poly8x16_t a) {
return vreinterpretq_u8_p8(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u8_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
uint8x16_t test_vreinterpretq_u8_p16(poly16x8_t a) {
return vreinterpretq_u8_p16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u16_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
uint16x8_t test_vreinterpretq_u16_s8(int8x16_t a) {
return vreinterpretq_u16_s8(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u16_s16(
-// CHECK: ret <8 x i16> %a
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <8 x i16> [[A]]
+//
uint16x8_t test_vreinterpretq_u16_s16(int16x8_t a) {
return vreinterpretq_u16_s16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u16_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
uint16x8_t test_vreinterpretq_u16_s32(int32x4_t a) {
return vreinterpretq_u16_s32(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u16_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
uint16x8_t test_vreinterpretq_u16_s64(int64x2_t a) {
return vreinterpretq_u16_s64(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u16_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
uint16x8_t test_vreinterpretq_u16_u8(uint8x16_t a) {
return vreinterpretq_u16_u8(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u16_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
uint16x8_t test_vreinterpretq_u16_u32(uint32x4_t a) {
return vreinterpretq_u16_u32(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u16_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
uint16x8_t test_vreinterpretq_u16_u64(uint64x2_t a) {
return vreinterpretq_u16_u64(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u16_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
uint16x8_t test_vreinterpretq_u16_f16(float16x8_t a) {
return vreinterpretq_u16_f16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u16_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
uint16x8_t test_vreinterpretq_u16_f32(float32x4_t a) {
return vreinterpretq_u16_f32(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u16_p8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
uint16x8_t test_vreinterpretq_u16_p8(poly8x16_t a) {
return vreinterpretq_u16_p8(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u16_p16(
-// CHECK: ret <8 x i16> %a
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <8 x i16> [[A]]
+//
uint16x8_t test_vreinterpretq_u16_p16(poly16x8_t a) {
return vreinterpretq_u16_p16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u32_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
uint32x4_t test_vreinterpretq_u32_s8(int8x16_t a) {
return vreinterpretq_u32_s8(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u32_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
uint32x4_t test_vreinterpretq_u32_s16(int16x8_t a) {
return vreinterpretq_u32_s16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u32_s32(
-// CHECK: ret <4 x i32> %a
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <4 x i32> [[A]]
+//
uint32x4_t test_vreinterpretq_u32_s32(int32x4_t a) {
return vreinterpretq_u32_s32(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u32_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
uint32x4_t test_vreinterpretq_u32_s64(int64x2_t a) {
return vreinterpretq_u32_s64(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u32_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
uint32x4_t test_vreinterpretq_u32_u8(uint8x16_t a) {
return vreinterpretq_u32_u8(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u32_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
uint32x4_t test_vreinterpretq_u32_u16(uint16x8_t a) {
return vreinterpretq_u32_u16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u32_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
uint32x4_t test_vreinterpretq_u32_u64(uint64x2_t a) {
return vreinterpretq_u32_u64(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u32_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
uint32x4_t test_vreinterpretq_u32_f16(float16x8_t a) {
return vreinterpretq_u32_f16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u32_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
uint32x4_t test_vreinterpretq_u32_f32(float32x4_t a) {
return vreinterpretq_u32_f32(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u32_p8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
uint32x4_t test_vreinterpretq_u32_p8(poly8x16_t a) {
return vreinterpretq_u32_p8(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u32_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
uint32x4_t test_vreinterpretq_u32_p16(poly16x8_t a) {
return vreinterpretq_u32_p16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u64_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
uint64x2_t test_vreinterpretq_u64_s8(int8x16_t a) {
return vreinterpretq_u64_s8(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u64_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
uint64x2_t test_vreinterpretq_u64_s16(int16x8_t a) {
return vreinterpretq_u64_s16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u64_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
uint64x2_t test_vreinterpretq_u64_s32(int32x4_t a) {
return vreinterpretq_u64_s32(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u64_s64(
-// CHECK: ret <2 x i64> %a
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <2 x i64> [[A]]
+//
uint64x2_t test_vreinterpretq_u64_s64(int64x2_t a) {
return vreinterpretq_u64_s64(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u64_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
uint64x2_t test_vreinterpretq_u64_u8(uint8x16_t a) {
return vreinterpretq_u64_u8(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u64_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
uint64x2_t test_vreinterpretq_u64_u16(uint16x8_t a) {
return vreinterpretq_u64_u16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u64_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
uint64x2_t test_vreinterpretq_u64_u32(uint32x4_t a) {
return vreinterpretq_u64_u32(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u64_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
uint64x2_t test_vreinterpretq_u64_f16(float16x8_t a) {
return vreinterpretq_u64_f16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u64_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
uint64x2_t test_vreinterpretq_u64_f32(float32x4_t a) {
return vreinterpretq_u64_f32(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u64_p8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
uint64x2_t test_vreinterpretq_u64_p8(poly8x16_t a) {
return vreinterpretq_u64_p8(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u64_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
uint64x2_t test_vreinterpretq_u64_p16(poly16x8_t a) {
return vreinterpretq_u64_p16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_f16_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x half>
-// CHECK: ret <8 x half> [[TMP0]]
+// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x half>
+// CHECK-NEXT: ret <8 x half> [[TMP0]]
+//
float16x8_t test_vreinterpretq_f16_s8(int8x16_t a) {
return vreinterpretq_f16_s8(a);
}
-// CHECK-LABEL: @test_vreinterpretq_f16_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <8 x half>
-// CHECK: ret <8 x half> [[TMP0]]
+// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <8 x half>
+// CHECK-NEXT: ret <8 x half> [[TMP0]]
+//
float16x8_t test_vreinterpretq_f16_s16(int16x8_t a) {
return vreinterpretq_f16_s16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_f16_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x half>
-// CHECK: ret <8 x half> [[TMP0]]
+// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x half>
+// CHECK-NEXT: ret <8 x half> [[TMP0]]
+//
float16x8_t test_vreinterpretq_f16_s32(int32x4_t a) {
return vreinterpretq_f16_s32(a);
}
-// CHECK-LABEL: @test_vreinterpretq_f16_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x half>
-// CHECK: ret <8 x half> [[TMP0]]
+// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <8 x half>
+// CHECK-NEXT: ret <8 x half> [[TMP0]]
+//
float16x8_t test_vreinterpretq_f16_s64(int64x2_t a) {
return vreinterpretq_f16_s64(a);
}
-// CHECK-LABEL: @test_vreinterpretq_f16_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x half>
-// CHECK: ret <8 x half> [[TMP0]]
+// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x half>
+// CHECK-NEXT: ret <8 x half> [[TMP0]]
+//
float16x8_t test_vreinterpretq_f16_u8(uint8x16_t a) {
return vreinterpretq_f16_u8(a);
}
-// CHECK-LABEL: @test_vreinterpretq_f16_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <8 x half>
-// CHECK: ret <8 x half> [[TMP0]]
+// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <8 x half>
+// CHECK-NEXT: ret <8 x half> [[TMP0]]
+//
float16x8_t test_vreinterpretq_f16_u16(uint16x8_t a) {
return vreinterpretq_f16_u16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_f16_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x half>
-// CHECK: ret <8 x half> [[TMP0]]
+// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x half>
+// CHECK-NEXT: ret <8 x half> [[TMP0]]
+//
float16x8_t test_vreinterpretq_f16_u32(uint32x4_t a) {
return vreinterpretq_f16_u32(a);
}
-// CHECK-LABEL: @test_vreinterpretq_f16_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x half>
-// CHECK: ret <8 x half> [[TMP0]]
+// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <8 x half>
+// CHECK-NEXT: ret <8 x half> [[TMP0]]
+//
float16x8_t test_vreinterpretq_f16_u64(uint64x2_t a) {
return vreinterpretq_f16_u64(a);
}
-// CHECK-LABEL: @test_vreinterpretq_f16_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <8 x half>
-// CHECK: ret <8 x half> [[TMP0]]
+// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <8 x half>
+// CHECK-NEXT: ret <8 x half> [[TMP0]]
+//
float16x8_t test_vreinterpretq_f16_f32(float32x4_t a) {
return vreinterpretq_f16_f32(a);
}
-// CHECK-LABEL: @test_vreinterpretq_f16_p8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x half>
-// CHECK: ret <8 x half> [[TMP0]]
+// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x half>
+// CHECK-NEXT: ret <8 x half> [[TMP0]]
+//
float16x8_t test_vreinterpretq_f16_p8(poly8x16_t a) {
return vreinterpretq_f16_p8(a);
}
-// CHECK-LABEL: @test_vreinterpretq_f16_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <8 x half>
-// CHECK: ret <8 x half> [[TMP0]]
+// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <8 x half>
+// CHECK-NEXT: ret <8 x half> [[TMP0]]
+//
float16x8_t test_vreinterpretq_f16_p16(poly16x8_t a) {
return vreinterpretq_f16_p16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_f32_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x float>
-// CHECK: ret <4 x float> [[TMP0]]
+// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[TMP0]]
+//
float32x4_t test_vreinterpretq_f32_s8(int8x16_t a) {
return vreinterpretq_f32_s8(a);
}
-// CHECK-LABEL: @test_vreinterpretq_f32_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x float>
-// CHECK: ret <4 x float> [[TMP0]]
+// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[TMP0]]
+//
float32x4_t test_vreinterpretq_f32_s16(int16x8_t a) {
return vreinterpretq_f32_s16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_f32_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <4 x float>
-// CHECK: ret <4 x float> [[TMP0]]
+// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[TMP0]]
+//
float32x4_t test_vreinterpretq_f32_s32(int32x4_t a) {
return vreinterpretq_f32_s32(a);
}
-// CHECK-LABEL: @test_vreinterpretq_f32_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x float>
-// CHECK: ret <4 x float> [[TMP0]]
+// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[TMP0]]
+//
float32x4_t test_vreinterpretq_f32_s64(int64x2_t a) {
return vreinterpretq_f32_s64(a);
}
-// CHECK-LABEL: @test_vreinterpretq_f32_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x float>
-// CHECK: ret <4 x float> [[TMP0]]
+// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[TMP0]]
+//
float32x4_t test_vreinterpretq_f32_u8(uint8x16_t a) {
return vreinterpretq_f32_u8(a);
}
-// CHECK-LABEL: @test_vreinterpretq_f32_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x float>
-// CHECK: ret <4 x float> [[TMP0]]
+// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[TMP0]]
+//
float32x4_t test_vreinterpretq_f32_u16(uint16x8_t a) {
return vreinterpretq_f32_u16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_f32_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <4 x float>
-// CHECK: ret <4 x float> [[TMP0]]
+// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[TMP0]]
+//
float32x4_t test_vreinterpretq_f32_u32(uint32x4_t a) {
return vreinterpretq_f32_u32(a);
}
-// CHECK-LABEL: @test_vreinterpretq_f32_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x float>
-// CHECK: ret <4 x float> [[TMP0]]
+// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[TMP0]]
+//
float32x4_t test_vreinterpretq_f32_u64(uint64x2_t a) {
return vreinterpretq_f32_u64(a);
}
-// CHECK-LABEL: @test_vreinterpretq_f32_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <4 x float>
-// CHECK: ret <4 x float> [[TMP0]]
+// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[TMP0]]
+//
float32x4_t test_vreinterpretq_f32_f16(float16x8_t a) {
return vreinterpretq_f32_f16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_f32_p8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x float>
-// CHECK: ret <4 x float> [[TMP0]]
+// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[TMP0]]
+//
float32x4_t test_vreinterpretq_f32_p8(poly8x16_t a) {
return vreinterpretq_f32_p8(a);
}
-// CHECK-LABEL: @test_vreinterpretq_f32_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x float>
-// CHECK: ret <4 x float> [[TMP0]]
+// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[TMP0]]
+//
float32x4_t test_vreinterpretq_f32_p16(poly16x8_t a) {
return vreinterpretq_f32_p16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_p8_s8(
-// CHECK: ret <16 x i8> %a
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <16 x i8> [[A]]
+//
poly8x16_t test_vreinterpretq_p8_s8(int8x16_t a) {
return vreinterpretq_p8_s8(a);
}
-// CHECK-LABEL: @test_vreinterpretq_p8_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
poly8x16_t test_vreinterpretq_p8_s16(int16x8_t a) {
return vreinterpretq_p8_s16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_p8_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
poly8x16_t test_vreinterpretq_p8_s32(int32x4_t a) {
return vreinterpretq_p8_s32(a);
}
-// CHECK-LABEL: @test_vreinterpretq_p8_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
poly8x16_t test_vreinterpretq_p8_s64(int64x2_t a) {
return vreinterpretq_p8_s64(a);
}
-// CHECK-LABEL: @test_vreinterpretq_p8_u8(
-// CHECK: ret <16 x i8> %a
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <16 x i8> [[A]]
+//
poly8x16_t test_vreinterpretq_p8_u8(uint8x16_t a) {
return vreinterpretq_p8_u8(a);
}
-// CHECK-LABEL: @test_vreinterpretq_p8_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
poly8x16_t test_vreinterpretq_p8_u16(uint16x8_t a) {
return vreinterpretq_p8_u16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_p8_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
poly8x16_t test_vreinterpretq_p8_u32(uint32x4_t a) {
return vreinterpretq_p8_u32(a);
}
-// CHECK-LABEL: @test_vreinterpretq_p8_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
poly8x16_t test_vreinterpretq_p8_u64(uint64x2_t a) {
return vreinterpretq_p8_u64(a);
}
-// CHECK-LABEL: @test_vreinterpretq_p8_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
poly8x16_t test_vreinterpretq_p8_f16(float16x8_t a) {
return vreinterpretq_p8_f16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_p8_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
poly8x16_t test_vreinterpretq_p8_f32(float32x4_t a) {
return vreinterpretq_p8_f32(a);
}
-// CHECK-LABEL: @test_vreinterpretq_p8_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
poly8x16_t test_vreinterpretq_p8_p16(poly16x8_t a) {
return vreinterpretq_p8_p16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_p16_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
poly16x8_t test_vreinterpretq_p16_s8(int8x16_t a) {
return vreinterpretq_p16_s8(a);
}
-// CHECK-LABEL: @test_vreinterpretq_p16_s16(
-// CHECK: ret <8 x i16> %a
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <8 x i16> [[A]]
+//
poly16x8_t test_vreinterpretq_p16_s16(int16x8_t a) {
return vreinterpretq_p16_s16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_p16_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
poly16x8_t test_vreinterpretq_p16_s32(int32x4_t a) {
return vreinterpretq_p16_s32(a);
}
-// CHECK-LABEL: @test_vreinterpretq_p16_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
poly16x8_t test_vreinterpretq_p16_s64(int64x2_t a) {
return vreinterpretq_p16_s64(a);
}
-// CHECK-LABEL: @test_vreinterpretq_p16_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
poly16x8_t test_vreinterpretq_p16_u8(uint8x16_t a) {
return vreinterpretq_p16_u8(a);
}
-// CHECK-LABEL: @test_vreinterpretq_p16_u16(
-// CHECK: ret <8 x i16> %a
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <8 x i16> [[A]]
+//
poly16x8_t test_vreinterpretq_p16_u16(uint16x8_t a) {
return vreinterpretq_p16_u16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_p16_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
poly16x8_t test_vreinterpretq_p16_u32(uint32x4_t a) {
return vreinterpretq_p16_u32(a);
}
-// CHECK-LABEL: @test_vreinterpretq_p16_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
poly16x8_t test_vreinterpretq_p16_u64(uint64x2_t a) {
return vreinterpretq_p16_u64(a);
}
-// CHECK-LABEL: @test_vreinterpretq_p16_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
poly16x8_t test_vreinterpretq_p16_f16(float16x8_t a) {
return vreinterpretq_p16_f16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_p16_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
poly16x8_t test_vreinterpretq_p16_f32(float32x4_t a) {
return vreinterpretq_p16_f32(a);
}
-// CHECK-LABEL: @test_vreinterpretq_p16_p8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
poly16x8_t test_vreinterpretq_p16_p8(poly8x16_t a) {
return vreinterpretq_p16_p8(a);
}
-// CHECK-LABEL: @test_vrev16_s8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define <8 x i8> @test_vrev16_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[A]], <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
int8x8_t test_vrev16_s8(int8x8_t a) {
return vrev16_s8(a);
}
-// CHECK-LABEL: @test_vrev16_u8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define <8 x i8> @test_vrev16_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[A]], <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
uint8x8_t test_vrev16_u8(uint8x8_t a) {
return vrev16_u8(a);
}
-// CHECK-LABEL: @test_vrev16_p8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define <8 x i8> @test_vrev16_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[A]], <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
poly8x8_t test_vrev16_p8(poly8x8_t a) {
return vrev16_p8(a);
}
-// CHECK-LABEL: @test_vrev16q_s8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define <16 x i8> @test_vrev16q_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[A]], <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
int8x16_t test_vrev16q_s8(int8x16_t a) {
return vrev16q_s8(a);
}
-// CHECK-LABEL: @test_vrev16q_u8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define <16 x i8> @test_vrev16q_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[A]], <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
uint8x16_t test_vrev16q_u8(uint8x16_t a) {
return vrev16q_u8(a);
}
-// CHECK-LABEL: @test_vrev16q_p8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define <16 x i8> @test_vrev16q_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[A]], <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
poly8x16_t test_vrev16q_p8(poly8x16_t a) {
return vrev16q_p8(a);
}
-// CHECK-LABEL: @test_vrev32_s8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define <8 x i8> @test_vrev32_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[A]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
int8x8_t test_vrev32_s8(int8x8_t a) {
return vrev32_s8(a);
}
-// CHECK-LABEL: @test_vrev32_s16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-// CHECK: ret <4 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define <4 x i16> @test_vrev32_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[A]], <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
+//
int16x4_t test_vrev32_s16(int16x4_t a) {
return vrev32_s16(a);
}
-// CHECK-LABEL: @test_vrev32_u8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define <8 x i8> @test_vrev32_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[A]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
uint8x8_t test_vrev32_u8(uint8x8_t a) {
return vrev32_u8(a);
}
-// CHECK-LABEL: @test_vrev32_u16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-// CHECK: ret <4 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define <4 x i16> @test_vrev32_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[A]], <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
+//
uint16x4_t test_vrev32_u16(uint16x4_t a) {
return vrev32_u16(a);
}
-// CHECK-LABEL: @test_vrev32_p8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define <8 x i8> @test_vrev32_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[A]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
poly8x8_t test_vrev32_p8(poly8x8_t a) {
return vrev32_p8(a);
}
-// CHECK-LABEL: @test_vrev32_p16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-// CHECK: ret <4 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define <4 x i16> @test_vrev32_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[A]], <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
+//
poly16x4_t test_vrev32_p16(poly16x4_t a) {
return vrev32_p16(a);
}
-// CHECK-LABEL: @test_vrev32q_s8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define <16 x i8> @test_vrev32q_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[A]], <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
int8x16_t test_vrev32q_s8(int8x16_t a) {
return vrev32q_s8(a);
}
-// CHECK-LABEL: @test_vrev32q_s16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define <8 x i16> @test_vrev32q_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[A]], <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
int16x8_t test_vrev32q_s16(int16x8_t a) {
return vrev32q_s16(a);
}
-// CHECK-LABEL: @test_vrev32q_u8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define <16 x i8> @test_vrev32q_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[A]], <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
uint8x16_t test_vrev32q_u8(uint8x16_t a) {
return vrev32q_u8(a);
}
-// CHECK-LABEL: @test_vrev32q_u16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define <8 x i16> @test_vrev32q_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[A]], <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
uint16x8_t test_vrev32q_u16(uint16x8_t a) {
return vrev32q_u16(a);
}
-// CHECK-LABEL: @test_vrev32q_p8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define <16 x i8> @test_vrev32q_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[A]], <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
poly8x16_t test_vrev32q_p8(poly8x16_t a) {
return vrev32q_p8(a);
}
-// CHECK-LABEL: @test_vrev32q_p16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define <8 x i16> @test_vrev32q_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[A]], <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
poly16x8_t test_vrev32q_p16(poly16x8_t a) {
return vrev32q_p16(a);
}
-// CHECK-LABEL: @test_vrev64_s8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define <8 x i8> @test_vrev64_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[A]], <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
int8x8_t test_vrev64_s8(int8x8_t a) {
return vrev64_s8(a);
}
-// CHECK-LABEL: @test_vrev64_s16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-// CHECK: ret <4 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define <4 x i16> @test_vrev64_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[A]], <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
+//
int16x4_t test_vrev64_s16(int16x4_t a) {
return vrev64_s16(a);
}
-// CHECK-LABEL: @test_vrev64_s32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <2 x i32> <i32 1, i32 0>
-// CHECK: ret <2 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define <2 x i32> @test_vrev64_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[A]], <2 x i32> <i32 1, i32 0>
+// CHECK-NEXT: ret <2 x i32> [[SHUFFLE_I]]
+//
int32x2_t test_vrev64_s32(int32x2_t a) {
return vrev64_s32(a);
}
-// CHECK-LABEL: @test_vrev64_u8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define <8 x i8> @test_vrev64_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[A]], <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
uint8x8_t test_vrev64_u8(uint8x8_t a) {
return vrev64_u8(a);
}
-// CHECK-LABEL: @test_vrev64_u16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-// CHECK: ret <4 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define <4 x i16> @test_vrev64_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[A]], <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
+//
uint16x4_t test_vrev64_u16(uint16x4_t a) {
return vrev64_u16(a);
}
-// CHECK-LABEL: @test_vrev64_u32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <2 x i32> <i32 1, i32 0>
-// CHECK: ret <2 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define <2 x i32> @test_vrev64_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[A]], <2 x i32> <i32 1, i32 0>
+// CHECK-NEXT: ret <2 x i32> [[SHUFFLE_I]]
+//
uint32x2_t test_vrev64_u32(uint32x2_t a) {
return vrev64_u32(a);
}
-// CHECK-LABEL: @test_vrev64_p8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define <8 x i8> @test_vrev64_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[A]], <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
poly8x8_t test_vrev64_p8(poly8x8_t a) {
return vrev64_p8(a);
}
-// CHECK-LABEL: @test_vrev64_p16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-// CHECK: ret <4 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define <4 x i16> @test_vrev64_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[A]], <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
+//
poly16x4_t test_vrev64_p16(poly16x4_t a) {
return vrev64_p16(a);
}
-// CHECK-LABEL: @test_vrev64_f32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %a, <2 x i32> <i32 1, i32 0>
-// CHECK: ret <2 x float> [[SHUFFLE_I]]
+// CHECK-LABEL: define <2 x float> @test_vrev64_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[A]], <2 x i32> <i32 1, i32 0>
+// CHECK-NEXT: ret <2 x float> [[SHUFFLE_I]]
+//
float32x2_t test_vrev64_f32(float32x2_t a) {
return vrev64_f32(a);
}
-// CHECK-LABEL: @test_vrev64q_s8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define <16 x i8> @test_vrev64q_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[A]], <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
int8x16_t test_vrev64q_s8(int8x16_t a) {
return vrev64q_s8(a);
}
-// CHECK-LABEL: @test_vrev64q_s16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define <8 x i16> @test_vrev64q_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[A]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
int16x8_t test_vrev64q_s16(int16x8_t a) {
return vrev64q_s16(a);
}
-// CHECK-LABEL: @test_vrev64q_s32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-// CHECK: ret <4 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define <4 x i32> @test_vrev64q_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[A]], <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]]
+//
int32x4_t test_vrev64q_s32(int32x4_t a) {
return vrev64q_s32(a);
}
-// CHECK-LABEL: @test_vrev64q_u8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define <16 x i8> @test_vrev64q_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[A]], <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
uint8x16_t test_vrev64q_u8(uint8x16_t a) {
return vrev64q_u8(a);
}
-// CHECK-LABEL: @test_vrev64q_u16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define <8 x i16> @test_vrev64q_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[A]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
uint16x8_t test_vrev64q_u16(uint16x8_t a) {
return vrev64q_u16(a);
}
-// CHECK-LABEL: @test_vrev64q_u32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-// CHECK: ret <4 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define <4 x i32> @test_vrev64q_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[A]], <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]]
+//
uint32x4_t test_vrev64q_u32(uint32x4_t a) {
return vrev64q_u32(a);
}
-// CHECK-LABEL: @test_vrev64q_p8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define <16 x i8> @test_vrev64q_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[A]], <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
poly8x16_t test_vrev64q_p8(poly8x16_t a) {
return vrev64q_p8(a);
}
-// CHECK-LABEL: @test_vrev64q_p16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define <8 x i16> @test_vrev64q_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[A]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
poly16x8_t test_vrev64q_p16(poly16x8_t a) {
return vrev64q_p16(a);
}
-// CHECK-LABEL: @test_vrev64q_f32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-// CHECK: ret <4 x float> [[SHUFFLE_I]]
+// CHECK-LABEL: define <4 x float> @test_vrev64q_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[A]], <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+// CHECK-NEXT: ret <4 x float> [[SHUFFLE_I]]
+//
float32x4_t test_vrev64q_f32(float32x4_t a) {
return vrev64q_f32(a);
}
-// CHECK-LABEL: @test_vrhadd_s8(
-// CHECK: [[VRHADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vrhadds.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VRHADD_V_I]]
+// CHECK-LABEL: define <8 x i8> @test_vrhadd_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRHADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vrhadds.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VRHADD_V_I]]
+//
int8x8_t test_vrhadd_s8(int8x8_t a, int8x8_t b) {
return vrhadd_s8(a, b);
}
-// CHECK-LABEL: @test_vrhadd_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VRHADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrhadds.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VRHADD_V3_I:%.*]] = bitcast <4 x i16> [[VRHADD_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VRHADD_V2_I]]
+// CHECK-LABEL: define <4 x i16> @test_vrhadd_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VRHADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrhadds.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: [[VRHADD_V3_I:%.*]] = bitcast <4 x i16> [[VRHADD_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <4 x i16> [[VRHADD_V2_I]]
+//
int16x4_t test_vrhadd_s16(int16x4_t a, int16x4_t b) {
return vrhadd_s16(a, b);
}
-// CHECK-LABEL: @test_vrhadd_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VRHADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrhadds.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VRHADD_V3_I:%.*]] = bitcast <2 x i32> [[VRHADD_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VRHADD_V2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vrhadd_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VRHADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrhadds.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: [[VRHADD_V3_I:%.*]] = bitcast <2 x i32> [[VRHADD_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <2 x i32> [[VRHADD_V2_I]]
+//
int32x2_t test_vrhadd_s32(int32x2_t a, int32x2_t b) {
return vrhadd_s32(a, b);
}
-// CHECK-LABEL: @test_vrhadd_u8(
-// CHECK: [[VRHADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vrhaddu.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VRHADD_V_I]]
+// CHECK-LABEL: define <8 x i8> @test_vrhadd_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRHADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vrhaddu.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VRHADD_V_I]]
+//
uint8x8_t test_vrhadd_u8(uint8x8_t a, uint8x8_t b) {
return vrhadd_u8(a, b);
}
-// CHECK-LABEL: @test_vrhadd_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VRHADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrhaddu.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VRHADD_V3_I:%.*]] = bitcast <4 x i16> [[VRHADD_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VRHADD_V2_I]]
+// CHECK-LABEL: define <4 x i16> @test_vrhadd_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VRHADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrhaddu.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: [[VRHADD_V3_I:%.*]] = bitcast <4 x i16> [[VRHADD_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <4 x i16> [[VRHADD_V2_I]]
+//
uint16x4_t test_vrhadd_u16(uint16x4_t a, uint16x4_t b) {
return vrhadd_u16(a, b);
}
-// CHECK-LABEL: @test_vrhadd_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VRHADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrhaddu.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VRHADD_V3_I:%.*]] = bitcast <2 x i32> [[VRHADD_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VRHADD_V2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vrhadd_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VRHADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrhaddu.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: [[VRHADD_V3_I:%.*]] = bitcast <2 x i32> [[VRHADD_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <2 x i32> [[VRHADD_V2_I]]
+//
uint32x2_t test_vrhadd_u32(uint32x2_t a, uint32x2_t b) {
return vrhadd_u32(a, b);
}
-// CHECK-LABEL: @test_vrhaddq_s8(
-// CHECK: [[VRHADDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vrhadds.v16i8(<16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <16 x i8> [[VRHADDQ_V_I]]
+// CHECK-LABEL: define <16 x i8> @test_vrhaddq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRHADDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vrhadds.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <16 x i8> [[VRHADDQ_V_I]]
+//
int8x16_t test_vrhaddq_s8(int8x16_t a, int8x16_t b) {
return vrhaddq_s8(a, b);
}
-// CHECK-LABEL: @test_vrhaddq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VRHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: [[VRHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VRHADDQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VRHADDQ_V2_I]]
+// CHECK-LABEL: define <8 x i16> @test_vrhaddq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VRHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: [[VRHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VRHADDQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: ret <8 x i16> [[VRHADDQ_V2_I]]
+//
int16x8_t test_vrhaddq_s16(int16x8_t a, int16x8_t b) {
return vrhaddq_s16(a, b);
}
-// CHECK-LABEL: @test_vrhaddq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VRHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrhadds.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VRHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VRHADDQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VRHADDQ_V2_I]]
+// CHECK-LABEL: define <4 x i32> @test_vrhaddq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VRHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrhadds.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: [[VRHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VRHADDQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: ret <4 x i32> [[VRHADDQ_V2_I]]
+//
int32x4_t test_vrhaddq_s32(int32x4_t a, int32x4_t b) {
return vrhaddq_s32(a, b);
}
-// CHECK-LABEL: @test_vrhaddq_u8(
-// CHECK: [[VRHADDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vrhaddu.v16i8(<16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <16 x i8> [[VRHADDQ_V_I]]
+// CHECK-LABEL: define <16 x i8> @test_vrhaddq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRHADDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vrhaddu.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <16 x i8> [[VRHADDQ_V_I]]
+//
uint8x16_t test_vrhaddq_u8(uint8x16_t a, uint8x16_t b) {
return vrhaddq_u8(a, b);
}
-// CHECK-LABEL: @test_vrhaddq_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VRHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: [[VRHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VRHADDQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VRHADDQ_V2_I]]
+// CHECK-LABEL: define <8 x i16> @test_vrhaddq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VRHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: [[VRHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VRHADDQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: ret <8 x i16> [[VRHADDQ_V2_I]]
+//
uint16x8_t test_vrhaddq_u16(uint16x8_t a, uint16x8_t b) {
return vrhaddq_u16(a, b);
}
-// CHECK-LABEL: @test_vrhaddq_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VRHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrhaddu.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VRHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VRHADDQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VRHADDQ_V2_I]]
+// CHECK-LABEL: define <4 x i32> @test_vrhaddq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VRHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrhaddu.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: [[VRHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VRHADDQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: ret <4 x i32> [[VRHADDQ_V2_I]]
+//
uint32x4_t test_vrhaddq_u32(uint32x4_t a, uint32x4_t b) {
return vrhaddq_u32(a, b);
}
-// CHECK-LABEL: @test_vrshl_s8(
-// CHECK: [[VRSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VRSHL_V_I]]
+// CHECK-LABEL: define <8 x i8> @test_vrshl_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VRSHL_V_I]]
+//
int8x8_t test_vrshl_s8(int8x8_t a, int8x8_t b) {
return vrshl_s8(a, b);
}
-// CHECK-LABEL: @test_vrshl_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VRSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VRSHL_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VRSHL_V2_I]]
+// CHECK-LABEL: define <4 x i16> @test_vrshl_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VRSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: [[VRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VRSHL_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <4 x i16> [[VRSHL_V2_I]]
+//
int16x4_t test_vrshl_s16(int16x4_t a, int16x4_t b) {
return vrshl_s16(a, b);
}
-// CHECK-LABEL: @test_vrshl_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VRSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VRSHL_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VRSHL_V2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vrshl_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VRSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: [[VRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VRSHL_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <2 x i32> [[VRSHL_V2_I]]
+//
int32x2_t test_vrshl_s32(int32x2_t a, int32x2_t b) {
return vrshl_s32(a, b);
}
-// CHECK-LABEL: @test_vrshl_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VRSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> %a, <1 x i64> %b)
-// CHECK: [[VRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VRSHL_V2_I]] to <8 x i8>
-// CHECK: ret <1 x i64> [[VRSHL_V2_I]]
+// CHECK-LABEL: define <1 x i64> @test_vrshl_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VRSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> [[A]], <1 x i64> [[B]])
+// CHECK-NEXT: [[VRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VRSHL_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <1 x i64> [[VRSHL_V2_I]]
+//
int64x1_t test_vrshl_s64(int64x1_t a, int64x1_t b) {
return vrshl_s64(a, b);
}
-// CHECK-LABEL: @test_vrshl_u8(
-// CHECK: [[VRSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VRSHL_V_I]]
+// CHECK-LABEL: define <8 x i8> @test_vrshl_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VRSHL_V_I]]
+//
uint8x8_t test_vrshl_u8(uint8x8_t a, int8x8_t b) {
return vrshl_u8(a, b);
}
-// CHECK-LABEL: @test_vrshl_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VRSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VRSHL_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VRSHL_V2_I]]
+// CHECK-LABEL: define <4 x i16> @test_vrshl_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VRSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: [[VRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VRSHL_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <4 x i16> [[VRSHL_V2_I]]
+//
uint16x4_t test_vrshl_u16(uint16x4_t a, int16x4_t b) {
return vrshl_u16(a, b);
}
-// CHECK-LABEL: @test_vrshl_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VRSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VRSHL_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VRSHL_V2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vrshl_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VRSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: [[VRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VRSHL_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <2 x i32> [[VRSHL_V2_I]]
+//
uint32x2_t test_vrshl_u32(uint32x2_t a, int32x2_t b) {
return vrshl_u32(a, b);
}
-// CHECK-LABEL: @test_vrshl_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VRSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> %a, <1 x i64> %b)
-// CHECK: [[VRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VRSHL_V2_I]] to <8 x i8>
-// CHECK: ret <1 x i64> [[VRSHL_V2_I]]
+// CHECK-LABEL: define <1 x i64> @test_vrshl_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VRSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> [[A]], <1 x i64> [[B]])
+// CHECK-NEXT: [[VRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VRSHL_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <1 x i64> [[VRSHL_V2_I]]
+//
uint64x1_t test_vrshl_u64(uint64x1_t a, int64x1_t b) {
return vrshl_u64(a, b);
}
-// CHECK-LABEL: @test_vrshlq_s8(
-// CHECK: [[VRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <16 x i8> [[VRSHLQ_V_I]]
+// CHECK-LABEL: define <16 x i8> @test_vrshlq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <16 x i8> [[VRSHLQ_V_I]]
+//
int8x16_t test_vrshlq_s8(int8x16_t a, int8x16_t b) {
return vrshlq_s8(a, b);
}
-// CHECK-LABEL: @test_vrshlq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: [[VRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VRSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VRSHLQ_V2_I]]
+// CHECK-LABEL: define <8 x i16> @test_vrshlq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: [[VRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VRSHLQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: ret <8 x i16> [[VRSHLQ_V2_I]]
+//
int16x8_t test_vrshlq_s16(int16x8_t a, int16x8_t b) {
return vrshlq_s16(a, b);
}
-// CHECK-LABEL: @test_vrshlq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VRSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VRSHLQ_V2_I]]
+// CHECK-LABEL: define <4 x i32> @test_vrshlq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: [[VRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VRSHLQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: ret <4 x i32> [[VRSHLQ_V2_I]]
+//
int32x4_t test_vrshlq_s32(int32x4_t a, int32x4_t b) {
return vrshlq_s32(a, b);
}
-// CHECK-LABEL: @test_vrshlq_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64> %a, <2 x i64> %b)
-// CHECK: [[VRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VRSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <2 x i64> [[VRSHLQ_V2_I]]
+// CHECK-LABEL: define <2 x i64> @test_vrshlq_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64> [[A]], <2 x i64> [[B]])
+// CHECK-NEXT: [[VRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VRSHLQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: ret <2 x i64> [[VRSHLQ_V2_I]]
+//
int64x2_t test_vrshlq_s64(int64x2_t a, int64x2_t b) {
return vrshlq_s64(a, b);
}
-// CHECK-LABEL: @test_vrshlq_u8(
-// CHECK: [[VRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <16 x i8> [[VRSHLQ_V_I]]
+// CHECK-LABEL: define <16 x i8> @test_vrshlq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <16 x i8> [[VRSHLQ_V_I]]
+//
uint8x16_t test_vrshlq_u8(uint8x16_t a, int8x16_t b) {
return vrshlq_u8(a, b);
}
-// CHECK-LABEL: @test_vrshlq_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: [[VRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VRSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VRSHLQ_V2_I]]
+// CHECK-LABEL: define <8 x i16> @test_vrshlq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: [[VRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VRSHLQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: ret <8 x i16> [[VRSHLQ_V2_I]]
+//
uint16x8_t test_vrshlq_u16(uint16x8_t a, int16x8_t b) {
return vrshlq_u16(a, b);
}
-// CHECK-LABEL: @test_vrshlq_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VRSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VRSHLQ_V2_I]]
+// CHECK-LABEL: define <4 x i32> @test_vrshlq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: [[VRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VRSHLQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: ret <4 x i32> [[VRSHLQ_V2_I]]
+//
uint32x4_t test_vrshlq_u32(uint32x4_t a, int32x4_t b) {
return vrshlq_u32(a, b);
}
-// CHECK-LABEL: @test_vrshlq_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64> %a, <2 x i64> %b)
-// CHECK: [[VRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VRSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <2 x i64> [[VRSHLQ_V2_I]]
+// CHECK-LABEL: define <2 x i64> @test_vrshlq_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64> [[A]], <2 x i64> [[B]])
+// CHECK-NEXT: [[VRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VRSHLQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: ret <2 x i64> [[VRSHLQ_V2_I]]
+//
uint64x2_t test_vrshlq_u64(uint64x2_t a, int64x2_t b) {
return vrshlq_u64(a, b);
}
-// CHECK-LABEL: @test_vrshrn_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VRSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vrshiftn.v8i8(<8 x i16> [[VRSHRN_N]], <8 x i16> splat (i16 -1))
-// CHECK: ret <8 x i8> [[VRSHRN_N1]]
+// CHECK-LABEL: define <8 x i8> @test_vrshrn_n_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VRSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vrshiftn.v8i8(<8 x i16> [[VRSHRN_N]], <8 x i16> splat (i16 -1))
+// CHECK-NEXT: ret <8 x i8> [[VRSHRN_N1]]
+//
int8x8_t test_vrshrn_n_s16(int16x8_t a) {
return vrshrn_n_s16(a, 1);
}
-// CHECK-LABEL: @test_vrshrn_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VRSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vrshiftn.v4i16(<4 x i32> [[VRSHRN_N]], <4 x i32> splat (i32 -1))
-// CHECK: ret <4 x i16> [[VRSHRN_N1]]
+// CHECK-LABEL: define <4 x i16> @test_vrshrn_n_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VRSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vrshiftn.v4i16(<4 x i32> [[VRSHRN_N]], <4 x i32> splat (i32 -1))
+// CHECK-NEXT: ret <4 x i16> [[VRSHRN_N1]]
+//
int16x4_t test_vrshrn_n_s32(int32x4_t a) {
return vrshrn_n_s32(a, 1);
}
-// CHECK-LABEL: @test_vrshrn_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VRSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vrshiftn.v2i32(<2 x i64> [[VRSHRN_N]], <2 x i64> splat (i64 -1))
-// CHECK: ret <2 x i32> [[VRSHRN_N1]]
+// CHECK-LABEL: define <2 x i32> @test_vrshrn_n_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[VRSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vrshiftn.v2i32(<2 x i64> [[VRSHRN_N]], <2 x i64> splat (i64 -1))
+// CHECK-NEXT: ret <2 x i32> [[VRSHRN_N1]]
+//
int32x2_t test_vrshrn_n_s64(int64x2_t a) {
return vrshrn_n_s64(a, 1);
}
-// CHECK-LABEL: @test_vrshrn_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VRSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vrshiftn.v8i8(<8 x i16> [[VRSHRN_N]], <8 x i16> splat (i16 -1))
-// CHECK: ret <8 x i8> [[VRSHRN_N1]]
+// CHECK-LABEL: define <8 x i8> @test_vrshrn_n_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VRSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vrshiftn.v8i8(<8 x i16> [[VRSHRN_N]], <8 x i16> splat (i16 -1))
+// CHECK-NEXT: ret <8 x i8> [[VRSHRN_N1]]
+//
uint8x8_t test_vrshrn_n_u16(uint16x8_t a) {
return vrshrn_n_u16(a, 1);
}
-// CHECK-LABEL: @test_vrshrn_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VRSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vrshiftn.v4i16(<4 x i32> [[VRSHRN_N]], <4 x i32> splat (i32 -1))
-// CHECK: ret <4 x i16> [[VRSHRN_N1]]
+// CHECK-LABEL: define <4 x i16> @test_vrshrn_n_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VRSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vrshiftn.v4i16(<4 x i32> [[VRSHRN_N]], <4 x i32> splat (i32 -1))
+// CHECK-NEXT: ret <4 x i16> [[VRSHRN_N1]]
+//
uint16x4_t test_vrshrn_n_u32(uint32x4_t a) {
return vrshrn_n_u32(a, 1);
}
-// CHECK-LABEL: @test_vrshrn_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VRSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vrshiftn.v2i32(<2 x i64> [[VRSHRN_N]], <2 x i64> splat (i64 -1))
-// CHECK: ret <2 x i32> [[VRSHRN_N1]]
+// CHECK-LABEL: define <2 x i32> @test_vrshrn_n_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[VRSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vrshiftn.v2i32(<2 x i64> [[VRSHRN_N]], <2 x i64> splat (i64 -1))
+// CHECK-NEXT: ret <2 x i32> [[VRSHRN_N1]]
+//
uint32x2_t test_vrshrn_n_u64(uint64x2_t a) {
return vrshrn_n_u64(a, 1);
}
-// CHECK-LABEL: @test_vrshr_n_s8(
-// CHECK: [[VRSHR_N:%.*]] = call <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8> %a, <8 x i8> splat (i8 -1))
-// CHECK: ret <8 x i8> [[VRSHR_N]]
+// CHECK-LABEL: define <8 x i8> @test_vrshr_n_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHR_N:%.*]] = call <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8> [[A]], <8 x i8> splat (i8 -1))
+// CHECK-NEXT: ret <8 x i8> [[VRSHR_N]]
+//
int8x8_t test_vrshr_n_s8(int8x8_t a) {
return vrshr_n_s8(a, 1);
}
-// CHECK-LABEL: @test_vrshr_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VRSHR_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16> [[VRSHR_N]], <4 x i16> splat (i16 -1))
-// CHECK: ret <4 x i16> [[VRSHR_N1]]
+// CHECK-LABEL: define <4 x i16> @test_vrshr_n_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16> [[VRSHR_N]], <4 x i16> splat (i16 -1))
+// CHECK-NEXT: ret <4 x i16> [[VRSHR_N1]]
+//
int16x4_t test_vrshr_n_s16(int16x4_t a) {
return vrshr_n_s16(a, 1);
}
-// CHECK-LABEL: @test_vrshr_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VRSHR_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32> [[VRSHR_N]], <2 x i32> splat (i32 -1))
-// CHECK: ret <2 x i32> [[VRSHR_N1]]
+// CHECK-LABEL: define <2 x i32> @test_vrshr_n_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32> [[VRSHR_N]], <2 x i32> splat (i32 -1))
+// CHECK-NEXT: ret <2 x i32> [[VRSHR_N1]]
+//
int32x2_t test_vrshr_n_s32(int32x2_t a) {
return vrshr_n_s32(a, 1);
}
-// CHECK-LABEL: @test_vrshr_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VRSHR_N1:%.*]] = call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> [[VRSHR_N]], <1 x i64> splat (i64 -1))
-// CHECK: ret <1 x i64> [[VRSHR_N1]]
+// CHECK-LABEL: define <1 x i64> @test_vrshr_n_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> [[VRSHR_N]], <1 x i64> splat (i64 -1))
+// CHECK-NEXT: ret <1 x i64> [[VRSHR_N1]]
+//
int64x1_t test_vrshr_n_s64(int64x1_t a) {
return vrshr_n_s64(a, 1);
}
-// CHECK-LABEL: @test_vrshr_n_u8(
-// CHECK: [[VRSHR_N:%.*]] = call <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8> %a, <8 x i8> splat (i8 -1))
-// CHECK: ret <8 x i8> [[VRSHR_N]]
+// CHECK-LABEL: define <8 x i8> @test_vrshr_n_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHR_N:%.*]] = call <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8> [[A]], <8 x i8> splat (i8 -1))
+// CHECK-NEXT: ret <8 x i8> [[VRSHR_N]]
+//
uint8x8_t test_vrshr_n_u8(uint8x8_t a) {
return vrshr_n_u8(a, 1);
}
-// CHECK-LABEL: @test_vrshr_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VRSHR_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16> [[VRSHR_N]], <4 x i16> splat (i16 -1))
-// CHECK: ret <4 x i16> [[VRSHR_N1]]
+// CHECK-LABEL: define <4 x i16> @test_vrshr_n_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16> [[VRSHR_N]], <4 x i16> splat (i16 -1))
+// CHECK-NEXT: ret <4 x i16> [[VRSHR_N1]]
+//
uint16x4_t test_vrshr_n_u16(uint16x4_t a) {
return vrshr_n_u16(a, 1);
}
-// CHECK-LABEL: @test_vrshr_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VRSHR_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32> [[VRSHR_N]], <2 x i32> splat (i32 -1))
-// CHECK: ret <2 x i32> [[VRSHR_N1]]
+// CHECK-LABEL: define <2 x i32> @test_vrshr_n_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32> [[VRSHR_N]], <2 x i32> splat (i32 -1))
+// CHECK-NEXT: ret <2 x i32> [[VRSHR_N1]]
+//
uint32x2_t test_vrshr_n_u32(uint32x2_t a) {
return vrshr_n_u32(a, 1);
}
-// CHECK-LABEL: @test_vrshr_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VRSHR_N1:%.*]] = call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> [[VRSHR_N]], <1 x i64> splat (i64 -1))
-// CHECK: ret <1 x i64> [[VRSHR_N1]]
+// CHECK-LABEL: define <1 x i64> @test_vrshr_n_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> [[VRSHR_N]], <1 x i64> splat (i64 -1))
+// CHECK-NEXT: ret <1 x i64> [[VRSHR_N1]]
+//
uint64x1_t test_vrshr_n_u64(uint64x1_t a) {
return vrshr_n_u64(a, 1);
}
-// CHECK-LABEL: @test_vrshrq_n_s8(
-// CHECK: [[VRSHR_N:%.*]] = call <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8> %a, <16 x i8> splat (i8 -1))
-// CHECK: ret <16 x i8> [[VRSHR_N]]
+// CHECK-LABEL: define <16 x i8> @test_vrshrq_n_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHR_N:%.*]] = call <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8> [[A]], <16 x i8> splat (i8 -1))
+// CHECK-NEXT: ret <16 x i8> [[VRSHR_N]]
+//
int8x16_t test_vrshrq_n_s8(int8x16_t a) {
return vrshrq_n_s8(a, 1);
}
-// CHECK-LABEL: @test_vrshrq_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VRSHR_N1:%.*]] = call <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16> [[VRSHR_N]], <8 x i16> splat (i16 -1))
-// CHECK: ret <8 x i16> [[VRSHR_N1]]
+// CHECK-LABEL: define <8 x i16> @test_vrshrq_n_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16> [[VRSHR_N]], <8 x i16> splat (i16 -1))
+// CHECK-NEXT: ret <8 x i16> [[VRSHR_N1]]
+//
int16x8_t test_vrshrq_n_s16(int16x8_t a) {
return vrshrq_n_s16(a, 1);
}
-// CHECK-LABEL: @test_vrshrq_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VRSHR_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32> [[VRSHR_N]], <4 x i32> splat (i32 -1))
-// CHECK: ret <4 x i32> [[VRSHR_N1]]
+// CHECK-LABEL: define <4 x i32> @test_vrshrq_n_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32> [[VRSHR_N]], <4 x i32> splat (i32 -1))
+// CHECK-NEXT: ret <4 x i32> [[VRSHR_N1]]
+//
int32x4_t test_vrshrq_n_s32(int32x4_t a) {
return vrshrq_n_s32(a, 1);
}
-// CHECK-LABEL: @test_vrshrq_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VRSHR_N1:%.*]] = call <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64> [[VRSHR_N]], <2 x i64> splat (i64 -1))
-// CHECK: ret <2 x i64> [[VRSHR_N1]]
+// CHECK-LABEL: define <2 x i64> @test_vrshrq_n_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64> [[VRSHR_N]], <2 x i64> splat (i64 -1))
+// CHECK-NEXT: ret <2 x i64> [[VRSHR_N1]]
+//
int64x2_t test_vrshrq_n_s64(int64x2_t a) {
return vrshrq_n_s64(a, 1);
}
-// CHECK-LABEL: @test_vrshrq_n_u8(
-// CHECK: [[VRSHR_N:%.*]] = call <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8> %a, <16 x i8> splat (i8 -1))
-// CHECK: ret <16 x i8> [[VRSHR_N]]
+// CHECK-LABEL: define <16 x i8> @test_vrshrq_n_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHR_N:%.*]] = call <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8> [[A]], <16 x i8> splat (i8 -1))
+// CHECK-NEXT: ret <16 x i8> [[VRSHR_N]]
+//
uint8x16_t test_vrshrq_n_u8(uint8x16_t a) {
return vrshrq_n_u8(a, 1);
}
-// CHECK-LABEL: @test_vrshrq_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VRSHR_N1:%.*]] = call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> [[VRSHR_N]], <8 x i16> splat (i16 -1))
-// CHECK: ret <8 x i16> [[VRSHR_N1]]
+// CHECK-LABEL: define <8 x i16> @test_vrshrq_n_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> [[VRSHR_N]], <8 x i16> splat (i16 -1))
+// CHECK-NEXT: ret <8 x i16> [[VRSHR_N1]]
+//
uint16x8_t test_vrshrq_n_u16(uint16x8_t a) {
return vrshrq_n_u16(a, 1);
}
-// CHECK-LABEL: @test_vrshrq_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VRSHR_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32> [[VRSHR_N]], <4 x i32> splat (i32 -1))
-// CHECK: ret <4 x i32> [[VRSHR_N1]]
+// CHECK-LABEL: define <4 x i32> @test_vrshrq_n_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32> [[VRSHR_N]], <4 x i32> splat (i32 -1))
+// CHECK-NEXT: ret <4 x i32> [[VRSHR_N1]]
+//
uint32x4_t test_vrshrq_n_u32(uint32x4_t a) {
return vrshrq_n_u32(a, 1);
}
-// CHECK-LABEL: @test_vrshrq_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VRSHR_N1:%.*]] = call <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64> [[VRSHR_N]], <2 x i64> splat (i64 -1))
-// CHECK: ret <2 x i64> [[VRSHR_N1]]
+// CHECK-LABEL: define <2 x i64> @test_vrshrq_n_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64> [[VRSHR_N]], <2 x i64> splat (i64 -1))
+// CHECK-NEXT: ret <2 x i64> [[VRSHR_N1]]
+//
uint64x2_t test_vrshrq_n_u64(uint64x2_t a) {
return vrshrq_n_u64(a, 1);
}
-// CHECK-LABEL: @test_vrsqrte_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[VRSQRTE_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrsqrte.v2f32(<2 x float> %a)
-// CHECK: ret <2 x float> [[VRSQRTE_V1_I]]
+// CHECK-LABEL: define <2 x float> @test_vrsqrte_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VRSQRTE_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrsqrte.v2f32(<2 x float> [[A]])
+// CHECK-NEXT: ret <2 x float> [[VRSQRTE_V1_I]]
+//
float32x2_t test_vrsqrte_f32(float32x2_t a) {
return vrsqrte_f32(a);
}
-// CHECK-LABEL: @test_vrsqrte_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VRSQRTE_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrsqrte.v2i32(<2 x i32> %a)
-// CHECK: ret <2 x i32> [[VRSQRTE_V1_I]]
+// CHECK-LABEL: define <2 x i32> @test_vrsqrte_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VRSQRTE_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrsqrte.v2i32(<2 x i32> [[A]])
+// CHECK-NEXT: ret <2 x i32> [[VRSQRTE_V1_I]]
+//
uint32x2_t test_vrsqrte_u32(uint32x2_t a) {
return vrsqrte_u32(a);
}
-// CHECK-LABEL: @test_vrsqrteq_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[VRSQRTEQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float> %a)
-// CHECK: ret <4 x float> [[VRSQRTEQ_V1_I]]
+// CHECK-LABEL: define <4 x float> @test_vrsqrteq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VRSQRTEQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float> [[A]])
+// CHECK-NEXT: ret <4 x float> [[VRSQRTEQ_V1_I]]
+//
float32x4_t test_vrsqrteq_f32(float32x4_t a) {
return vrsqrteq_f32(a);
}
-// CHECK-LABEL: @test_vrsqrteq_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VRSQRTEQ_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrsqrte.v4i32(<4 x i32> %a)
-// CHECK: ret <4 x i32> [[VRSQRTEQ_V1_I]]
+// CHECK-LABEL: define <4 x i32> @test_vrsqrteq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VRSQRTEQ_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrsqrte.v4i32(<4 x i32> [[A]])
+// CHECK-NEXT: ret <4 x i32> [[VRSQRTEQ_V1_I]]
+//
uint32x4_t test_vrsqrteq_u32(uint32x4_t a) {
return vrsqrteq_u32(a);
}
-// CHECK-LABEL: @test_vrsqrts_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
-// CHECK: [[VRSQRTS_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vrsqrts.v2f32(<2 x float> %a, <2 x float> %b)
-// CHECK: [[VRSQRTS_V3_I:%.*]] = bitcast <2 x float> [[VRSQRTS_V2_I]] to <8 x i8>
-// CHECK: ret <2 x float> [[VRSQRTS_V2_I]]
+// CHECK-LABEL: define <2 x float> @test_vrsqrts_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VRSQRTS_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vrsqrts.v2f32(<2 x float> [[A]], <2 x float> [[B]])
+// CHECK-NEXT: [[VRSQRTS_V3_I:%.*]] = bitcast <2 x float> [[VRSQRTS_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <2 x float> [[VRSQRTS_V2_I]]
+//
float32x2_t test_vrsqrts_f32(float32x2_t a, float32x2_t b) {
return vrsqrts_f32(a, b);
}
-// CHECK-LABEL: @test_vrsqrtsq_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
-// CHECK: [[VRSQRTSQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float> %a, <4 x float> %b)
-// CHECK: [[VRSQRTSQ_V3_I:%.*]] = bitcast <4 x float> [[VRSQRTSQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x float> [[VRSQRTSQ_V2_I]]
+// CHECK-LABEL: define <4 x float> @test_vrsqrtsq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VRSQRTSQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float> [[A]], <4 x float> [[B]])
+// CHECK-NEXT: [[VRSQRTSQ_V3_I:%.*]] = bitcast <4 x float> [[VRSQRTSQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: ret <4 x float> [[VRSQRTSQ_V2_I]]
+//
float32x4_t test_vrsqrtsq_f32(float32x4_t a, float32x4_t b) {
return vrsqrtsq_f32(a, b);
}
-// CHECK-LABEL: @test_vrsra_n_s8(
-// CHECK: [[TMP0:%.*]] = call <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8> %b, <8 x i8> splat (i8 -1))
-// CHECK: [[VRSRA_N:%.*]] = add <8 x i8> %a, [[TMP0]]
-// CHECK: ret <8 x i8> [[VRSRA_N]]
+// CHECK-LABEL: define <8 x i8> @test_vrsra_n_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = call <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8> [[B]], <8 x i8> splat (i8 -1))
+// CHECK-NEXT: [[VRSRA_N:%.*]] = add <8 x i8> [[A]], [[TMP0]]
+// CHECK-NEXT: ret <8 x i8> [[VRSRA_N]]
+//
int8x8_t test_vrsra_n_s8(int8x8_t a, int8x8_t b) {
return vrsra_n_s8(a, b, 1);
}
-// CHECK-LABEL: @test_vrsra_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[TMP4:%.*]] = call <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16> [[TMP3]], <4 x i16> splat (i16 -1))
-// CHECK: [[VRSRA_N:%.*]] = add <4 x i16> [[TMP2]], [[TMP4]]
-// CHECK: ret <4 x i16> [[VRSRA_N]]
+// CHECK-LABEL: define <4 x i16> @test_vrsra_n_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[TMP4:%.*]] = call <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16> [[TMP3]], <4 x i16> splat (i16 -1))
+// CHECK-NEXT: [[VRSRA_N:%.*]] = add <4 x i16> [[TMP2]], [[TMP4]]
+// CHECK-NEXT: ret <4 x i16> [[VRSRA_N]]
+//
int16x4_t test_vrsra_n_s16(int16x4_t a, int16x4_t b) {
return vrsra_n_s16(a, b, 1);
}
-// CHECK-LABEL: @test_vrsra_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[TMP4:%.*]] = call <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32> [[TMP3]], <2 x i32> splat (i32 -1))
-// CHECK: [[VRSRA_N:%.*]] = add <2 x i32> [[TMP2]], [[TMP4]]
-// CHECK: ret <2 x i32> [[VRSRA_N]]
+// CHECK-LABEL: define <2 x i32> @test_vrsra_n_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[TMP4:%.*]] = call <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32> [[TMP3]], <2 x i32> splat (i32 -1))
+// CHECK-NEXT: [[VRSRA_N:%.*]] = add <2 x i32> [[TMP2]], [[TMP4]]
+// CHECK-NEXT: ret <2 x i32> [[VRSRA_N]]
+//
int32x2_t test_vrsra_n_s32(int32x2_t a, int32x2_t b) {
return vrsra_n_s32(a, b, 1);
}
-// CHECK-LABEL: @test_vrsra_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[TMP4:%.*]] = call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> [[TMP3]], <1 x i64> splat (i64 -1))
-// CHECK: [[VRSRA_N:%.*]] = add <1 x i64> [[TMP2]], [[TMP4]]
-// CHECK: ret <1 x i64> [[VRSRA_N]]
+// CHECK-LABEL: define <1 x i64> @test_vrsra_n_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK-NEXT: [[TMP4:%.*]] = call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> [[TMP3]], <1 x i64> splat (i64 -1))
+// CHECK-NEXT: [[VRSRA_N:%.*]] = add <1 x i64> [[TMP2]], [[TMP4]]
+// CHECK-NEXT: ret <1 x i64> [[VRSRA_N]]
+//
int64x1_t test_vrsra_n_s64(int64x1_t a, int64x1_t b) {
return vrsra_n_s64(a, b, 1);
}
-// CHECK-LABEL: @test_vrsra_n_u8(
-// CHECK: [[TMP0:%.*]] = call <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8> %b, <8 x i8> splat (i8 -1))
-// CHECK: [[VRSRA_N:%.*]] = add <8 x i8> %a, [[TMP0]]
-// CHECK: ret <8 x i8> [[VRSRA_N]]
+// CHECK-LABEL: define <8 x i8> @test_vrsra_n_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = call <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8> [[B]], <8 x i8> splat (i8 -1))
+// CHECK-NEXT: [[VRSRA_N:%.*]] = add <8 x i8> [[A]], [[TMP0]]
+// CHECK-NEXT: ret <8 x i8> [[VRSRA_N]]
+//
uint8x8_t test_vrsra_n_u8(uint8x8_t a, uint8x8_t b) {
return vrsra_n_u8(a, b, 1);
}
-// CHECK-LABEL: @test_vrsra_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[TMP4:%.*]] = call <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16> [[TMP3]], <4 x i16> splat (i16 -1))
-// CHECK: [[VRSRA_N:%.*]] = add <4 x i16> [[TMP2]], [[TMP4]]
-// CHECK: ret <4 x i16> [[VRSRA_N]]
+// CHECK-LABEL: define <4 x i16> @test_vrsra_n_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[TMP4:%.*]] = call <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16> [[TMP3]], <4 x i16> splat (i16 -1))
+// CHECK-NEXT: [[VRSRA_N:%.*]] = add <4 x i16> [[TMP2]], [[TMP4]]
+// CHECK-NEXT: ret <4 x i16> [[VRSRA_N]]
+//
uint16x4_t test_vrsra_n_u16(uint16x4_t a, uint16x4_t b) {
return vrsra_n_u16(a, b, 1);
}
-// CHECK-LABEL: @test_vrsra_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[TMP4:%.*]] = call <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32> [[TMP3]], <2 x i32> splat (i32 -1))
-// CHECK: [[VRSRA_N:%.*]] = add <2 x i32> [[TMP2]], [[TMP4]]
-// CHECK: ret <2 x i32> [[VRSRA_N]]
+// CHECK-LABEL: define <2 x i32> @test_vrsra_n_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[TMP4:%.*]] = call <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32> [[TMP3]], <2 x i32> splat (i32 -1))
+// CHECK-NEXT: [[VRSRA_N:%.*]] = add <2 x i32> [[TMP2]], [[TMP4]]
+// CHECK-NEXT: ret <2 x i32> [[VRSRA_N]]
+//
uint32x2_t test_vrsra_n_u32(uint32x2_t a, uint32x2_t b) {
return vrsra_n_u32(a, b, 1);
}
-// CHECK-LABEL: @test_vrsra_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[TMP4:%.*]] = call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> [[TMP3]], <1 x i64> splat (i64 -1))
-// CHECK: [[VRSRA_N:%.*]] = add <1 x i64> [[TMP2]], [[TMP4]]
-// CHECK: ret <1 x i64> [[VRSRA_N]]
+// CHECK-LABEL: define <1 x i64> @test_vrsra_n_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK-NEXT: [[TMP4:%.*]] = call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> [[TMP3]], <1 x i64> splat (i64 -1))
+// CHECK-NEXT: [[VRSRA_N:%.*]] = add <1 x i64> [[TMP2]], [[TMP4]]
+// CHECK-NEXT: ret <1 x i64> [[VRSRA_N]]
+//
uint64x1_t test_vrsra_n_u64(uint64x1_t a, uint64x1_t b) {
return vrsra_n_u64(a, b, 1);
}
-// CHECK-LABEL: @test_vrsraq_n_s8(
-// CHECK: [[TMP0:%.*]] = call <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8> %b, <16 x i8> splat (i8 -1))
-// CHECK: [[VRSRA_N:%.*]] = add <16 x i8> %a, [[TMP0]]
-// CHECK: ret <16 x i8> [[VRSRA_N]]
+// CHECK-LABEL: define <16 x i8> @test_vrsraq_n_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = call <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8> [[B]], <16 x i8> splat (i8 -1))
+// CHECK-NEXT: [[VRSRA_N:%.*]] = add <16 x i8> [[A]], [[TMP0]]
+// CHECK-NEXT: ret <16 x i8> [[VRSRA_N]]
+//
int8x16_t test_vrsraq_n_s8(int8x16_t a, int8x16_t b) {
return vrsraq_n_s8(a, b, 1);
}
-// CHECK-LABEL: @test_vrsraq_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[TMP4:%.*]] = call <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16> [[TMP3]], <8 x i16> splat (i16 -1))
-// CHECK: [[VRSRA_N:%.*]] = add <8 x i16> [[TMP2]], [[TMP4]]
-// CHECK: ret <8 x i16> [[VRSRA_N]]
+// CHECK-LABEL: define <8 x i16> @test_vrsraq_n_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[TMP4:%.*]] = call <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16> [[TMP3]], <8 x i16> splat (i16 -1))
+// CHECK-NEXT: [[VRSRA_N:%.*]] = add <8 x i16> [[TMP2]], [[TMP4]]
+// CHECK-NEXT: ret <8 x i16> [[VRSRA_N]]
+//
int16x8_t test_vrsraq_n_s16(int16x8_t a, int16x8_t b) {
return vrsraq_n_s16(a, b, 1);
}
-// CHECK-LABEL: @test_vrsraq_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[TMP4:%.*]] = call <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32> [[TMP3]], <4 x i32> splat (i32 -1))
-// CHECK: [[VRSRA_N:%.*]] = add <4 x i32> [[TMP2]], [[TMP4]]
-// CHECK: ret <4 x i32> [[VRSRA_N]]
+// CHECK-LABEL: define <4 x i32> @test_vrsraq_n_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32> [[TMP3]], <4 x i32> splat (i32 -1))
+// CHECK-NEXT: [[VRSRA_N:%.*]] = add <4 x i32> [[TMP2]], [[TMP4]]
+// CHECK-NEXT: ret <4 x i32> [[VRSRA_N]]
+//
int32x4_t test_vrsraq_n_s32(int32x4_t a, int32x4_t b) {
return vrsraq_n_s32(a, b, 1);
}
-// CHECK-LABEL: @test_vrsraq_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[TMP4:%.*]] = call <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64> [[TMP3]], <2 x i64> splat (i64 -1))
-// CHECK: [[VRSRA_N:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]]
-// CHECK: ret <2 x i64> [[VRSRA_N]]
+// CHECK-LABEL: define <2 x i64> @test_vrsraq_n_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK-NEXT: [[TMP4:%.*]] = call <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64> [[TMP3]], <2 x i64> splat (i64 -1))
+// CHECK-NEXT: [[VRSRA_N:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]]
+// CHECK-NEXT: ret <2 x i64> [[VRSRA_N]]
+//
int64x2_t test_vrsraq_n_s64(int64x2_t a, int64x2_t b) {
return vrsraq_n_s64(a, b, 1);
}
-// CHECK-LABEL: @test_vrsraq_n_u8(
-// CHECK: [[TMP0:%.*]] = call <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8> %b, <16 x i8> splat (i8 -1))
-// CHECK: [[VRSRA_N:%.*]] = add <16 x i8> %a, [[TMP0]]
-// CHECK: ret <16 x i8> [[VRSRA_N]]
+// CHECK-LABEL: define <16 x i8> @test_vrsraq_n_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = call <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8> [[B]], <16 x i8> splat (i8 -1))
+// CHECK-NEXT: [[VRSRA_N:%.*]] = add <16 x i8> [[A]], [[TMP0]]
+// CHECK-NEXT: ret <16 x i8> [[VRSRA_N]]
+//
uint8x16_t test_vrsraq_n_u8(uint8x16_t a, uint8x16_t b) {
return vrsraq_n_u8(a, b, 1);
}
-// CHECK-LABEL: @test_vrsraq_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[TMP4:%.*]] = call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> [[TMP3]], <8 x i16> splat (i16 -1))
-// CHECK: [[VRSRA_N:%.*]] = add <8 x i16> [[TMP2]], [[TMP4]]
-// CHECK: ret <8 x i16> [[VRSRA_N]]
+// CHECK-LABEL: define <8 x i16> @test_vrsraq_n_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[TMP4:%.*]] = call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> [[TMP3]], <8 x i16> splat (i16 -1))
+// CHECK-NEXT: [[VRSRA_N:%.*]] = add <8 x i16> [[TMP2]], [[TMP4]]
+// CHECK-NEXT: ret <8 x i16> [[VRSRA_N]]
+//
uint16x8_t test_vrsraq_n_u16(uint16x8_t a, uint16x8_t b) {
return vrsraq_n_u16(a, b, 1);
}
-// CHECK-LABEL: @test_vrsraq_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[TMP4:%.*]] = call <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32> [[TMP3]], <4 x i32> splat (i32 -1))
-// CHECK: [[VRSRA_N:%.*]] = add <4 x i32> [[TMP2]], [[TMP4]]
-// CHECK: ret <4 x i32> [[VRSRA_N]]
+// CHECK-LABEL: define <4 x i32> @test_vrsraq_n_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32> [[TMP3]], <4 x i32> splat (i32 -1))
+// CHECK-NEXT: [[VRSRA_N:%.*]] = add <4 x i32> [[TMP2]], [[TMP4]]
+// CHECK-NEXT: ret <4 x i32> [[VRSRA_N]]
+//
uint32x4_t test_vrsraq_n_u32(uint32x4_t a, uint32x4_t b) {
return vrsraq_n_u32(a, b, 1);
}
-// CHECK-LABEL: @test_vrsraq_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[TMP4:%.*]] = call <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64> [[TMP3]], <2 x i64> splat (i64 -1))
-// CHECK: [[VRSRA_N:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]]
-// CHECK: ret <2 x i64> [[VRSRA_N]]
+// CHECK-LABEL: define <2 x i64> @test_vrsraq_n_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK-NEXT: [[TMP4:%.*]] = call <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64> [[TMP3]], <2 x i64> splat (i64 -1))
+// CHECK-NEXT: [[VRSRA_N:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]]
+// CHECK-NEXT: ret <2 x i64> [[VRSRA_N]]
+//
uint64x2_t test_vrsraq_n_u64(uint64x2_t a, uint64x2_t b) {
return vrsraq_n_u64(a, b, 1);
}
-// CHECK-LABEL: @test_vrsubhn_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VRSUBHN_V2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vrsubhn.v8i8(<8 x i16> %a, <8 x i16> %b)
-// CHECK: ret <8 x i8> [[VRSUBHN_V2_I]]
+// CHECK-LABEL: define <8 x i8> @test_vrsubhn_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VRSUBHN_V2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vrsubhn.v8i8(<8 x i16> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VRSUBHN_V2_I]]
+//
int8x8_t test_vrsubhn_s16(int16x8_t a, int16x8_t b) {
return vrsubhn_s16(a, b);
}
-// CHECK-LABEL: @test_vrsubhn_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VRSUBHN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrsubhn.v4i16(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VRSUBHN_V3_I:%.*]] = bitcast <4 x i16> [[VRSUBHN_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VRSUBHN_V2_I]]
+// CHECK-LABEL: define <4 x i16> @test_vrsubhn_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VRSUBHN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrsubhn.v4i16(<4 x i32> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: [[VRSUBHN_V3_I:%.*]] = bitcast <4 x i16> [[VRSUBHN_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <4 x i16> [[VRSUBHN_V2_I]]
+//
int16x4_t test_vrsubhn_s32(int32x4_t a, int32x4_t b) {
return vrsubhn_s32(a, b);
}
-// CHECK-LABEL: @test_vrsubhn_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VRSUBHN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrsubhn.v2i32(<2 x i64> %a, <2 x i64> %b)
-// CHECK: [[VRSUBHN_V3_I:%.*]] = bitcast <2 x i32> [[VRSUBHN_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VRSUBHN_V2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vrsubhn_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VRSUBHN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrsubhn.v2i32(<2 x i64> [[A]], <2 x i64> [[B]])
+// CHECK-NEXT: [[VRSUBHN_V3_I:%.*]] = bitcast <2 x i32> [[VRSUBHN_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <2 x i32> [[VRSUBHN_V2_I]]
+//
int32x2_t test_vrsubhn_s64(int64x2_t a, int64x2_t b) {
return vrsubhn_s64(a, b);
}
-// CHECK-LABEL: @test_vrsubhn_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VRSUBHN_V2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vrsubhn.v8i8(<8 x i16> %a, <8 x i16> %b)
-// CHECK: ret <8 x i8> [[VRSUBHN_V2_I]]
+// CHECK-LABEL: define <8 x i8> @test_vrsubhn_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VRSUBHN_V2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vrsubhn.v8i8(<8 x i16> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VRSUBHN_V2_I]]
+//
uint8x8_t test_vrsubhn_u16(uint16x8_t a, uint16x8_t b) {
return vrsubhn_u16(a, b);
}
-// CHECK-LABEL: @test_vrsubhn_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VRSUBHN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrsubhn.v4i16(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VRSUBHN_V3_I:%.*]] = bitcast <4 x i16> [[VRSUBHN_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VRSUBHN_V2_I]]
+// CHECK-LABEL: define <4 x i16> @test_vrsubhn_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VRSUBHN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrsubhn.v4i16(<4 x i32> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: [[VRSUBHN_V3_I:%.*]] = bitcast <4 x i16> [[VRSUBHN_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <4 x i16> [[VRSUBHN_V2_I]]
+//
uint16x4_t test_vrsubhn_u32(uint32x4_t a, uint32x4_t b) {
return vrsubhn_u32(a, b);
}
-// CHECK-LABEL: @test_vrsubhn_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VRSUBHN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrsubhn.v2i32(<2 x i64> %a, <2 x i64> %b)
-// CHECK: [[VRSUBHN_V3_I:%.*]] = bitcast <2 x i32> [[VRSUBHN_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VRSUBHN_V2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vrsubhn_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VRSUBHN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrsubhn.v2i32(<2 x i64> [[A]], <2 x i64> [[B]])
+// CHECK-NEXT: [[VRSUBHN_V3_I:%.*]] = bitcast <2 x i32> [[VRSUBHN_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <2 x i32> [[VRSUBHN_V2_I]]
+//
uint32x2_t test_vrsubhn_u64(uint64x2_t a, uint64x2_t b) {
return vrsubhn_u64(a, b);
}
-// CHECK-LABEL: @test_vset_lane_u8(
-// CHECK: [[VSET_LANE:%.*]] = insertelement <8 x i8> %b, i8 %a, i32 7
-// CHECK: ret <8 x i8> [[VSET_LANE]]
+// CHECK-LABEL: define <8 x i8> @test_vset_lane_u8(
+// CHECK-SAME: i8 noundef zeroext [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <8 x i8> [[B]], i8 [[A]], i32 7
+// CHECK-NEXT: ret <8 x i8> [[VSET_LANE]]
+//
uint8x8_t test_vset_lane_u8(uint8_t a, uint8x8_t b) {
return vset_lane_u8(a, b, 7);
}
-// CHECK-LABEL: @test_vset_lane_u16(
-// CHECK: [[VSET_LANE:%.*]] = insertelement <4 x i16> %b, i16 %a, i32 3
-// CHECK: ret <4 x i16> [[VSET_LANE]]
+// CHECK-LABEL: define <4 x i16> @test_vset_lane_u16(
+// CHECK-SAME: i16 noundef zeroext [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <4 x i16> [[B]], i16 [[A]], i32 3
+// CHECK-NEXT: ret <4 x i16> [[VSET_LANE]]
+//
uint16x4_t test_vset_lane_u16(uint16_t a, uint16x4_t b) {
return vset_lane_u16(a, b, 3);
}
-// CHECK-LABEL: @test_vset_lane_u32(
-// CHECK: [[VSET_LANE:%.*]] = insertelement <2 x i32> %b, i32 %a, i32 1
-// CHECK: ret <2 x i32> [[VSET_LANE]]
+// CHECK-LABEL: define <2 x i32> @test_vset_lane_u32(
+// CHECK-SAME: i32 noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <2 x i32> [[B]], i32 [[A]], i32 1
+// CHECK-NEXT: ret <2 x i32> [[VSET_LANE]]
+//
uint32x2_t test_vset_lane_u32(uint32_t a, uint32x2_t b) {
return vset_lane_u32(a, b, 1);
}
-// CHECK-LABEL: @test_vset_lane_s8(
-// CHECK: [[VSET_LANE:%.*]] = insertelement <8 x i8> %b, i8 %a, i32 7
-// CHECK: ret <8 x i8> [[VSET_LANE]]
+// CHECK-LABEL: define <8 x i8> @test_vset_lane_s8(
+// CHECK-SAME: i8 noundef signext [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <8 x i8> [[B]], i8 [[A]], i32 7
+// CHECK-NEXT: ret <8 x i8> [[VSET_LANE]]
+//
int8x8_t test_vset_lane_s8(int8_t a, int8x8_t b) {
return vset_lane_s8(a, b, 7);
}
-// CHECK-LABEL: @test_vset_lane_s16(
-// CHECK: [[VSET_LANE:%.*]] = insertelement <4 x i16> %b, i16 %a, i32 3
-// CHECK: ret <4 x i16> [[VSET_LANE]]
+// CHECK-LABEL: define <4 x i16> @test_vset_lane_s16(
+// CHECK-SAME: i16 noundef signext [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <4 x i16> [[B]], i16 [[A]], i32 3
+// CHECK-NEXT: ret <4 x i16> [[VSET_LANE]]
+//
int16x4_t test_vset_lane_s16(int16_t a, int16x4_t b) {
return vset_lane_s16(a, b, 3);
}
-// CHECK-LABEL: @test_vset_lane_s32(
-// CHECK: [[VSET_LANE:%.*]] = insertelement <2 x i32> %b, i32 %a, i32 1
-// CHECK: ret <2 x i32> [[VSET_LANE]]
+// CHECK-LABEL: define <2 x i32> @test_vset_lane_s32(
+// CHECK-SAME: i32 noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <2 x i32> [[B]], i32 [[A]], i32 1
+// CHECK-NEXT: ret <2 x i32> [[VSET_LANE]]
+//
int32x2_t test_vset_lane_s32(int32_t a, int32x2_t b) {
return vset_lane_s32(a, b, 1);
}
-// CHECK-LABEL: @test_vset_lane_p8(
-// CHECK: [[VSET_LANE:%.*]] = insertelement <8 x i8> %b, i8 %a, i32 7
-// CHECK: ret <8 x i8> [[VSET_LANE]]
+// CHECK-LABEL: define <8 x i8> @test_vset_lane_p8(
+// CHECK-SAME: i8 noundef signext [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <8 x i8> [[B]], i8 [[A]], i32 7
+// CHECK-NEXT: ret <8 x i8> [[VSET_LANE]]
+//
poly8x8_t test_vset_lane_p8(poly8_t a, poly8x8_t b) {
return vset_lane_p8(a, b, 7);
}
-// CHECK-LABEL: @test_vset_lane_p16(
-// CHECK: [[VSET_LANE:%.*]] = insertelement <4 x i16> %b, i16 %a, i32 3
-// CHECK: ret <4 x i16> [[VSET_LANE]]
+// CHECK-LABEL: define <4 x i16> @test_vset_lane_p16(
+// CHECK-SAME: i16 noundef signext [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <4 x i16> [[B]], i16 [[A]], i32 3
+// CHECK-NEXT: ret <4 x i16> [[VSET_LANE]]
+//
poly16x4_t test_vset_lane_p16(poly16_t a, poly16x4_t b) {
return vset_lane_p16(a, b, 3);
}
-// CHECK-LABEL: @test_vset_lane_f32(
-// CHECK: [[VSET_LANE:%.*]] = insertelement <2 x float> %b, float %a, i32 1
-// CHECK: ret <2 x float> [[VSET_LANE]]
+// CHECK-LABEL: define <2 x float> @test_vset_lane_f32(
+// CHECK-SAME: float noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <2 x float> [[B]], float [[A]], i32 1
+// CHECK-NEXT: ret <2 x float> [[VSET_LANE]]
+//
float32x2_t test_vset_lane_f32(float32_t a, float32x2_t b) {
return vset_lane_f32(a, b, 1);
}
-// CHECK-LABEL: @test_vset_lane_f16(
-// CHECK: [[__REINT_246:%.*]] = alloca half, align 2
-// CHECK: [[__REINT1_246:%.*]] = alloca <4 x half>, align 8
-// CHECK: [[__REINT2_246:%.*]] = alloca <4 x i16>, align 8
-// CHECK: [[TMP0:%.*]] = load half, ptr %a, align 2
-// CHECK: store half [[TMP0]], ptr [[__REINT_246]], align 2
-// CHECK: store <4 x half> %b, ptr [[__REINT1_246]], align 8
-// CHECK: [[TMP2:%.*]] = load i16, ptr [[__REINT_246]], align 2
-// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[__REINT1_246]], align 8
-// CHECK: [[VSET_LANE:%.*]] = insertelement <4 x i16> [[TMP4]], i16 [[TMP2]], i32 1
-// CHECK: store <4 x i16> [[VSET_LANE]], ptr [[__REINT2_246]], align 8
-// CHECK: [[TMP8:%.*]] = load <4 x half>, ptr [[__REINT2_246]], align 8
-// CHECK: ret <4 x half> [[TMP8]]
+// CHECK-LABEL: define <4 x half> @test_vset_lane_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__REINT_853:%.*]] = alloca half, align 2
+// CHECK-NEXT: [[__REINT1_853:%.*]] = alloca <4 x half>, align 8
+// CHECK-NEXT: [[__REINT2_853:%.*]] = alloca <4 x i16>, align 8
+// CHECK-NEXT: [[TMP0:%.*]] = load half, ptr [[A]], align 2
+// CHECK-NEXT: store half [[TMP0]], ptr [[__REINT_853]], align 2
+// CHECK-NEXT: store <4 x half> [[B]], ptr [[__REINT1_853]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr [[__REINT_853]], align 2
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[__REINT1_853]], align 8
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP1]], i32 1
+// CHECK-NEXT: store <4 x i16> [[VSET_LANE]], ptr [[__REINT2_853]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = load <4 x half>, ptr [[__REINT2_853]], align 8
+// CHECK-NEXT: ret <4 x half> [[TMP3]]
+//
float16x4_t test_vset_lane_f16(float16_t *a, float16x4_t b) {
return vset_lane_f16(*a, b, 1);
}
-// CHECK-LABEL: @test_vsetq_lane_u8(
-// CHECK: [[VSET_LANE:%.*]] = insertelement <16 x i8> %b, i8 %a, i32 15
-// CHECK: ret <16 x i8> [[VSET_LANE]]
+// CHECK-LABEL: define <16 x i8> @test_vsetq_lane_u8(
+// CHECK-SAME: i8 noundef zeroext [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <16 x i8> [[B]], i8 [[A]], i32 15
+// CHECK-NEXT: ret <16 x i8> [[VSET_LANE]]
+//
uint8x16_t test_vsetq_lane_u8(uint8_t a, uint8x16_t b) {
return vsetq_lane_u8(a, b, 15);
}
-// CHECK-LABEL: @test_vsetq_lane_u16(
-// CHECK: [[VSET_LANE:%.*]] = insertelement <8 x i16> %b, i16 %a, i32 7
-// CHECK: ret <8 x i16> [[VSET_LANE]]
+// CHECK-LABEL: define <8 x i16> @test_vsetq_lane_u16(
+// CHECK-SAME: i16 noundef zeroext [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <8 x i16> [[B]], i16 [[A]], i32 7
+// CHECK-NEXT: ret <8 x i16> [[VSET_LANE]]
+//
uint16x8_t test_vsetq_lane_u16(uint16_t a, uint16x8_t b) {
return vsetq_lane_u16(a, b, 7);
}
-// CHECK-LABEL: @test_vsetq_lane_u32(
-// CHECK: [[VSET_LANE:%.*]] = insertelement <4 x i32> %b, i32 %a, i32 3
-// CHECK: ret <4 x i32> [[VSET_LANE]]
+// CHECK-LABEL: define <4 x i32> @test_vsetq_lane_u32(
+// CHECK-SAME: i32 noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <4 x i32> [[B]], i32 [[A]], i32 3
+// CHECK-NEXT: ret <4 x i32> [[VSET_LANE]]
+//
uint32x4_t test_vsetq_lane_u32(uint32_t a, uint32x4_t b) {
return vsetq_lane_u32(a, b, 3);
}
-// CHECK-LABEL: @test_vsetq_lane_s8(
-// CHECK: [[VSET_LANE:%.*]] = insertelement <16 x i8> %b, i8 %a, i32 15
-// CHECK: ret <16 x i8> [[VSET_LANE]]
+// CHECK-LABEL: define <16 x i8> @test_vsetq_lane_s8(
+// CHECK-SAME: i8 noundef signext [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <16 x i8> [[B]], i8 [[A]], i32 15
+// CHECK-NEXT: ret <16 x i8> [[VSET_LANE]]
+//
int8x16_t test_vsetq_lane_s8(int8_t a, int8x16_t b) {
return vsetq_lane_s8(a, b, 15);
}
-// CHECK-LABEL: @test_vsetq_lane_s16(
-// CHECK: [[VSET_LANE:%.*]] = insertelement <8 x i16> %b, i16 %a, i32 7
-// CHECK: ret <8 x i16> [[VSET_LANE]]
+// CHECK-LABEL: define <8 x i16> @test_vsetq_lane_s16(
+// CHECK-SAME: i16 noundef signext [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <8 x i16> [[B]], i16 [[A]], i32 7
+// CHECK-NEXT: ret <8 x i16> [[VSET_LANE]]
+//
int16x8_t test_vsetq_lane_s16(int16_t a, int16x8_t b) {
return vsetq_lane_s16(a, b, 7);
}
-// CHECK-LABEL: @test_vsetq_lane_s32(
-// CHECK: [[VSET_LANE:%.*]] = insertelement <4 x i32> %b, i32 %a, i32 3
-// CHECK: ret <4 x i32> [[VSET_LANE]]
+// CHECK-LABEL: define <4 x i32> @test_vsetq_lane_s32(
+// CHECK-SAME: i32 noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <4 x i32> [[B]], i32 [[A]], i32 3
+// CHECK-NEXT: ret <4 x i32> [[VSET_LANE]]
+//
int32x4_t test_vsetq_lane_s32(int32_t a, int32x4_t b) {
return vsetq_lane_s32(a, b, 3);
}
-// CHECK-LABEL: @test_vsetq_lane_p8(
-// CHECK: [[VSET_LANE:%.*]] = insertelement <16 x i8> %b, i8 %a, i32 15
-// CHECK: ret <16 x i8> [[VSET_LANE]]
+// CHECK-LABEL: define <16 x i8> @test_vsetq_lane_p8(
+// CHECK-SAME: i8 noundef signext [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <16 x i8> [[B]], i8 [[A]], i32 15
+// CHECK-NEXT: ret <16 x i8> [[VSET_LANE]]
+//
poly8x16_t test_vsetq_lane_p8(poly8_t a, poly8x16_t b) {
return vsetq_lane_p8(a, b, 15);
}
-// CHECK-LABEL: @test_vsetq_lane_p16(
-// CHECK: [[VSET_LANE:%.*]] = insertelement <8 x i16> %b, i16 %a, i32 7
-// CHECK: ret <8 x i16> [[VSET_LANE]]
+// CHECK-LABEL: define <8 x i16> @test_vsetq_lane_p16(
+// CHECK-SAME: i16 noundef signext [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <8 x i16> [[B]], i16 [[A]], i32 7
+// CHECK-NEXT: ret <8 x i16> [[VSET_LANE]]
+//
poly16x8_t test_vsetq_lane_p16(poly16_t a, poly16x8_t b) {
return vsetq_lane_p16(a, b, 7);
}
-// CHECK-LABEL: @test_vsetq_lane_f32(
-// CHECK: [[VSET_LANE:%.*]] = insertelement <4 x float> %b, float %a, i32 3
-// CHECK: ret <4 x float> [[VSET_LANE]]
+// CHECK-LABEL: define <4 x float> @test_vsetq_lane_f32(
+// CHECK-SAME: float noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <4 x float> [[B]], float [[A]], i32 3
+// CHECK-NEXT: ret <4 x float> [[VSET_LANE]]
+//
float32x4_t test_vsetq_lane_f32(float32_t a, float32x4_t b) {
return vsetq_lane_f32(a, b, 3);
}
-// CHECK-LABEL: @test_vsetq_lane_f16(
-// CHECK: [[__REINT_248:%.*]] = alloca half, align 2
-// CHECK: [[__REINT1_248:%.*]] = alloca <8 x half>, align 16
-// CHECK: [[__REINT2_248:%.*]] = alloca <8 x i16>, align 16
-// CHECK: [[TMP0:%.*]] = load half, ptr %a, align 2
-// CHECK: store half [[TMP0]], ptr [[__REINT_248]], align 2
-// CHECK: store <8 x half> %b, ptr [[__REINT1_248]], align 16
-// CHECK: [[TMP2:%.*]] = load i16, ptr [[__REINT_248]], align 2
-// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[__REINT1_248]], align 16
-// CHECK: [[VSET_LANE:%.*]] = insertelement <8 x i16> [[TMP4]], i16 [[TMP2]], i32 3
-// CHECK: store <8 x i16> [[VSET_LANE]], ptr [[__REINT2_248]], align 16
-// CHECK: [[TMP8:%.*]] = load <8 x half>, ptr [[__REINT2_248]], align 16
-// CHECK: ret <8 x half> [[TMP8]]
+// CHECK-LABEL: define <8 x half> @test_vsetq_lane_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__REINT_855:%.*]] = alloca half, align 2
+// CHECK-NEXT: [[__REINT1_855:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT: [[__REINT2_855:%.*]] = alloca <8 x i16>, align 16
+// CHECK-NEXT: [[TMP0:%.*]] = load half, ptr [[A]], align 2
+// CHECK-NEXT: store half [[TMP0]], ptr [[__REINT_855]], align 2
+// CHECK-NEXT: store <8 x half> [[B]], ptr [[__REINT1_855]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr [[__REINT_855]], align 2
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[__REINT1_855]], align 16
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP1]], i32 3
+// CHECK-NEXT: store <8 x i16> [[VSET_LANE]], ptr [[__REINT2_855]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = load <8 x half>, ptr [[__REINT2_855]], align 16
+// CHECK-NEXT: ret <8 x half> [[TMP3]]
+//
float16x8_t test_vsetq_lane_f16(float16_t *a, float16x8_t b) {
return vsetq_lane_f16(*a, b, 3);
}
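
The regenerated checks above spell out the half <-> i16 reinterpretation that
vsetq_lane_f16 goes through (the test takes a float16_t pointer, apparently so
the lane value is loaded rather than passed by value). A minimal scalar sketch
of that sequence, assuming memcpy-based type punning; the helper name is
hypothetical and not part of the patch:

#include <string.h>
#include <arm_neon.h>

/* Hypothetical helper, illustrative only: models the reinterpret
   sequence in the checks above (half -> i16, insertelement, back). */
static inline float16x8_t set_lane3_f16_sketch(float16_t v, float16x8_t vec) {
  int16_t bits;
  int16x8_t ivec;
  memcpy(&bits, &v, sizeof bits);       /* store/load through the __reint temp */
  memcpy(&ivec, &vec, sizeof ivec);     /* view <8 x half> as <8 x i16> */
  ivec = vsetq_lane_s16(bits, ivec, 3); /* the insertelement at lane 3 */
  memcpy(&vec, &ivec, sizeof vec);      /* back to <8 x half> */
  return vec;
}
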
-// CHECK-LABEL: @test_vset_lane_s64(
-// CHECK: [[VSET_LANE:%.*]] = insertelement <1 x i64> %b, i64 %a, i32 0
-// CHECK: ret <1 x i64> [[VSET_LANE]]
+// CHECK-LABEL: define <1 x i64> @test_vset_lane_s64(
+// CHECK-SAME: i64 noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <1 x i64> [[B]], i64 [[A]], i32 0
+// CHECK-NEXT: ret <1 x i64> [[VSET_LANE]]
+//
int64x1_t test_vset_lane_s64(int64_t a, int64x1_t b) {
return vset_lane_s64(a, b, 0);
}
-// CHECK-LABEL: @test_vset_lane_u64(
-// CHECK: [[VSET_LANE:%.*]] = insertelement <1 x i64> %b, i64 %a, i32 0
-// CHECK: ret <1 x i64> [[VSET_LANE]]
+// CHECK-LABEL: define <1 x i64> @test_vset_lane_u64(
+// CHECK-SAME: i64 noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <1 x i64> [[B]], i64 [[A]], i32 0
+// CHECK-NEXT: ret <1 x i64> [[VSET_LANE]]
+//
uint64x1_t test_vset_lane_u64(uint64_t a, uint64x1_t b) {
return vset_lane_u64(a, b, 0);
}
-// CHECK-LABEL: @test_vsetq_lane_s64(
-// CHECK: [[VSET_LANE:%.*]] = insertelement <2 x i64> %b, i64 %a, i32 1
-// CHECK: ret <2 x i64> [[VSET_LANE]]
+// CHECK-LABEL: define <2 x i64> @test_vsetq_lane_s64(
+// CHECK-SAME: i64 noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <2 x i64> [[B]], i64 [[A]], i32 1
+// CHECK-NEXT: ret <2 x i64> [[VSET_LANE]]
+//
int64x2_t test_vsetq_lane_s64(int64_t a, int64x2_t b) {
return vsetq_lane_s64(a, b, 1);
}
-// CHECK-LABEL: @test_vsetq_lane_u64(
-// CHECK: [[VSET_LANE:%.*]] = insertelement <2 x i64> %b, i64 %a, i32 1
-// CHECK: ret <2 x i64> [[VSET_LANE]]
+// CHECK-LABEL: define <2 x i64> @test_vsetq_lane_u64(
+// CHECK-SAME: i64 noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <2 x i64> [[B]], i64 [[A]], i32 1
+// CHECK-NEXT: ret <2 x i64> [[VSET_LANE]]
+//
uint64x2_t test_vsetq_lane_u64(uint64_t a, uint64x2_t b) {
return vsetq_lane_u64(a, b, 1);
}
-// CHECK-LABEL: @test_vshl_s8(
-// CHECK: [[VSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vshifts.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VSHL_V_I]]
+// CHECK-LABEL: define <8 x i8> @test_vshl_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vshifts.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VSHL_V_I]]
+//
int8x8_t test_vshl_s8(int8x8_t a, int8x8_t b) {
return vshl_s8(a, b);
}
-// CHECK-LABEL: @test_vshl_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vshifts.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VSHL_V3_I:%.*]] = bitcast <4 x i16> [[VSHL_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VSHL_V2_I]]
+// CHECK-LABEL: define <4 x i16> @test_vshl_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vshifts.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: [[VSHL_V3_I:%.*]] = bitcast <4 x i16> [[VSHL_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <4 x i16> [[VSHL_V2_I]]
+//
int16x4_t test_vshl_s16(int16x4_t a, int16x4_t b) {
return vshl_s16(a, b);
}
-// CHECK-LABEL: @test_vshl_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vshifts.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VSHL_V3_I:%.*]] = bitcast <2 x i32> [[VSHL_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VSHL_V2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vshl_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vshifts.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: [[VSHL_V3_I:%.*]] = bitcast <2 x i32> [[VSHL_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <2 x i32> [[VSHL_V2_I]]
+//
int32x2_t test_vshl_s32(int32x2_t a, int32x2_t b) {
return vshl_s32(a, b);
}
-// CHECK-LABEL: @test_vshl_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vshifts.v1i64(<1 x i64> %a, <1 x i64> %b)
-// CHECK: [[VSHL_V3_I:%.*]] = bitcast <1 x i64> [[VSHL_V2_I]] to <8 x i8>
-// CHECK: ret <1 x i64> [[VSHL_V2_I]]
+// CHECK-LABEL: define <1 x i64> @test_vshl_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vshifts.v1i64(<1 x i64> [[A]], <1 x i64> [[B]])
+// CHECK-NEXT: [[VSHL_V3_I:%.*]] = bitcast <1 x i64> [[VSHL_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <1 x i64> [[VSHL_V2_I]]
+//
int64x1_t test_vshl_s64(int64x1_t a, int64x1_t b) {
return vshl_s64(a, b);
}
-// CHECK-LABEL: @test_vshl_u8(
-// CHECK: [[VSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftu.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VSHL_V_I]]
+// CHECK-LABEL: define <8 x i8> @test_vshl_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftu.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VSHL_V_I]]
+//
uint8x8_t test_vshl_u8(uint8x8_t a, int8x8_t b) {
return vshl_u8(a, b);
}
-// CHECK-LABEL: @test_vshl_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftu.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VSHL_V3_I:%.*]] = bitcast <4 x i16> [[VSHL_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VSHL_V2_I]]
+// CHECK-LABEL: define <4 x i16> @test_vshl_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftu.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: [[VSHL_V3_I:%.*]] = bitcast <4 x i16> [[VSHL_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <4 x i16> [[VSHL_V2_I]]
+//
uint16x4_t test_vshl_u16(uint16x4_t a, int16x4_t b) {
return vshl_u16(a, b);
}
-// CHECK-LABEL: @test_vshl_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vshiftu.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VSHL_V3_I:%.*]] = bitcast <2 x i32> [[VSHL_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VSHL_V2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vshl_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vshiftu.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: [[VSHL_V3_I:%.*]] = bitcast <2 x i32> [[VSHL_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <2 x i32> [[VSHL_V2_I]]
+//
uint32x2_t test_vshl_u32(uint32x2_t a, int32x2_t b) {
return vshl_u32(a, b);
}
-// CHECK-LABEL: @test_vshl_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vshiftu.v1i64(<1 x i64> %a, <1 x i64> %b)
-// CHECK: [[VSHL_V3_I:%.*]] = bitcast <1 x i64> [[VSHL_V2_I]] to <8 x i8>
-// CHECK: ret <1 x i64> [[VSHL_V2_I]]
+// CHECK-LABEL: define <1 x i64> @test_vshl_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vshiftu.v1i64(<1 x i64> [[A]], <1 x i64> [[B]])
+// CHECK-NEXT: [[VSHL_V3_I:%.*]] = bitcast <1 x i64> [[VSHL_V2_I]] to <8 x i8>
+// CHECK-NEXT: ret <1 x i64> [[VSHL_V2_I]]
+//
uint64x1_t test_vshl_u64(uint64x1_t a, int64x1_t b) {
return vshl_u64(a, b);
}
-// CHECK-LABEL: @test_vshlq_s8(
-// CHECK: [[VSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vshifts.v16i8(<16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <16 x i8> [[VSHLQ_V_I]]
+// CHECK-LABEL: define <16 x i8> @test_vshlq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vshifts.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <16 x i8> [[VSHLQ_V_I]]
+//
int8x16_t test_vshlq_s8(int8x16_t a, int8x16_t b) {
return vshlq_s8(a, b);
}
-// CHECK-LABEL: @test_vshlq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vshifts.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: [[VSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VSHLQ_V2_I]]
+// CHECK-LABEL: define <8 x i16> @test_vshlq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vshifts.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: [[VSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VSHLQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: ret <8 x i16> [[VSHLQ_V2_I]]
+//
int16x8_t test_vshlq_s16(int16x8_t a, int16x8_t b) {
return vshlq_s16(a, b);
}
-// CHECK-LABEL: @test_vshlq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vshifts.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VSHLQ_V2_I]]
+// CHECK-LABEL: define <4 x i32> @test_vshlq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vshifts.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: [[VSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VSHLQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: ret <4 x i32> [[VSHLQ_V2_I]]
+//
int32x4_t test_vshlq_s32(int32x4_t a, int32x4_t b) {
return vshlq_s32(a, b);
}
-// CHECK-LABEL: @test_vshlq_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vshifts.v2i64(<2 x i64> %a, <2 x i64> %b)
-// CHECK: [[VSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <2 x i64> [[VSHLQ_V2_I]]
+// CHECK-LABEL: define <2 x i64> @test_vshlq_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vshifts.v2i64(<2 x i64> [[A]], <2 x i64> [[B]])
+// CHECK-NEXT: [[VSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VSHLQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: ret <2 x i64> [[VSHLQ_V2_I]]
+//
int64x2_t test_vshlq_s64(int64x2_t a, int64x2_t b) {
return vshlq_s64(a, b);
}
-// CHECK-LABEL: @test_vshlq_u8(
-// CHECK: [[VSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftu.v16i8(<16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <16 x i8> [[VSHLQ_V_I]]
+// CHECK-LABEL: define <16 x i8> @test_vshlq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftu.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <16 x i8> [[VSHLQ_V_I]]
+//
uint8x16_t test_vshlq_u8(uint8x16_t a, int8x16_t b) {
return vshlq_u8(a, b);
}
-// CHECK-LABEL: @test_vshlq_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftu.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: [[VSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VSHLQ_V2_I]]
+// CHECK-LABEL: define <8 x i16> @test_vshlq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftu.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: [[VSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VSHLQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: ret <8 x i16> [[VSHLQ_V2_I]]
+//
uint16x8_t test_vshlq_u16(uint16x8_t a, int16x8_t b) {
return vshlq_u16(a, b);
}
-// CHECK-LABEL: @test_vshlq_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vshiftu.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VSHLQ_V2_I]]
+// CHECK-LABEL: define <4 x i32> @test_vshlq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vshiftu.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: [[VSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VSHLQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: ret <4 x i32> [[VSHLQ_V2_I]]
+//
uint32x4_t test_vshlq_u32(uint32x4_t a, int32x4_t b) {
return vshlq_u32(a, b);
}
-// CHECK-LABEL: @test_vshlq_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vshiftu.v2i64(<2 x i64> %a, <2 x i64> %b)
-// CHECK: [[VSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <2 x i64> [[VSHLQ_V2_I]]
+// CHECK-LABEL: define <2 x i64> @test_vshlq_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vshiftu.v2i64(<2 x i64> [[A]], <2 x i64> [[B]])
+// CHECK-NEXT: [[VSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VSHLQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: ret <2 x i64> [[VSHLQ_V2_I]]
+//
uint64x2_t test_vshlq_u64(uint64x2_t a, int64x2_t b) {
return vshlq_u64(a, b);
}
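
The vshl_*/vshlq_* register forms above all lower to llvm.arm.neon.vshifts or
llvm.arm.neon.vshiftu; note that the count operand is a signed vector even for
the unsigned variants, because a negative per-lane count shifts that lane
right. A small usage sketch (function name illustrative):

#include <arm_neon.h>

/* Illustrative only: per-lane signed counts, negative lanes shift right. */
int8x8_t shl_by_lane_demo(int8x8_t x) {
  const int8x8_t counts = {1, 1, 1, 1, -2, -2, -2, -2};
  return vshl_s8(x, counts); /* lanes 0-3: x << 1, lanes 4-7: x >> 2 */
}
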
-// CHECK-LABEL: @test_vshll_n_s8(
-// CHECK: [[TMP0:%.*]] = sext <8 x i8> %a to <8 x i16>
-// CHECK: [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], splat (i16 1)
-// CHECK: ret <8 x i16> [[VSHLL_N]]
+// CHECK-LABEL: define <8 x i16> @test_vshll_n_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = sext <8 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], splat (i16 1)
+// CHECK-NEXT: ret <8 x i16> [[VSHLL_N]]
+//
int16x8_t test_vshll_n_s8(int8x8_t a) {
return vshll_n_s8(a, 1);
}
-// CHECK-LABEL: @test_vshll_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
-// CHECK: [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 1)
-// CHECK: ret <4 x i32> [[VSHLL_N]]
+// CHECK-LABEL: define <4 x i32> @test_vshll_n_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 1)
+// CHECK-NEXT: ret <4 x i32> [[VSHLL_N]]
+//
int32x4_t test_vshll_n_s16(int16x4_t a) {
return vshll_n_s16(a, 1);
}
-// CHECK-LABEL: @test_vshll_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
-// CHECK: [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], splat (i64 1)
-// CHECK: ret <2 x i64> [[VSHLL_N]]
+// CHECK-LABEL: define <2 x i64> @test_vshll_n_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK-NEXT: [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], splat (i64 1)
+// CHECK-NEXT: ret <2 x i64> [[VSHLL_N]]
+//
int64x2_t test_vshll_n_s32(int32x2_t a) {
return vshll_n_s32(a, 1);
}
-// CHECK-LABEL: @test_vshll_n_u8(
-// CHECK: [[TMP0:%.*]] = zext <8 x i8> %a to <8 x i16>
-// CHECK: [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], splat (i16 1)
-// CHECK: ret <8 x i16> [[VSHLL_N]]
+// CHECK-LABEL: define <8 x i16> @test_vshll_n_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = zext <8 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], splat (i16 1)
+// CHECK-NEXT: ret <8 x i16> [[VSHLL_N]]
+//
uint16x8_t test_vshll_n_u8(uint8x8_t a) {
return vshll_n_u8(a, 1);
}
-// CHECK-LABEL: @test_vshll_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
-// CHECK: [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 1)
-// CHECK: ret <4 x i32> [[VSHLL_N]]
+// CHECK-LABEL: define <4 x i32> @test_vshll_n_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 1)
+// CHECK-NEXT: ret <4 x i32> [[VSHLL_N]]
+//
uint32x4_t test_vshll_n_u16(uint16x4_t a) {
return vshll_n_u16(a, 1);
}
-// CHECK-LABEL: @test_vshll_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
-// CHECK: [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], splat (i64 1)
-// CHECK: ret <2 x i64> [[VSHLL_N]]
+// CHECK-LABEL: define <2 x i64> @test_vshll_n_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK-NEXT: [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], splat (i64 1)
+// CHECK-NEXT: ret <2 x i64> [[VSHLL_N]]
+//
uint64x2_t test_vshll_n_u32(uint32x2_t a) {
return vshll_n_u32(a, 1);
}
-// CHECK-LABEL: @test_vshl_n_s8(
-// CHECK: [[VSHL_N:%.*]] = shl <8 x i8> %a, splat (i8 1)
-// CHECK: ret <8 x i8> [[VSHL_N]]
+// CHECK-LABEL: define <8 x i8> @test_vshl_n_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHL_N:%.*]] = shl <8 x i8> [[A]], splat (i8 1)
+// CHECK-NEXT: ret <8 x i8> [[VSHL_N]]
+//
int8x8_t test_vshl_n_s8(int8x8_t a) {
return vshl_n_s8(a, 1);
}
-// CHECK-LABEL: @test_vshl_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VSHL_N:%.*]] = shl <4 x i16> [[TMP1]], splat (i16 1)
-// CHECK: ret <4 x i16> [[VSHL_N]]
+// CHECK-LABEL: define <4 x i16> @test_vshl_n_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VSHL_N:%.*]] = shl <4 x i16> [[TMP1]], splat (i16 1)
+// CHECK-NEXT: ret <4 x i16> [[VSHL_N]]
+//
int16x4_t test_vshl_n_s16(int16x4_t a) {
return vshl_n_s16(a, 1);
}
-// CHECK-LABEL: @test_vshl_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VSHL_N:%.*]] = shl <2 x i32> [[TMP1]], splat (i32 1)
-// CHECK: ret <2 x i32> [[VSHL_N]]
+// CHECK-LABEL: define <2 x i32> @test_vshl_n_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VSHL_N:%.*]] = shl <2 x i32> [[TMP1]], splat (i32 1)
+// CHECK-NEXT: ret <2 x i32> [[VSHL_N]]
+//
int32x2_t test_vshl_n_s32(int32x2_t a) {
return vshl_n_s32(a, 1);
}
-// CHECK-LABEL: @test_vshl_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VSHL_N:%.*]] = shl <1 x i64> [[TMP1]], splat (i64 1)
-// CHECK: ret <1 x i64> [[VSHL_N]]
+// CHECK-LABEL: define <1 x i64> @test_vshl_n_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: [[VSHL_N:%.*]] = shl <1 x i64> [[TMP1]], splat (i64 1)
+// CHECK-NEXT: ret <1 x i64> [[VSHL_N]]
+//
int64x1_t test_vshl_n_s64(int64x1_t a) {
return vshl_n_s64(a, 1);
}
-// CHECK-LABEL: @test_vshl_n_u8(
-// CHECK: [[VSHL_N:%.*]] = shl <8 x i8> %a, splat (i8 1)
-// CHECK: ret <8 x i8> [[VSHL_N]]
+// CHECK-LABEL: define <8 x i8> @test_vshl_n_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHL_N:%.*]] = shl <8 x i8> [[A]], splat (i8 1)
+// CHECK-NEXT: ret <8 x i8> [[VSHL_N]]
+//
uint8x8_t test_vshl_n_u8(uint8x8_t a) {
return vshl_n_u8(a, 1);
}
-// CHECK-LABEL: @test_vshl_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VSHL_N:%.*]] = shl <4 x i16> [[TMP1]], splat (i16 1)
-// CHECK: ret <4 x i16> [[VSHL_N]]
+// CHECK-LABEL: define <4 x i16> @test_vshl_n_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VSHL_N:%.*]] = shl <4 x i16> [[TMP1]], splat (i16 1)
+// CHECK-NEXT: ret <4 x i16> [[VSHL_N]]
+//
uint16x4_t test_vshl_n_u16(uint16x4_t a) {
return vshl_n_u16(a, 1);
}
-// CHECK-LABEL: @test_vshl_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VSHL_N:%.*]] = shl <2 x i32> [[TMP1]], splat (i32 1)
-// CHECK: ret <2 x i32> [[VSHL_N]]
+// CHECK-LABEL: define <2 x i32> @test_vshl_n_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VSHL_N:%.*]] = shl <2 x i32> [[TMP1]], splat (i32 1)
+// CHECK-NEXT: ret <2 x i32> [[VSHL_N]]
+//
uint32x2_t test_vshl_n_u32(uint32x2_t a) {
return vshl_n_u32(a, 1);
}
-// CHECK-LABEL: @test_vshl_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VSHL_N:%.*]] = shl <1 x i64> [[TMP1]], splat (i64 1)
-// CHECK: ret <1 x i64> [[VSHL_N]]
+// CHECK-LABEL: define <1 x i64> @test_vshl_n_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: [[VSHL_N:%.*]] = shl <1 x i64> [[TMP1]], splat (i64 1)
+// CHECK-NEXT: ret <1 x i64> [[VSHL_N]]
+//
uint64x1_t test_vshl_n_u64(uint64x1_t a) {
return vshl_n_u64(a, 1);
}
-// CHECK-LABEL: @test_vshlq_n_s8(
-// CHECK: [[VSHL_N:%.*]] = shl <16 x i8> %a, splat (i8 1)
-// CHECK: ret <16 x i8> [[VSHL_N]]
+// CHECK-LABEL: define <16 x i8> @test_vshlq_n_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHL_N:%.*]] = shl <16 x i8> [[A]], splat (i8 1)
+// CHECK-NEXT: ret <16 x i8> [[VSHL_N]]
+//
int8x16_t test_vshlq_n_s8(int8x16_t a) {
return vshlq_n_s8(a, 1);
}
-// CHECK-LABEL: @test_vshlq_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VSHL_N:%.*]] = shl <8 x i16> [[TMP1]], splat (i16 1)
-// CHECK: ret <8 x i16> [[VSHL_N]]
+// CHECK-LABEL: define <8 x i16> @test_vshlq_n_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VSHL_N:%.*]] = shl <8 x i16> [[TMP1]], splat (i16 1)
+// CHECK-NEXT: ret <8 x i16> [[VSHL_N]]
+//
int16x8_t test_vshlq_n_s16(int16x8_t a) {
return vshlq_n_s16(a, 1);
}
-// CHECK-LABEL: @test_vshlq_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VSHL_N:%.*]] = shl <4 x i32> [[TMP1]], splat (i32 1)
-// CHECK: ret <4 x i32> [[VSHL_N]]
+// CHECK-LABEL: define <4 x i32> @test_vshlq_n_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VSHL_N:%.*]] = shl <4 x i32> [[TMP1]], splat (i32 1)
+// CHECK-NEXT: ret <4 x i32> [[VSHL_N]]
+//
int32x4_t test_vshlq_n_s32(int32x4_t a) {
return vshlq_n_s32(a, 1);
}
-// CHECK-LABEL: @test_vshlq_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VSHL_N:%.*]] = shl <2 x i64> [[TMP1]], splat (i64 1)
-// CHECK: ret <2 x i64> [[VSHL_N]]
+// CHECK-LABEL: define <2 x i64> @test_vshlq_n_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[VSHL_N:%.*]] = shl <2 x i64> [[TMP1]], splat (i64 1)
+// CHECK-NEXT: ret <2 x i64> [[VSHL_N]]
+//
int64x2_t test_vshlq_n_s64(int64x2_t a) {
return vshlq_n_s64(a, 1);
}
-// CHECK-LABEL: @test_vshlq_n_u8(
-// CHECK: [[VSHL_N:%.*]] = shl <16 x i8> %a, splat (i8 1)
-// CHECK: ret <16 x i8> [[VSHL_N]]
+// CHECK-LABEL: define <16 x i8> @test_vshlq_n_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHL_N:%.*]] = shl <16 x i8> [[A]], splat (i8 1)
+// CHECK-NEXT: ret <16 x i8> [[VSHL_N]]
+//
uint8x16_t test_vshlq_n_u8(uint8x16_t a) {
return vshlq_n_u8(a, 1);
}
-// CHECK-LABEL: @test_vshlq_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VSHL_N:%.*]] = shl <8 x i16> [[TMP1]], splat (i16 1)
-// CHECK: ret <8 x i16> [[VSHL_N]]
+// CHECK-LABEL: define <8 x i16> @test_vshlq_n_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VSHL_N:%.*]] = shl <8 x i16> [[TMP1]], splat (i16 1)
+// CHECK-NEXT: ret <8 x i16> [[VSHL_N]]
+//
uint16x8_t test_vshlq_n_u16(uint16x8_t a) {
return vshlq_n_u16(a, 1);
}
-// CHECK-LABEL: @test_vshlq_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VSHL_N:%.*]] = shl <4 x i32> [[TMP1]], splat (i32 1)
-// CHECK: ret <4 x i32> [[VSHL_N]]
+// CHECK-LABEL: define <4 x i32> @test_vshlq_n_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VSHL_N:%.*]] = shl <4 x i32> [[TMP1]], splat (i32 1)
+// CHECK-NEXT: ret <4 x i32> [[VSHL_N]]
+//
uint32x4_t test_vshlq_n_u32(uint32x4_t a) {
return vshlq_n_u32(a, 1);
}
-// CHECK-LABEL: @test_vshlq_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VSHL_N:%.*]] = shl <2 x i64> [[TMP1]], splat (i64 1)
-// CHECK: ret <2 x i64> [[VSHL_N]]
+// CHECK-LABEL: define <2 x i64> @test_vshlq_n_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[VSHL_N:%.*]] = shl <2 x i64> [[TMP1]], splat (i64 1)
+// CHECK-NEXT: ret <2 x i64> [[VSHL_N]]
+//
uint64x2_t test_vshlq_n_u64(uint64x2_t a) {
return vshlq_n_u64(a, 1);
}
-// CHECK-LABEL: @test_vshrn_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[TMP2:%.*]] = ashr <8 x i16> [[TMP1]], splat (i16 1)
-// CHECK: [[VSHRN_N:%.*]] = trunc <8 x i16> [[TMP2]] to <8 x i8>
-// CHECK: ret <8 x i8> [[VSHRN_N]]
+// CHECK-LABEL: define <8 x i8> @test_vshrn_n_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = ashr <8 x i16> [[TMP1]], splat (i16 1)
+// CHECK-NEXT: [[VSHRN_N:%.*]] = trunc <8 x i16> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[VSHRN_N]]
+//
int8x8_t test_vshrn_n_s16(int16x8_t a) {
return vshrn_n_s16(a, 1);
}
-// CHECK-LABEL: @test_vshrn_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], splat (i32 1)
-// CHECK: [[VSHRN_N:%.*]] = trunc <4 x i32> [[TMP2]] to <4 x i16>
-// CHECK: ret <4 x i16> [[VSHRN_N]]
+// CHECK-LABEL: define <4 x i16> @test_vshrn_n_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], splat (i32 1)
+// CHECK-NEXT: [[VSHRN_N:%.*]] = trunc <4 x i32> [[TMP2]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[VSHRN_N]]
+//
int16x4_t test_vshrn_n_s32(int32x4_t a) {
return vshrn_n_s32(a, 1);
}
-// CHECK-LABEL: @test_vshrn_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[TMP2:%.*]] = ashr <2 x i64> [[TMP1]], splat (i64 1)
-// CHECK: [[VSHRN_N:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i32>
-// CHECK: ret <2 x i32> [[VSHRN_N]]
+// CHECK-LABEL: define <2 x i32> @test_vshrn_n_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = ashr <2 x i64> [[TMP1]], splat (i64 1)
+// CHECK-NEXT: [[VSHRN_N:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[VSHRN_N]]
+//
int32x2_t test_vshrn_n_s64(int64x2_t a) {
return vshrn_n_s64(a, 1);
}
-// CHECK-LABEL: @test_vshrn_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[TMP2:%.*]] = lshr <8 x i16> [[TMP1]], splat (i16 1)
-// CHECK: [[VSHRN_N:%.*]] = trunc <8 x i16> [[TMP2]] to <8 x i8>
-// CHECK: ret <8 x i8> [[VSHRN_N]]
+// CHECK-LABEL: define <8 x i8> @test_vshrn_n_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = lshr <8 x i16> [[TMP1]], splat (i16 1)
+// CHECK-NEXT: [[VSHRN_N:%.*]] = trunc <8 x i16> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[VSHRN_N]]
+//
uint8x8_t test_vshrn_n_u16(uint16x8_t a) {
return vshrn_n_u16(a, 1);
}
-// CHECK-LABEL: @test_vshrn_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[TMP2:%.*]] = lshr <4 x i32> [[TMP1]], splat (i32 1)
-// CHECK: [[VSHRN_N:%.*]] = trunc <4 x i32> [[TMP2]] to <4 x i16>
-// CHECK: ret <4 x i16> [[VSHRN_N]]
+// CHECK-LABEL: define <4 x i16> @test_vshrn_n_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = lshr <4 x i32> [[TMP1]], splat (i32 1)
+// CHECK-NEXT: [[VSHRN_N:%.*]] = trunc <4 x i32> [[TMP2]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[VSHRN_N]]
+//
uint16x4_t test_vshrn_n_u32(uint32x4_t a) {
return vshrn_n_u32(a, 1);
}
-// CHECK-LABEL: @test_vshrn_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[TMP2:%.*]] = lshr <2 x i64> [[TMP1]], splat (i64 1)
-// CHECK: [[VSHRN_N:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i32>
-// CHECK: ret <2 x i32> [[VSHRN_N]]
+// CHECK-LABEL: define <2 x i32> @test_vshrn_n_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = lshr <2 x i64> [[TMP1]], splat (i64 1)
+// CHECK-NEXT: [[VSHRN_N:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[VSHRN_N]]
+//
uint32x2_t test_vshrn_n_u64(uint64x2_t a) {
return vshrn_n_u64(a, 1);
}
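
For the narrowing shifts the updated checks keep the same shape as before: a
plain ashr (signed) or lshr (unsigned) followed by a trunc to the half-width
element type. Per lane that amounts to the following sketch (helper name made
up for illustration):

#include <stdint.h>

/* One lane of vshrn_n_s16(a, 1): arithmetic shift, then narrow. */
int8_t shrn_s16_lane(int16_t lane) {
  return (int8_t)(lane >> 1); /* ashr <8 x i16> ..., splat (i16 1); trunc to i8 */
}
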
-// CHECK-LABEL: @test_vshr_n_s8(
-// CHECK: [[VSHR_N:%.*]] = ashr <8 x i8> %a, splat (i8 1)
-// CHECK: ret <8 x i8> [[VSHR_N]]
+// CHECK-LABEL: define <8 x i8> @test_vshr_n_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHR_N:%.*]] = ashr <8 x i8> [[A]], splat (i8 1)
+// CHECK-NEXT: ret <8 x i8> [[VSHR_N]]
+//
int8x8_t test_vshr_n_s8(int8x8_t a) {
return vshr_n_s8(a, 1);
}
-// CHECK-LABEL: @test_vshr_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VSHR_N:%.*]] = ashr <4 x i16> [[TMP1]], splat (i16 1)
-// CHECK: ret <4 x i16> [[VSHR_N]]
+// CHECK-LABEL: define <4 x i16> @test_vshr_n_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VSHR_N:%.*]] = ashr <4 x i16> [[TMP1]], splat (i16 1)
+// CHECK-NEXT: ret <4 x i16> [[VSHR_N]]
+//
int16x4_t test_vshr_n_s16(int16x4_t a) {
return vshr_n_s16(a, 1);
}
-// CHECK-LABEL: @test_vshr_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VSHR_N:%.*]] = ashr <2 x i32> [[TMP1]], splat (i32 1)
-// CHECK: ret <2 x i32> [[VSHR_N]]
+// CHECK-LABEL: define <2 x i32> @test_vshr_n_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VSHR_N:%.*]] = ashr <2 x i32> [[TMP1]], splat (i32 1)
+// CHECK-NEXT: ret <2 x i32> [[VSHR_N]]
+//
int32x2_t test_vshr_n_s32(int32x2_t a) {
return vshr_n_s32(a, 1);
}
-// CHECK-LABEL: @test_vshr_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VSHR_N:%.*]] = ashr <1 x i64> [[TMP1]], splat (i64 1)
-// CHECK: ret <1 x i64> [[VSHR_N]]
+// CHECK-LABEL: define <1 x i64> @test_vshr_n_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: [[VSHR_N:%.*]] = ashr <1 x i64> [[TMP1]], splat (i64 1)
+// CHECK-NEXT: ret <1 x i64> [[VSHR_N]]
+//
int64x1_t test_vshr_n_s64(int64x1_t a) {
return vshr_n_s64(a, 1);
}
-// CHECK-LABEL: @test_vshr_n_u8(
-// CHECK: [[VSHR_N:%.*]] = lshr <8 x i8> %a, splat (i8 1)
-// CHECK: ret <8 x i8> [[VSHR_N]]
+// CHECK-LABEL: define <8 x i8> @test_vshr_n_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHR_N:%.*]] = lshr <8 x i8> [[A]], splat (i8 1)
+// CHECK-NEXT: ret <8 x i8> [[VSHR_N]]
+//
uint8x8_t test_vshr_n_u8(uint8x8_t a) {
return vshr_n_u8(a, 1);
}
-// CHECK-LABEL: @test_vshr_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VSHR_N:%.*]] = lshr <4 x i16> [[TMP1]], splat (i16 1)
-// CHECK: ret <4 x i16> [[VSHR_N]]
+// CHECK-LABEL: define <4 x i16> @test_vshr_n_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VSHR_N:%.*]] = lshr <4 x i16> [[TMP1]], splat (i16 1)
+// CHECK-NEXT: ret <4 x i16> [[VSHR_N]]
+//
uint16x4_t test_vshr_n_u16(uint16x4_t a) {
return vshr_n_u16(a, 1);
}
-// CHECK-LABEL: @test_vshr_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VSHR_N:%.*]] = lshr <2 x i32> [[TMP1]], splat (i32 1)
-// CHECK: ret <2 x i32> [[VSHR_N]]
+// CHECK-LABEL: define <2 x i32> @test_vshr_n_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VSHR_N:%.*]] = lshr <2 x i32> [[TMP1]], splat (i32 1)
+// CHECK-NEXT: ret <2 x i32> [[VSHR_N]]
+//
uint32x2_t test_vshr_n_u32(uint32x2_t a) {
return vshr_n_u32(a, 1);
}
-// CHECK-LABEL: @test_vshr_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VSHR_N:%.*]] = lshr <1 x i64> [[TMP1]], splat (i64 1)
-// CHECK: ret <1 x i64> [[VSHR_N]]
+// CHECK-LABEL: define <1 x i64> @test_vshr_n_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: [[VSHR_N:%.*]] = lshr <1 x i64> [[TMP1]], splat (i64 1)
+// CHECK-NEXT: ret <1 x i64> [[VSHR_N]]
+//
uint64x1_t test_vshr_n_u64(uint64x1_t a) {
return vshr_n_u64(a, 1);
}
-// CHECK-LABEL: @test_vshrq_n_s8(
-// CHECK: [[VSHR_N:%.*]] = ashr <16 x i8> %a, splat (i8 1)
-// CHECK: ret <16 x i8> [[VSHR_N]]
+// CHECK-LABEL: define <16 x i8> @test_vshrq_n_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHR_N:%.*]] = ashr <16 x i8> [[A]], splat (i8 1)
+// CHECK-NEXT: ret <16 x i8> [[VSHR_N]]
+//
int8x16_t test_vshrq_n_s8(int8x16_t a) {
return vshrq_n_s8(a, 1);
}
-// CHECK-LABEL: @test_vshrq_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VSHR_N:%.*]] = ashr <8 x i16> [[TMP1]], splat (i16 1)
-// CHECK: ret <8 x i16> [[VSHR_N]]
+// CHECK-LABEL: define <8 x i16> @test_vshrq_n_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VSHR_N:%.*]] = ashr <8 x i16> [[TMP1]], splat (i16 1)
+// CHECK-NEXT: ret <8 x i16> [[VSHR_N]]
+//
int16x8_t test_vshrq_n_s16(int16x8_t a) {
return vshrq_n_s16(a, 1);
}
-// CHECK-LABEL: @test_vshrq_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VSHR_N:%.*]] = ashr <4 x i32> [[TMP1]], splat (i32 1)
-// CHECK: ret <4 x i32> [[VSHR_N]]
+// CHECK-LABEL: define <4 x i32> @test_vshrq_n_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VSHR_N:%.*]] = ashr <4 x i32> [[TMP1]], splat (i32 1)
+// CHECK-NEXT: ret <4 x i32> [[VSHR_N]]
+//
int32x4_t test_vshrq_n_s32(int32x4_t a) {
return vshrq_n_s32(a, 1);
}
-// CHECK-LABEL: @test_vshrq_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VSHR_N:%.*]] = ashr <2 x i64> [[TMP1]], splat (i64 1)
-// CHECK: ret <2 x i64> [[VSHR_N]]
+// CHECK-LABEL: define <2 x i64> @test_vshrq_n_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[VSHR_N:%.*]] = ashr <2 x i64> [[TMP1]], splat (i64 1)
+// CHECK-NEXT: ret <2 x i64> [[VSHR_N]]
+//
int64x2_t test_vshrq_n_s64(int64x2_t a) {
return vshrq_n_s64(a, 1);
}
-// CHECK-LABEL: @test_vshrq_n_u8(
-// CHECK: [[VSHR_N:%.*]] = lshr <16 x i8> %a, splat (i8 1)
-// CHECK: ret <16 x i8> [[VSHR_N]]
+// CHECK-LABEL: define <16 x i8> @test_vshrq_n_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHR_N:%.*]] = lshr <16 x i8> [[A]], splat (i8 1)
+// CHECK-NEXT: ret <16 x i8> [[VSHR_N]]
+//
uint8x16_t test_vshrq_n_u8(uint8x16_t a) {
return vshrq_n_u8(a, 1);
}
-// CHECK-LABEL: @test_vshrq_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VSHR_N:%.*]] = lshr <8 x i16> [[TMP1]], splat (i16 1)
-// CHECK: ret <8 x i16> [[VSHR_N]]
+// CHECK-LABEL: define <8 x i16> @test_vshrq_n_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VSHR_N:%.*]] = lshr <8 x i16> [[TMP1]], splat (i16 1)
+// CHECK-NEXT: ret <8 x i16> [[VSHR_N]]
+//
uint16x8_t test_vshrq_n_u16(uint16x8_t a) {
return vshrq_n_u16(a, 1);
}
-// CHECK-LABEL: @test_vshrq_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VSHR_N:%.*]] = lshr <4 x i32> [[TMP1]], splat (i32 1)
-// CHECK: ret <4 x i32> [[VSHR_N]]
+// CHECK-LABEL: define <4 x i32> @test_vshrq_n_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VSHR_N:%.*]] = lshr <4 x i32> [[TMP1]], splat (i32 1)
+// CHECK-NEXT: ret <4 x i32> [[VSHR_N]]
+//
uint32x4_t test_vshrq_n_u32(uint32x4_t a) {
return vshrq_n_u32(a, 1);
}
-// CHECK-LABEL: @test_vshrq_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VSHR_N:%.*]] = lshr <2 x i64> [[TMP1]], splat (i64 1)
-// CHECK: ret <2 x i64> [[VSHR_N]]
+// CHECK-LABEL: define <2 x i64> @test_vshrq_n_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[VSHR_N:%.*]] = lshr <2 x i64> [[TMP1]], splat (i64 1)
+// CHECK-NEXT: ret <2 x i64> [[VSHR_N]]
+//
uint64x2_t test_vshrq_n_u64(uint64x2_t a) {
return vshrq_n_u64(a, 1);
}
-// CHECK-LABEL: @test_vsli_n_s8(
-// CHECK: [[VSLI_N:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> splat (i8 1))
-// CHECK: ret <8 x i8> [[VSLI_N]]
+// CHECK-LABEL: define <8 x i8> @test_vsli_n_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> [[A]], <8 x i8> [[B]], <8 x i8> splat (i8 1))
+// CHECK-NEXT: ret <8 x i8> [[VSLI_N]]
+//
int8x8_t test_vsli_n_s8(int8x8_t a, int8x8_t b) {
return vsli_n_s8(a, b, 1);
}
-// CHECK-LABEL: @test_vsli_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VSLI_N2:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], <4 x i16> splat (i16 1))
-// CHECK: ret <4 x i16> [[VSLI_N2]]
+// CHECK-LABEL: define <4 x i16> @test_vsli_n_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], <4 x i16> splat (i16 1))
+// CHECK-NEXT: ret <4 x i16> [[VSLI_N2]]
+//
int16x4_t test_vsli_n_s16(int16x4_t a, int16x4_t b) {
return vsli_n_s16(a, b, 1);
}
-// CHECK-LABEL: @test_vsli_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VSLI_N2:%.*]] = call <2 x i32> @llvm.arm.neon.vshiftins.v2i32(<2 x i32> [[VSLI_N]], <2 x i32> [[VSLI_N1]], <2 x i32> splat (i32 1))
-// CHECK: ret <2 x i32> [[VSLI_N2]]
+// CHECK-LABEL: define <2 x i32> @test_vsli_n_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <2 x i32> @llvm.arm.neon.vshiftins.v2i32(<2 x i32> [[VSLI_N]], <2 x i32> [[VSLI_N1]], <2 x i32> splat (i32 1))
+// CHECK-NEXT: ret <2 x i32> [[VSLI_N2]]
+//
int32x2_t test_vsli_n_s32(int32x2_t a, int32x2_t b) {
return vsli_n_s32(a, b, 1);
}
-// CHECK-LABEL: @test_vsli_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[VSLI_N2:%.*]] = call <1 x i64> @llvm.arm.neon.vshiftins.v1i64(<1 x i64> [[VSLI_N]], <1 x i64> [[VSLI_N1]], <1 x i64> splat (i64 1))
-// CHECK: ret <1 x i64> [[VSLI_N2]]
+// CHECK-LABEL: define <1 x i64> @test_vsli_n_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <1 x i64> @llvm.arm.neon.vshiftins.v1i64(<1 x i64> [[VSLI_N]], <1 x i64> [[VSLI_N1]], <1 x i64> splat (i64 1))
+// CHECK-NEXT: ret <1 x i64> [[VSLI_N2]]
+//
int64x1_t test_vsli_n_s64(int64x1_t a, int64x1_t b) {
return vsli_n_s64(a, b, 1);
}
-// CHECK-LABEL: @test_vsli_n_u8(
-// CHECK: [[VSLI_N:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> splat (i8 1))
-// CHECK: ret <8 x i8> [[VSLI_N]]
+// CHECK-LABEL: define <8 x i8> @test_vsli_n_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> [[A]], <8 x i8> [[B]], <8 x i8> splat (i8 1))
+// CHECK-NEXT: ret <8 x i8> [[VSLI_N]]
+//
uint8x8_t test_vsli_n_u8(uint8x8_t a, uint8x8_t b) {
return vsli_n_u8(a, b, 1);
}
-// CHECK-LABEL: @test_vsli_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VSLI_N2:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], <4 x i16> splat (i16 1))
-// CHECK: ret <4 x i16> [[VSLI_N2]]
+// CHECK-LABEL: define <4 x i16> @test_vsli_n_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], <4 x i16> splat (i16 1))
+// CHECK-NEXT: ret <4 x i16> [[VSLI_N2]]
+//
uint16x4_t test_vsli_n_u16(uint16x4_t a, uint16x4_t b) {
return vsli_n_u16(a, b, 1);
}
-// CHECK-LABEL: @test_vsli_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VSLI_N2:%.*]] = call <2 x i32> @llvm.arm.neon.vshiftins.v2i32(<2 x i32> [[VSLI_N]], <2 x i32> [[VSLI_N1]], <2 x i32> splat (i32 1))
-// CHECK: ret <2 x i32> [[VSLI_N2]]
+// CHECK-LABEL: define <2 x i32> @test_vsli_n_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <2 x i32> @llvm.arm.neon.vshiftins.v2i32(<2 x i32> [[VSLI_N]], <2 x i32> [[VSLI_N1]], <2 x i32> splat (i32 1))
+// CHECK-NEXT: ret <2 x i32> [[VSLI_N2]]
+//
uint32x2_t test_vsli_n_u32(uint32x2_t a, uint32x2_t b) {
return vsli_n_u32(a, b, 1);
}
-// CHECK-LABEL: @test_vsli_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[VSLI_N2:%.*]] = call <1 x i64> @llvm.arm.neon.vshiftins.v1i64(<1 x i64> [[VSLI_N]], <1 x i64> [[VSLI_N1]], <1 x i64> splat (i64 1))
-// CHECK: ret <1 x i64> [[VSLI_N2]]
+// CHECK-LABEL: define <1 x i64> @test_vsli_n_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <1 x i64> @llvm.arm.neon.vshiftins.v1i64(<1 x i64> [[VSLI_N]], <1 x i64> [[VSLI_N1]], <1 x i64> splat (i64 1))
+// CHECK-NEXT: ret <1 x i64> [[VSLI_N2]]
+//
uint64x1_t test_vsli_n_u64(uint64x1_t a, uint64x1_t b) {
return vsli_n_u64(a, b, 1);
}
-// CHECK-LABEL: @test_vsli_n_p8(
-// CHECK: [[VSLI_N:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> splat (i8 1))
-// CHECK: ret <8 x i8> [[VSLI_N]]
+// CHECK-LABEL: define <8 x i8> @test_vsli_n_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> [[A]], <8 x i8> [[B]], <8 x i8> splat (i8 1))
+// CHECK-NEXT: ret <8 x i8> [[VSLI_N]]
+//
poly8x8_t test_vsli_n_p8(poly8x8_t a, poly8x8_t b) {
return vsli_n_p8(a, b, 1);
}
-// CHECK-LABEL: @test_vsli_n_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VSLI_N2:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], <4 x i16> splat (i16 1))
-// CHECK: ret <4 x i16> [[VSLI_N2]]
+// CHECK-LABEL: define <4 x i16> @test_vsli_n_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], <4 x i16> splat (i16 1))
+// CHECK-NEXT: ret <4 x i16> [[VSLI_N2]]
+//
poly16x4_t test_vsli_n_p16(poly16x4_t a, poly16x4_t b) {
return vsli_n_p16(a, b, 1);
}
-// CHECK-LABEL: @test_vsliq_n_s8(
-// CHECK: [[VSLI_N:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> splat (i8 1))
-// CHECK: ret <16 x i8> [[VSLI_N]]
+// CHECK-LABEL: define <16 x i8> @test_vsliq_n_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> splat (i8 1))
+// CHECK-NEXT: ret <16 x i8> [[VSLI_N]]
+//
int8x16_t test_vsliq_n_s8(int8x16_t a, int8x16_t b) {
return vsliq_n_s8(a, b, 1);
}
-// CHECK-LABEL: @test_vsliq_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VSLI_N2:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], <8 x i16> splat (i16 1))
-// CHECK: ret <8 x i16> [[VSLI_N2]]
+// CHECK-LABEL: define <8 x i16> @test_vsliq_n_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], <8 x i16> splat (i16 1))
+// CHECK-NEXT: ret <8 x i16> [[VSLI_N2]]
+//
int16x8_t test_vsliq_n_s16(int16x8_t a, int16x8_t b) {
return vsliq_n_s16(a, b, 1);
}
-// CHECK-LABEL: @test_vsliq_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VSLI_N2:%.*]] = call <4 x i32> @llvm.arm.neon.vshiftins.v4i32(<4 x i32> [[VSLI_N]], <4 x i32> [[VSLI_N1]], <4 x i32> splat (i32 1))
-// CHECK: ret <4 x i32> [[VSLI_N2]]
+// CHECK-LABEL: define <4 x i32> @test_vsliq_n_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <4 x i32> @llvm.arm.neon.vshiftins.v4i32(<4 x i32> [[VSLI_N]], <4 x i32> [[VSLI_N1]], <4 x i32> splat (i32 1))
+// CHECK-NEXT: ret <4 x i32> [[VSLI_N2]]
+//
int32x4_t test_vsliq_n_s32(int32x4_t a, int32x4_t b) {
return vsliq_n_s32(a, b, 1);
}
-// CHECK-LABEL: @test_vsliq_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[VSLI_N2:%.*]] = call <2 x i64> @llvm.arm.neon.vshiftins.v2i64(<2 x i64> [[VSLI_N]], <2 x i64> [[VSLI_N1]], <2 x i64> splat (i64 1))
-// CHECK: ret <2 x i64> [[VSLI_N2]]
+// CHECK-LABEL: define <2 x i64> @test_vsliq_n_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <2 x i64> @llvm.arm.neon.vshiftins.v2i64(<2 x i64> [[VSLI_N]], <2 x i64> [[VSLI_N1]], <2 x i64> splat (i64 1))
+// CHECK-NEXT: ret <2 x i64> [[VSLI_N2]]
+//
int64x2_t test_vsliq_n_s64(int64x2_t a, int64x2_t b) {
return vsliq_n_s64(a, b, 1);
}
-// CHECK-LABEL: @test_vsliq_n_u8(
-// CHECK: [[VSLI_N:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> splat (i8 1))
-// CHECK: ret <16 x i8> [[VSLI_N]]
+// CHECK-LABEL: define <16 x i8> @test_vsliq_n_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> splat (i8 1))
+// CHECK-NEXT: ret <16 x i8> [[VSLI_N]]
+//
uint8x16_t test_vsliq_n_u8(uint8x16_t a, uint8x16_t b) {
return vsliq_n_u8(a, b, 1);
}
-// CHECK-LABEL: @test_vsliq_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VSLI_N2:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], <8 x i16> splat (i16 1))
-// CHECK: ret <8 x i16> [[VSLI_N2]]
+// CHECK-LABEL: define <8 x i16> @test_vsliq_n_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], <8 x i16> splat (i16 1))
+// CHECK-NEXT: ret <8 x i16> [[VSLI_N2]]
+//
uint16x8_t test_vsliq_n_u16(uint16x8_t a, uint16x8_t b) {
return vsliq_n_u16(a, b, 1);
}
-// CHECK-LABEL: @test_vsliq_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VSLI_N2:%.*]] = call <4 x i32> @llvm.arm.neon.vshiftins.v4i32(<4 x i32> [[VSLI_N]], <4 x i32> [[VSLI_N1]], <4 x i32> splat (i32 1))
-// CHECK: ret <4 x i32> [[VSLI_N2]]
+// CHECK-LABEL: define <4 x i32> @test_vsliq_n_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <4 x i32> @llvm.arm.neon.vshiftins.v4i32(<4 x i32> [[VSLI_N]], <4 x i32> [[VSLI_N1]], <4 x i32> splat (i32 1))
+// CHECK-NEXT: ret <4 x i32> [[VSLI_N2]]
+//
uint32x4_t test_vsliq_n_u32(uint32x4_t a, uint32x4_t b) {
return vsliq_n_u32(a, b, 1);
}
-// CHECK-LABEL: @test_vsliq_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[VSLI_N2:%.*]] = call <2 x i64> @llvm.arm.neon.vshiftins.v2i64(<2 x i64> [[VSLI_N]], <2 x i64> [[VSLI_N1]], <2 x i64> splat (i64 1))
-// CHECK: ret <2 x i64> [[VSLI_N2]]
+// CHECK-LABEL: define <2 x i64> @test_vsliq_n_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <2 x i64> @llvm.arm.neon.vshiftins.v2i64(<2 x i64> [[VSLI_N]], <2 x i64> [[VSLI_N1]], <2 x i64> splat (i64 1))
+// CHECK-NEXT: ret <2 x i64> [[VSLI_N2]]
+//
uint64x2_t test_vsliq_n_u64(uint64x2_t a, uint64x2_t b) {
return vsliq_n_u64(a, b, 1);
}
-// CHECK-LABEL: @test_vsliq_n_p8(
-// CHECK: [[VSLI_N:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> splat (i8 1))
-// CHECK: ret <16 x i8> [[VSLI_N]]
+// CHECK-LABEL: define <16 x i8> @test_vsliq_n_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> splat (i8 1))
+// CHECK-NEXT: ret <16 x i8> [[VSLI_N]]
+//
poly8x16_t test_vsliq_n_p8(poly8x16_t a, poly8x16_t b) {
return vsliq_n_p8(a, b, 1);
}
-// CHECK-LABEL: @test_vsliq_n_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VSLI_N2:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], <8 x i16> splat (i16 1))
-// CHECK: ret <8 x i16> [[VSLI_N2]]
+// CHECK-LABEL: define <8 x i16> @test_vsliq_n_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], <8 x i16> splat (i16 1))
+// CHECK-NEXT: ret <8 x i16> [[VSLI_N2]]
+//
poly16x8_t test_vsliq_n_p16(poly16x8_t a, poly16x8_t b) {
return vsliq_n_p16(a, b, 1);
}
-// CHECK-LABEL: @test_vsra_n_s8(
-// CHECK: [[VSRA_N:%.*]] = ashr <8 x i8> %b, splat (i8 1)
-// CHECK: [[TMP0:%.*]] = add <8 x i8> %a, [[VSRA_N]]
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define <8 x i8> @test_vsra_n_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRA_N:%.*]] = ashr <8 x i8> [[B]], splat (i8 1)
+// CHECK-NEXT: [[TMP0:%.*]] = add <8 x i8> [[A]], [[VSRA_N]]
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
int8x8_t test_vsra_n_s8(int8x8_t a, int8x8_t b) {
return vsra_n_s8(a, b, 1);
}
-// CHECK-LABEL: @test_vsra_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VSRA_N:%.*]] = ashr <4 x i16> [[TMP3]], splat (i16 1)
-// CHECK: [[TMP4:%.*]] = add <4 x i16> [[TMP2]], [[VSRA_N]]
-// CHECK: ret <4 x i16> [[TMP4]]
+// CHECK-LABEL: define <4 x i16> @test_vsra_n_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VSRA_N:%.*]] = ashr <4 x i16> [[TMP3]], splat (i16 1)
+// CHECK-NEXT: [[TMP4:%.*]] = add <4 x i16> [[TMP2]], [[VSRA_N]]
+// CHECK-NEXT: ret <4 x i16> [[TMP4]]
+//
int16x4_t test_vsra_n_s16(int16x4_t a, int16x4_t b) {
return vsra_n_s16(a, b, 1);
}
-// CHECK-LABEL: @test_vsra_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VSRA_N:%.*]] = ashr <2 x i32> [[TMP3]], splat (i32 1)
-// CHECK: [[TMP4:%.*]] = add <2 x i32> [[TMP2]], [[VSRA_N]]
-// CHECK: ret <2 x i32> [[TMP4]]
+// CHECK-LABEL: define <2 x i32> @test_vsra_n_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VSRA_N:%.*]] = ashr <2 x i32> [[TMP3]], splat (i32 1)
+// CHECK-NEXT: [[TMP4:%.*]] = add <2 x i32> [[TMP2]], [[VSRA_N]]
+// CHECK-NEXT: ret <2 x i32> [[TMP4]]
+//
int32x2_t test_vsra_n_s32(int32x2_t a, int32x2_t b) {
return vsra_n_s32(a, b, 1);
}
-// CHECK-LABEL: @test_vsra_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[VSRA_N:%.*]] = ashr <1 x i64> [[TMP3]], splat (i64 1)
-// CHECK: [[TMP4:%.*]] = add <1 x i64> [[TMP2]], [[VSRA_N]]
-// CHECK: ret <1 x i64> [[TMP4]]
+// CHECK-LABEL: define <1 x i64> @test_vsra_n_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK-NEXT: [[VSRA_N:%.*]] = ashr <1 x i64> [[TMP3]], splat (i64 1)
+// CHECK-NEXT: [[TMP4:%.*]] = add <1 x i64> [[TMP2]], [[VSRA_N]]
+// CHECK-NEXT: ret <1 x i64> [[TMP4]]
+//
int64x1_t test_vsra_n_s64(int64x1_t a, int64x1_t b) {
return vsra_n_s64(a, b, 1);
}
-// CHECK-LABEL: @test_vsra_n_u8(
-// CHECK: [[VSRA_N:%.*]] = lshr <8 x i8> %b, splat (i8 1)
-// CHECK: [[TMP0:%.*]] = add <8 x i8> %a, [[VSRA_N]]
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define <8 x i8> @test_vsra_n_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRA_N:%.*]] = lshr <8 x i8> [[B]], splat (i8 1)
+// CHECK-NEXT: [[TMP0:%.*]] = add <8 x i8> [[A]], [[VSRA_N]]
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
uint8x8_t test_vsra_n_u8(uint8x8_t a, uint8x8_t b) {
return vsra_n_u8(a, b, 1);
}
-// CHECK-LABEL: @test_vsra_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VSRA_N:%.*]] = lshr <4 x i16> [[TMP3]], splat (i16 1)
-// CHECK: [[TMP4:%.*]] = add <4 x i16> [[TMP2]], [[VSRA_N]]
-// CHECK: ret <4 x i16> [[TMP4]]
+// CHECK-LABEL: define <4 x i16> @test_vsra_n_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VSRA_N:%.*]] = lshr <4 x i16> [[TMP3]], splat (i16 1)
+// CHECK-NEXT: [[TMP4:%.*]] = add <4 x i16> [[TMP2]], [[VSRA_N]]
+// CHECK-NEXT: ret <4 x i16> [[TMP4]]
+//
uint16x4_t test_vsra_n_u16(uint16x4_t a, uint16x4_t b) {
return vsra_n_u16(a, b, 1);
}
-// CHECK-LABEL: @test_vsra_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VSRA_N:%.*]] = lshr <2 x i32> [[TMP3]], splat (i32 1)
-// CHECK: [[TMP4:%.*]] = add <2 x i32> [[TMP2]], [[VSRA_N]]
-// CHECK: ret <2 x i32> [[TMP4]]
+// CHECK-LABEL: define <2 x i32> @test_vsra_n_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VSRA_N:%.*]] = lshr <2 x i32> [[TMP3]], splat (i32 1)
+// CHECK-NEXT: [[TMP4:%.*]] = add <2 x i32> [[TMP2]], [[VSRA_N]]
+// CHECK-NEXT: ret <2 x i32> [[TMP4]]
+//
uint32x2_t test_vsra_n_u32(uint32x2_t a, uint32x2_t b) {
return vsra_n_u32(a, b, 1);
}
-// CHECK-LABEL: @test_vsra_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[VSRA_N:%.*]] = lshr <1 x i64> [[TMP3]], splat (i64 1)
-// CHECK: [[TMP4:%.*]] = add <1 x i64> [[TMP2]], [[VSRA_N]]
-// CHECK: ret <1 x i64> [[TMP4]]
+// CHECK-LABEL: define <1 x i64> @test_vsra_n_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK-NEXT: [[VSRA_N:%.*]] = lshr <1 x i64> [[TMP3]], splat (i64 1)
+// CHECK-NEXT: [[TMP4:%.*]] = add <1 x i64> [[TMP2]], [[VSRA_N]]
+// CHECK-NEXT: ret <1 x i64> [[TMP4]]
+//
uint64x1_t test_vsra_n_u64(uint64x1_t a, uint64x1_t b) {
return vsra_n_u64(a, b, 1);
}
-// CHECK-LABEL: @test_vsraq_n_s8(
-// CHECK: [[VSRA_N:%.*]] = ashr <16 x i8> %b, splat (i8 1)
-// CHECK: [[TMP0:%.*]] = add <16 x i8> %a, [[VSRA_N]]
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define <16 x i8> @test_vsraq_n_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRA_N:%.*]] = ashr <16 x i8> [[B]], splat (i8 1)
+// CHECK-NEXT: [[TMP0:%.*]] = add <16 x i8> [[A]], [[VSRA_N]]
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
int8x16_t test_vsraq_n_s8(int8x16_t a, int8x16_t b) {
return vsraq_n_s8(a, b, 1);
}
-// CHECK-LABEL: @test_vsraq_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VSRA_N:%.*]] = ashr <8 x i16> [[TMP3]], splat (i16 1)
-// CHECK: [[TMP4:%.*]] = add <8 x i16> [[TMP2]], [[VSRA_N]]
-// CHECK: ret <8 x i16> [[TMP4]]
+// CHECK-LABEL: define <8 x i16> @test_vsraq_n_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VSRA_N:%.*]] = ashr <8 x i16> [[TMP3]], splat (i16 1)
+// CHECK-NEXT: [[TMP4:%.*]] = add <8 x i16> [[TMP2]], [[VSRA_N]]
+// CHECK-NEXT: ret <8 x i16> [[TMP4]]
+//
int16x8_t test_vsraq_n_s16(int16x8_t a, int16x8_t b) {
return vsraq_n_s16(a, b, 1);
}
-// CHECK-LABEL: @test_vsraq_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VSRA_N:%.*]] = ashr <4 x i32> [[TMP3]], splat (i32 1)
-// CHECK: [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[VSRA_N]]
-// CHECK: ret <4 x i32> [[TMP4]]
+// CHECK-LABEL: define <4 x i32> @test_vsraq_n_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VSRA_N:%.*]] = ashr <4 x i32> [[TMP3]], splat (i32 1)
+// CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[VSRA_N]]
+// CHECK-NEXT: ret <4 x i32> [[TMP4]]
+//
int32x4_t test_vsraq_n_s32(int32x4_t a, int32x4_t b) {
return vsraq_n_s32(a, b, 1);
}
-// CHECK-LABEL: @test_vsraq_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[VSRA_N:%.*]] = ashr <2 x i64> [[TMP3]], splat (i64 1)
-// CHECK: [[TMP4:%.*]] = add <2 x i64> [[TMP2]], [[VSRA_N]]
-// CHECK: ret <2 x i64> [[TMP4]]
+// CHECK-LABEL: define <2 x i64> @test_vsraq_n_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK-NEXT: [[VSRA_N:%.*]] = ashr <2 x i64> [[TMP3]], splat (i64 1)
+// CHECK-NEXT: [[TMP4:%.*]] = add <2 x i64> [[TMP2]], [[VSRA_N]]
+// CHECK-NEXT: ret <2 x i64> [[TMP4]]
+//
int64x2_t test_vsraq_n_s64(int64x2_t a, int64x2_t b) {
return vsraq_n_s64(a, b, 1);
}
-// CHECK-LABEL: @test_vsraq_n_u8(
-// CHECK: [[VSRA_N:%.*]] = lshr <16 x i8> %b, splat (i8 1)
-// CHECK: [[TMP0:%.*]] = add <16 x i8> %a, [[VSRA_N]]
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define <16 x i8> @test_vsraq_n_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRA_N:%.*]] = lshr <16 x i8> [[B]], splat (i8 1)
+// CHECK-NEXT: [[TMP0:%.*]] = add <16 x i8> [[A]], [[VSRA_N]]
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
uint8x16_t test_vsraq_n_u8(uint8x16_t a, uint8x16_t b) {
return vsraq_n_u8(a, b, 1);
}
-// CHECK-LABEL: @test_vsraq_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VSRA_N:%.*]] = lshr <8 x i16> [[TMP3]], splat (i16 1)
-// CHECK: [[TMP4:%.*]] = add <8 x i16> [[TMP2]], [[VSRA_N]]
-// CHECK: ret <8 x i16> [[TMP4]]
+// CHECK-LABEL: define <8 x i16> @test_vsraq_n_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VSRA_N:%.*]] = lshr <8 x i16> [[TMP3]], splat (i16 1)
+// CHECK-NEXT: [[TMP4:%.*]] = add <8 x i16> [[TMP2]], [[VSRA_N]]
+// CHECK-NEXT: ret <8 x i16> [[TMP4]]
+//
uint16x8_t test_vsraq_n_u16(uint16x8_t a, uint16x8_t b) {
return vsraq_n_u16(a, b, 1);
}
-// CHECK-LABEL: @test_vsraq_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VSRA_N:%.*]] = lshr <4 x i32> [[TMP3]], splat (i32 1)
-// CHECK: [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[VSRA_N]]
-// CHECK: ret <4 x i32> [[TMP4]]
+// CHECK-LABEL: define <4 x i32> @test_vsraq_n_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VSRA_N:%.*]] = lshr <4 x i32> [[TMP3]], splat (i32 1)
+// CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[VSRA_N]]
+// CHECK-NEXT: ret <4 x i32> [[TMP4]]
+//
uint32x4_t test_vsraq_n_u32(uint32x4_t a, uint32x4_t b) {
return vsraq_n_u32(a, b, 1);
}
-// CHECK-LABEL: @test_vsraq_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[VSRA_N:%.*]] = lshr <2 x i64> [[TMP3]], splat (i64 1)
-// CHECK: [[TMP4:%.*]] = add <2 x i64> [[TMP2]], [[VSRA_N]]
-// CHECK: ret <2 x i64> [[TMP4]]
+// CHECK-LABEL: define <2 x i64> @test_vsraq_n_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK-NEXT: [[VSRA_N:%.*]] = lshr <2 x i64> [[TMP3]], splat (i64 1)
+// CHECK-NEXT: [[TMP4:%.*]] = add <2 x i64> [[TMP2]], [[VSRA_N]]
+// CHECK-NEXT: ret <2 x i64> [[TMP4]]
+//
uint64x2_t test_vsraq_n_u64(uint64x2_t a, uint64x2_t b) {
return vsraq_n_u64(a, b, 1);
}
-// CHECK-LABEL: @test_vsri_n_s8(
-// CHECK: [[VSLI_N:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> splat (i8 -1))
-// CHECK: ret <8 x i8> [[VSLI_N]]
+// CHECK-LABEL: define <8 x i8> @test_vsri_n_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> [[A]], <8 x i8> [[B]], <8 x i8> splat (i8 -1))
+// CHECK-NEXT: ret <8 x i8> [[VSLI_N]]
+//
int8x8_t test_vsri_n_s8(int8x8_t a, int8x8_t b) {
return vsri_n_s8(a, b, 1);
}
-// CHECK-LABEL: @test_vsri_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VSLI_N2:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], <4 x i16> splat (i16 -1))
-// CHECK: ret <4 x i16> [[VSLI_N2]]
+// CHECK-LABEL: define <4 x i16> @test_vsri_n_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], <4 x i16> splat (i16 -1))
+// CHECK-NEXT: ret <4 x i16> [[VSLI_N2]]
+//
int16x4_t test_vsri_n_s16(int16x4_t a, int16x4_t b) {
return vsri_n_s16(a, b, 1);
}
-// CHECK-LABEL: @test_vsri_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VSLI_N2:%.*]] = call <2 x i32> @llvm.arm.neon.vshiftins.v2i32(<2 x i32> [[VSLI_N]], <2 x i32> [[VSLI_N1]], <2 x i32> splat (i32 -1))
-// CHECK: ret <2 x i32> [[VSLI_N2]]
+// CHECK-LABEL: define <2 x i32> @test_vsri_n_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <2 x i32> @llvm.arm.neon.vshiftins.v2i32(<2 x i32> [[VSLI_N]], <2 x i32> [[VSLI_N1]], <2 x i32> splat (i32 -1))
+// CHECK-NEXT: ret <2 x i32> [[VSLI_N2]]
+//
int32x2_t test_vsri_n_s32(int32x2_t a, int32x2_t b) {
return vsri_n_s32(a, b, 1);
}
-// CHECK-LABEL: @test_vsri_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[VSLI_N2:%.*]] = call <1 x i64> @llvm.arm.neon.vshiftins.v1i64(<1 x i64> [[VSLI_N]], <1 x i64> [[VSLI_N1]], <1 x i64> splat (i64 -1))
-// CHECK: ret <1 x i64> [[VSLI_N2]]
+// CHECK-LABEL: define <1 x i64> @test_vsri_n_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <1 x i64> @llvm.arm.neon.vshiftins.v1i64(<1 x i64> [[VSLI_N]], <1 x i64> [[VSLI_N1]], <1 x i64> splat (i64 -1))
+// CHECK-NEXT: ret <1 x i64> [[VSLI_N2]]
+//
int64x1_t test_vsri_n_s64(int64x1_t a, int64x1_t b) {
return vsri_n_s64(a, b, 1);
}
-// CHECK-LABEL: @test_vsri_n_u8(
-// CHECK: [[VSLI_N:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> splat (i8 -1))
-// CHECK: ret <8 x i8> [[VSLI_N]]
+// CHECK-LABEL: define <8 x i8> @test_vsri_n_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> [[A]], <8 x i8> [[B]], <8 x i8> splat (i8 -1))
+// CHECK-NEXT: ret <8 x i8> [[VSLI_N]]
+//
uint8x8_t test_vsri_n_u8(uint8x8_t a, uint8x8_t b) {
return vsri_n_u8(a, b, 1);
}
-// CHECK-LABEL: @test_vsri_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VSLI_N2:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], <4 x i16> splat (i16 -1))
-// CHECK: ret <4 x i16> [[VSLI_N2]]
+// CHECK-LABEL: define <4 x i16> @test_vsri_n_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], <4 x i16> splat (i16 -1))
+// CHECK-NEXT: ret <4 x i16> [[VSLI_N2]]
+//
uint16x4_t test_vsri_n_u16(uint16x4_t a, uint16x4_t b) {
return vsri_n_u16(a, b, 1);
}
-// CHECK-LABEL: @test_vsri_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VSLI_N2:%.*]] = call <2 x i32> @llvm.arm.neon.vshiftins.v2i32(<2 x i32> [[VSLI_N]], <2 x i32> [[VSLI_N1]], <2 x i32> splat (i32 -1))
-// CHECK: ret <2 x i32> [[VSLI_N2]]
+// CHECK-LABEL: define <2 x i32> @test_vsri_n_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <2 x i32> @llvm.arm.neon.vshiftins.v2i32(<2 x i32> [[VSLI_N]], <2 x i32> [[VSLI_N1]], <2 x i32> splat (i32 -1))
+// CHECK-NEXT: ret <2 x i32> [[VSLI_N2]]
+//
uint32x2_t test_vsri_n_u32(uint32x2_t a, uint32x2_t b) {
return vsri_n_u32(a, b, 1);
}
-// CHECK-LABEL: @test_vsri_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[VSLI_N2:%.*]] = call <1 x i64> @llvm.arm.neon.vshiftins.v1i64(<1 x i64> [[VSLI_N]], <1 x i64> [[VSLI_N1]], <1 x i64> splat (i64 -1))
-// CHECK: ret <1 x i64> [[VSLI_N2]]
+// CHECK-LABEL: define <1 x i64> @test_vsri_n_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <1 x i64> @llvm.arm.neon.vshiftins.v1i64(<1 x i64> [[VSLI_N]], <1 x i64> [[VSLI_N1]], <1 x i64> splat (i64 -1))
+// CHECK-NEXT: ret <1 x i64> [[VSLI_N2]]
+//
uint64x1_t test_vsri_n_u64(uint64x1_t a, uint64x1_t b) {
return vsri_n_u64(a, b, 1);
}
-// CHECK-LABEL: @test_vsri_n_p8(
-// CHECK: [[VSLI_N:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> splat (i8 -1))
-// CHECK: ret <8 x i8> [[VSLI_N]]
+// CHECK-LABEL: define <8 x i8> @test_vsri_n_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> [[A]], <8 x i8> [[B]], <8 x i8> splat (i8 -1))
+// CHECK-NEXT: ret <8 x i8> [[VSLI_N]]
+//
poly8x8_t test_vsri_n_p8(poly8x8_t a, poly8x8_t b) {
return vsri_n_p8(a, b, 1);
}
-// CHECK-LABEL: @test_vsri_n_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VSLI_N2:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], <4 x i16> splat (i16 -1))
-// CHECK: ret <4 x i16> [[VSLI_N2]]
+// CHECK-LABEL: define <4 x i16> @test_vsri_n_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], <4 x i16> splat (i16 -1))
+// CHECK-NEXT: ret <4 x i16> [[VSLI_N2]]
+//
poly16x4_t test_vsri_n_p16(poly16x4_t a, poly16x4_t b) {
return vsri_n_p16(a, b, 1);
}
-// CHECK-LABEL: @test_vsriq_n_s8(
-// CHECK: [[VSLI_N:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> splat (i8 -1))
-// CHECK: ret <16 x i8> [[VSLI_N]]
+// CHECK-LABEL: define <16 x i8> @test_vsriq_n_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> splat (i8 -1))
+// CHECK-NEXT: ret <16 x i8> [[VSLI_N]]
+//
int8x16_t test_vsriq_n_s8(int8x16_t a, int8x16_t b) {
return vsriq_n_s8(a, b, 1);
}
-// CHECK-LABEL: @test_vsriq_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VSLI_N2:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], <8 x i16> splat (i16 -1))
-// CHECK: ret <8 x i16> [[VSLI_N2]]
+// CHECK-LABEL: define <8 x i16> @test_vsriq_n_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], <8 x i16> splat (i16 -1))
+// CHECK-NEXT: ret <8 x i16> [[VSLI_N2]]
+//
int16x8_t test_vsriq_n_s16(int16x8_t a, int16x8_t b) {
return vsriq_n_s16(a, b, 1);
}
-// CHECK-LABEL: @test_vsriq_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VSLI_N2:%.*]] = call <4 x i32> @llvm.arm.neon.vshiftins.v4i32(<4 x i32> [[VSLI_N]], <4 x i32> [[VSLI_N1]], <4 x i32> splat (i32 -1))
-// CHECK: ret <4 x i32> [[VSLI_N2]]
+// CHECK-LABEL: define <4 x i32> @test_vsriq_n_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <4 x i32> @llvm.arm.neon.vshiftins.v4i32(<4 x i32> [[VSLI_N]], <4 x i32> [[VSLI_N1]], <4 x i32> splat (i32 -1))
+// CHECK-NEXT: ret <4 x i32> [[VSLI_N2]]
+//
int32x4_t test_vsriq_n_s32(int32x4_t a, int32x4_t b) {
return vsriq_n_s32(a, b, 1);
}
-// CHECK-LABEL: @test_vsriq_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[VSLI_N2:%.*]] = call <2 x i64> @llvm.arm.neon.vshiftins.v2i64(<2 x i64> [[VSLI_N]], <2 x i64> [[VSLI_N1]], <2 x i64> splat (i64 -1))
-// CHECK: ret <2 x i64> [[VSLI_N2]]
+// CHECK-LABEL: define <2 x i64> @test_vsriq_n_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <2 x i64> @llvm.arm.neon.vshiftins.v2i64(<2 x i64> [[VSLI_N]], <2 x i64> [[VSLI_N1]], <2 x i64> splat (i64 -1))
+// CHECK-NEXT: ret <2 x i64> [[VSLI_N2]]
+//
int64x2_t test_vsriq_n_s64(int64x2_t a, int64x2_t b) {
return vsriq_n_s64(a, b, 1);
}
-// CHECK-LABEL: @test_vsriq_n_u8(
-// CHECK: [[VSLI_N:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> splat (i8 -1))
-// CHECK: ret <16 x i8> [[VSLI_N]]
+// CHECK-LABEL: define <16 x i8> @test_vsriq_n_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> splat (i8 -1))
+// CHECK-NEXT: ret <16 x i8> [[VSLI_N]]
+//
uint8x16_t test_vsriq_n_u8(uint8x16_t a, uint8x16_t b) {
return vsriq_n_u8(a, b, 1);
}
-// CHECK-LABEL: @test_vsriq_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VSLI_N2:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], <8 x i16> splat (i16 -1))
-// CHECK: ret <8 x i16> [[VSLI_N2]]
+// CHECK-LABEL: define <8 x i16> @test_vsriq_n_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], <8 x i16> splat (i16 -1))
+// CHECK-NEXT: ret <8 x i16> [[VSLI_N2]]
+//
uint16x8_t test_vsriq_n_u16(uint16x8_t a, uint16x8_t b) {
return vsriq_n_u16(a, b, 1);
}
-// CHECK-LABEL: @test_vsriq_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VSLI_N2:%.*]] = call <4 x i32> @llvm.arm.neon.vshiftins.v4i32(<4 x i32> [[VSLI_N]], <4 x i32> [[VSLI_N1]], <4 x i32> splat (i32 -1))
-// CHECK: ret <4 x i32> [[VSLI_N2]]
+// CHECK-LABEL: define <4 x i32> @test_vsriq_n_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <4 x i32> @llvm.arm.neon.vshiftins.v4i32(<4 x i32> [[VSLI_N]], <4 x i32> [[VSLI_N1]], <4 x i32> splat (i32 -1))
+// CHECK-NEXT: ret <4 x i32> [[VSLI_N2]]
+//
uint32x4_t test_vsriq_n_u32(uint32x4_t a, uint32x4_t b) {
return vsriq_n_u32(a, b, 1);
}
-// CHECK-LABEL: @test_vsriq_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[VSLI_N2:%.*]] = call <2 x i64> @llvm.arm.neon.vshiftins.v2i64(<2 x i64> [[VSLI_N]], <2 x i64> [[VSLI_N1]], <2 x i64> splat (i64 -1))
-// CHECK: ret <2 x i64> [[VSLI_N2]]
+// CHECK-LABEL: define <2 x i64> @test_vsriq_n_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <2 x i64> @llvm.arm.neon.vshiftins.v2i64(<2 x i64> [[VSLI_N]], <2 x i64> [[VSLI_N1]], <2 x i64> splat (i64 -1))
+// CHECK-NEXT: ret <2 x i64> [[VSLI_N2]]
+//
uint64x2_t test_vsriq_n_u64(uint64x2_t a, uint64x2_t b) {
return vsriq_n_u64(a, b, 1);
}
-// CHECK-LABEL: @test_vsriq_n_p8(
-// CHECK: [[VSLI_N:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> splat (i8 -1))
-// CHECK: ret <16 x i8> [[VSLI_N]]
+// CHECK-LABEL: define <16 x i8> @test_vsriq_n_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> splat (i8 -1))
+// CHECK-NEXT: ret <16 x i8> [[VSLI_N]]
+//
poly8x16_t test_vsriq_n_p8(poly8x16_t a, poly8x16_t b) {
return vsriq_n_p8(a, b, 1);
}
-// CHECK-LABEL: @test_vsriq_n_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VSLI_N2:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], <8 x i16> splat (i16 -1))
-// CHECK: ret <8 x i16> [[VSLI_N2]]
+// CHECK-LABEL: define <8 x i16> @test_vsriq_n_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], <8 x i16> splat (i16 -1))
+// CHECK-NEXT: ret <8 x i16> [[VSLI_N2]]
+//
poly16x8_t test_vsriq_n_p16(poly16x8_t a, poly16x8_t b) {
return vsriq_n_p16(a, b, 1);
}
-// CHECK-LABEL: @test_vst1q_u8(
-// CHECK: call void @llvm.arm.neon.vst1.p0.v16i8(ptr %a, <16 x i8> %b, i32 1)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst1q_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v16i8(ptr [[A]], <16 x i8> [[B]], i32 1)
+// CHECK-NEXT: ret void
+//
void test_vst1q_u8(uint8_t * a, uint8x16_t b) {
vst1q_u8(a, b);
}
-// CHECK-LABEL: @test_vst1q_u16(
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: call void @llvm.arm.neon.vst1.p0.v8i16(ptr %a, <8 x i16> [[TMP2]], i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst1q_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v8i16(ptr [[A]], <8 x i16> [[TMP1]], i32 2)
+// CHECK-NEXT: ret void
+//
void test_vst1q_u16(uint16_t * a, uint16x8_t b) {
vst1q_u16(a, b);
}
-// CHECK-LABEL: @test_vst1q_u32(
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: call void @llvm.arm.neon.vst1.p0.v4i32(ptr %a, <4 x i32> [[TMP2]], i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst1q_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v4i32(ptr [[A]], <4 x i32> [[TMP1]], i32 4)
+// CHECK-NEXT: ret void
+//
void test_vst1q_u32(uint32_t * a, uint32x4_t b) {
vst1q_u32(a, b);
}
-// CHECK-LABEL: @test_vst1q_u64(
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: call void @llvm.arm.neon.vst1.p0.v2i64(ptr %a, <2 x i64> [[TMP2]], i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst1q_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v2i64(ptr [[A]], <2 x i64> [[TMP1]], i32 4)
+// CHECK-NEXT: ret void
+//
void test_vst1q_u64(uint64_t * a, uint64x2_t b) {
vst1q_u64(a, b);
}
-// CHECK-LABEL: @test_vst1q_s8(
-// CHECK: call void @llvm.arm.neon.vst1.p0.v16i8(ptr %a, <16 x i8> %b, i32 1)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst1q_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v16i8(ptr [[A]], <16 x i8> [[B]], i32 1)
+// CHECK-NEXT: ret void
+//
void test_vst1q_s8(int8_t * a, int8x16_t b) {
vst1q_s8(a, b);
}
-// CHECK-LABEL: @test_vst1q_s16(
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: call void @llvm.arm.neon.vst1.p0.v8i16(ptr %a, <8 x i16> [[TMP2]], i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst1q_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v8i16(ptr [[A]], <8 x i16> [[TMP1]], i32 2)
+// CHECK-NEXT: ret void
+//
void test_vst1q_s16(int16_t * a, int16x8_t b) {
vst1q_s16(a, b);
}
-// CHECK-LABEL: @test_vst1q_s32(
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: call void @llvm.arm.neon.vst1.p0.v4i32(ptr %a, <4 x i32> [[TMP2]], i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst1q_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v4i32(ptr [[A]], <4 x i32> [[TMP1]], i32 4)
+// CHECK-NEXT: ret void
+//
void test_vst1q_s32(int32_t * a, int32x4_t b) {
vst1q_s32(a, b);
}
-// CHECK-LABEL: @test_vst1q_s64(
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: call void @llvm.arm.neon.vst1.p0.v2i64(ptr %a, <2 x i64> [[TMP2]], i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst1q_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v2i64(ptr [[A]], <2 x i64> [[TMP1]], i32 4)
+// CHECK-NEXT: ret void
+//
void test_vst1q_s64(int64_t * a, int64x2_t b) {
vst1q_s64(a, b);
}
-// CHECK-LABEL: @test_vst1q_f16(
-// CHECK: [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
-// CHECK: call void @llvm.arm.neon.vst1.p0.v8f16(ptr %a, <8 x half> [[TMP2]], i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst1q_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
+// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v8f16(ptr [[A]], <8 x half> [[TMP1]], i32 2)
+// CHECK-NEXT: ret void
+//
void test_vst1q_f16(float16_t * a, float16x8_t b) {
vst1q_f16(a, b);
}
-// CHECK-LABEL: @test_vst1q_f32(
-// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
-// CHECK: call void @llvm.arm.neon.vst1.p0.v4f32(ptr %a, <4 x float> [[TMP2]], i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst1q_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v4f32(ptr [[A]], <4 x float> [[TMP1]], i32 4)
+// CHECK-NEXT: ret void
+//
void test_vst1q_f32(float32_t * a, float32x4_t b) {
vst1q_f32(a, b);
}
-// CHECK-LABEL: @test_vst1q_p8(
-// CHECK: call void @llvm.arm.neon.vst1.p0.v16i8(ptr %a, <16 x i8> %b, i32 1)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst1q_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v16i8(ptr [[A]], <16 x i8> [[B]], i32 1)
+// CHECK-NEXT: ret void
+//
void test_vst1q_p8(poly8_t * a, poly8x16_t b) {
vst1q_p8(a, b);
}
-// CHECK-LABEL: @test_vst1q_p16(
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: call void @llvm.arm.neon.vst1.p0.v8i16(ptr %a, <8 x i16> [[TMP2]], i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst1q_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v8i16(ptr [[A]], <8 x i16> [[TMP1]], i32 2)
+// CHECK-NEXT: ret void
+//
void test_vst1q_p16(poly16_t * a, poly16x8_t b) {
vst1q_p16(a, b);
}
-// CHECK-LABEL: @test_vst1_u8(
-// CHECK: call void @llvm.arm.neon.vst1.p0.v8i8(ptr %a, <8 x i8> %b, i32 1)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst1_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v8i8(ptr [[A]], <8 x i8> [[B]], i32 1)
+// CHECK-NEXT: ret void
+//
void test_vst1_u8(uint8_t * a, uint8x8_t b) {
vst1_u8(a, b);
}
-// CHECK-LABEL: @test_vst1_u16(
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: call void @llvm.arm.neon.vst1.p0.v4i16(ptr %a, <4 x i16> [[TMP2]], i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst1_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v4i16(ptr [[A]], <4 x i16> [[TMP1]], i32 2)
+// CHECK-NEXT: ret void
+//
void test_vst1_u16(uint16_t * a, uint16x4_t b) {
vst1_u16(a, b);
}
-// CHECK-LABEL: @test_vst1_u32(
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: call void @llvm.arm.neon.vst1.p0.v2i32(ptr %a, <2 x i32> [[TMP2]], i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst1_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v2i32(ptr [[A]], <2 x i32> [[TMP1]], i32 4)
+// CHECK-NEXT: ret void
+//
void test_vst1_u32(uint32_t * a, uint32x2_t b) {
vst1_u32(a, b);
}
-// CHECK-LABEL: @test_vst1_u64(
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: call void @llvm.arm.neon.vst1.p0.v1i64(ptr %a, <1 x i64> [[TMP2]], i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst1_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v1i64(ptr [[A]], <1 x i64> [[TMP1]], i32 4)
+// CHECK-NEXT: ret void
+//
void test_vst1_u64(uint64_t * a, uint64x1_t b) {
vst1_u64(a, b);
}
-// CHECK-LABEL: @test_vst1_s8(
-// CHECK: call void @llvm.arm.neon.vst1.p0.v8i8(ptr %a, <8 x i8> %b, i32 1)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst1_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v8i8(ptr [[A]], <8 x i8> [[B]], i32 1)
+// CHECK-NEXT: ret void
+//
void test_vst1_s8(int8_t * a, int8x8_t b) {
vst1_s8(a, b);
}
-// CHECK-LABEL: @test_vst1_s16(
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: call void @llvm.arm.neon.vst1.p0.v4i16(ptr %a, <4 x i16> [[TMP2]], i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst1_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v4i16(ptr [[A]], <4 x i16> [[TMP1]], i32 2)
+// CHECK-NEXT: ret void
+//
void test_vst1_s16(int16_t * a, int16x4_t b) {
vst1_s16(a, b);
}
-// CHECK-LABEL: @test_vst1_s32(
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: call void @llvm.arm.neon.vst1.p0.v2i32(ptr %a, <2 x i32> [[TMP2]], i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst1_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v2i32(ptr [[A]], <2 x i32> [[TMP1]], i32 4)
+// CHECK-NEXT: ret void
+//
void test_vst1_s32(int32_t * a, int32x2_t b) {
vst1_s32(a, b);
}
-// CHECK-LABEL: @test_vst1_s64(
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: call void @llvm.arm.neon.vst1.p0.v1i64(ptr %a, <1 x i64> [[TMP2]], i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst1_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v1i64(ptr [[A]], <1 x i64> [[TMP1]], i32 4)
+// CHECK-NEXT: ret void
+//
void test_vst1_s64(int64_t * a, int64x1_t b) {
vst1_s64(a, b);
}
-// CHECK-LABEL: @test_vst1_f16(
-// CHECK: [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
-// CHECK: call void @llvm.arm.neon.vst1.p0.v4f16(ptr %a, <4 x half> [[TMP2]], i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst1_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
+// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v4f16(ptr [[A]], <4 x half> [[TMP1]], i32 2)
+// CHECK-NEXT: ret void
+//
void test_vst1_f16(float16_t * a, float16x4_t b) {
vst1_f16(a, b);
}
-// CHECK-LABEL: @test_vst1_f32(
-// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
-// CHECK: call void @llvm.arm.neon.vst1.p0.v2f32(ptr %a, <2 x float> [[TMP2]], i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst1_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v2f32(ptr [[A]], <2 x float> [[TMP1]], i32 4)
+// CHECK-NEXT: ret void
+//
void test_vst1_f32(float32_t * a, float32x2_t b) {
vst1_f32(a, b);
}
-// CHECK-LABEL: @test_vst1_p8(
-// CHECK: call void @llvm.arm.neon.vst1.p0.v8i8(ptr %a, <8 x i8> %b, i32 1)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst1_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v8i8(ptr [[A]], <8 x i8> [[B]], i32 1)
+// CHECK-NEXT: ret void
+//
void test_vst1_p8(poly8_t * a, poly8x8_t b) {
vst1_p8(a, b);
}
-// CHECK-LABEL: @test_vst1_p16(
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: call void @llvm.arm.neon.vst1.p0.v4i16(ptr %a, <4 x i16> [[TMP2]], i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst1_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v4i16(ptr [[A]], <4 x i16> [[TMP1]], i32 2)
+// CHECK-NEXT: ret void
+//
void test_vst1_p16(poly16_t * a, poly16x4_t b) {
vst1_p16(a, b);
}
-// CHECK-LABEL: @test_vst1q_lane_u8(
-// CHECK: [[TMP0:%.*]] = extractelement <16 x i8> %b, i32 15
-// CHECK: store i8 [[TMP0]], ptr %a, align 1
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst1q_lane_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <16 x i8> [[B]], i32 15
+// CHECK-NEXT: store i8 [[TMP0]], ptr [[A]], align 1
+// CHECK-NEXT: ret void
+//
void test_vst1q_lane_u8(uint8_t * a, uint8x16_t b) {
vst1q_lane_u8(a, b, 15);
}
-// CHECK-LABEL: @test_vst1q_lane_u16(
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[TMP3:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7
-// CHECK: store i16 [[TMP3]], ptr %a, align 2
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst1q_lane_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
+// CHECK-NEXT: store i16 [[TMP2]], ptr [[A]], align 2
+// CHECK-NEXT: ret void
+//
void test_vst1q_lane_u16(uint16_t * a, uint16x8_t b) {
vst1q_lane_u16(a, b, 7);
}
-// CHECK-LABEL: @test_vst1q_lane_u32(
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
-// CHECK: store i32 [[TMP3]], ptr %a, align 4
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst1q_lane_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
+// CHECK-NEXT: store i32 [[TMP2]], ptr [[A]], align 4
+// CHECK-NEXT: ret void
+//
void test_vst1q_lane_u32(uint32_t * a, uint32x4_t b) {
vst1q_lane_u32(a, b, 3);
}
-// CHECK-LABEL: @test_vst1q_lane_u64(
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP2]], <1 x i32> <i32 1>
-// CHECK: call void @llvm.arm.neon.vst1.p0.v1i64(ptr %a, <1 x i64> [[TMP3]], i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst1q_lane_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP1]], <1 x i32> <i32 1>
+// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v1i64(ptr [[A]], <1 x i64> [[TMP2]], i32 4)
+// CHECK-NEXT: ret void
+//
void test_vst1q_lane_u64(uint64_t * a, uint64x2_t b) {
vst1q_lane_u64(a, b, 1);
}
-// CHECK-LABEL: @test_vst1q_lane_s8(
-// CHECK: [[TMP0:%.*]] = extractelement <16 x i8> %b, i32 15
-// CHECK: store i8 [[TMP0]], ptr %a, align 1
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst1q_lane_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <16 x i8> [[B]], i32 15
+// CHECK-NEXT: store i8 [[TMP0]], ptr [[A]], align 1
+// CHECK-NEXT: ret void
+//
void test_vst1q_lane_s8(int8_t * a, int8x16_t b) {
vst1q_lane_s8(a, b, 15);
}
-// CHECK-LABEL: @test_vst1q_lane_s16(
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[TMP3:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7
-// CHECK: store i16 [[TMP3]], ptr %a, align 2
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst1q_lane_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
+// CHECK-NEXT: store i16 [[TMP2]], ptr [[A]], align 2
+// CHECK-NEXT: ret void
+//
void test_vst1q_lane_s16(int16_t * a, int16x8_t b) {
vst1q_lane_s16(a, b, 7);
}
-// CHECK-LABEL: @test_vst1q_lane_s32(
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
-// CHECK: store i32 [[TMP3]], ptr %a, align 4
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst1q_lane_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
+// CHECK-NEXT: store i32 [[TMP2]], ptr [[A]], align 4
+// CHECK-NEXT: ret void
+//
void test_vst1q_lane_s32(int32_t * a, int32x4_t b) {
vst1q_lane_s32(a, b, 3);
}
-// CHECK-LABEL: @test_vst1q_lane_s64(
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP2]], <1 x i32> <i32 1>
-// CHECK: call void @llvm.arm.neon.vst1.p0.v1i64(ptr %a, <1 x i64> [[TMP3]], i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst1q_lane_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP1]], <1 x i32> <i32 1>
+// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v1i64(ptr [[A]], <1 x i64> [[TMP2]], i32 4)
+// CHECK-NEXT: ret void
+//
void test_vst1q_lane_s64(int64_t * a, int64x2_t b) {
vst1q_lane_s64(a, b, 1);
}
-// CHECK-LABEL: @test_vst1q_lane_f16(
-// CHECK: [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
-// CHECK: [[TMP3:%.*]] = extractelement <8 x half> [[TMP2]], i32 7
-// CHECK: store half [[TMP3]], ptr %a, align 2
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst1q_lane_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
+// CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x half> [[TMP1]], i32 7
+// CHECK-NEXT: store half [[TMP2]], ptr [[A]], align 2
+// CHECK-NEXT: ret void
+//
void test_vst1q_lane_f16(float16_t * a, float16x8_t b) {
vst1q_lane_f16(a, b, 7);
}
-// CHECK-LABEL: @test_vst1q_lane_f32(
-// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
-// CHECK: [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
-// CHECK: store float [[TMP3]], ptr %a, align 4
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst1q_lane_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
+// CHECK-NEXT: store float [[TMP2]], ptr [[A]], align 4
+// CHECK-NEXT: ret void
+//
void test_vst1q_lane_f32(float32_t * a, float32x4_t b) {
vst1q_lane_f32(a, b, 3);
}
-// CHECK-LABEL: @test_vst1q_lane_p8(
-// CHECK: [[TMP0:%.*]] = extractelement <16 x i8> %b, i32 15
-// CHECK: store i8 [[TMP0]], ptr %a, align 1
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst1q_lane_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <16 x i8> [[B]], i32 15
+// CHECK-NEXT: store i8 [[TMP0]], ptr [[A]], align 1
+// CHECK-NEXT: ret void
+//
void test_vst1q_lane_p8(poly8_t * a, poly8x16_t b) {
vst1q_lane_p8(a, b, 15);
}
-// CHECK-LABEL: @test_vst1q_lane_p16(
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[TMP3:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7
-// CHECK: store i16 [[TMP3]], ptr %a, align 2
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst1q_lane_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
+// CHECK-NEXT: store i16 [[TMP2]], ptr [[A]], align 2
+// CHECK-NEXT: ret void
+//
void test_vst1q_lane_p16(poly16_t * a, poly16x8_t b) {
vst1q_lane_p16(a, b, 7);
}
-// CHECK-LABEL: @test_vst1_lane_u8(
-// CHECK: [[TMP0:%.*]] = extractelement <8 x i8> %b, i32 7
-// CHECK: store i8 [[TMP0]], ptr %a, align 1
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst1_lane_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <8 x i8> [[B]], i32 7
+// CHECK-NEXT: store i8 [[TMP0]], ptr [[A]], align 1
+// CHECK-NEXT: ret void
+//
void test_vst1_lane_u8(uint8_t * a, uint8x8_t b) {
vst1_lane_u8(a, b, 7);
}
-// CHECK-LABEL: @test_vst1_lane_u16(
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
-// CHECK: store i16 [[TMP3]], ptr %a, align 2
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst1_lane_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
+// CHECK-NEXT: store i16 [[TMP2]], ptr [[A]], align 2
+// CHECK-NEXT: ret void
+//
void test_vst1_lane_u16(uint16_t * a, uint16x4_t b) {
vst1_lane_u16(a, b, 3);
}
-// CHECK-LABEL: @test_vst1_lane_u32(
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
-// CHECK: store i32 [[TMP3]], ptr %a, align 4
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst1_lane_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+// CHECK-NEXT: store i32 [[TMP2]], ptr [[A]], align 4
+// CHECK-NEXT: ret void
+//
void test_vst1_lane_u32(uint32_t * a, uint32x2_t b) {
vst1_lane_u32(a, b, 1);
}
-// CHECK-LABEL: @test_vst1_lane_u64(
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[TMP3:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0
-// CHECK: store i64 [[TMP3]], ptr %a, align 4
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst1_lane_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0
+// CHECK-NEXT: store i64 [[TMP2]], ptr [[A]], align 4
+// CHECK-NEXT: ret void
+//
void test_vst1_lane_u64(uint64_t * a, uint64x1_t b) {
vst1_lane_u64(a, b, 0);
}
-// CHECK-LABEL: @test_vst1_lane_s8(
-// CHECK: [[TMP0:%.*]] = extractelement <8 x i8> %b, i32 7
-// CHECK: store i8 [[TMP0]], ptr %a, align 1
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst1_lane_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <8 x i8> [[B]], i32 7
+// CHECK-NEXT: store i8 [[TMP0]], ptr [[A]], align 1
+// CHECK-NEXT: ret void
+//
void test_vst1_lane_s8(int8_t * a, int8x8_t b) {
vst1_lane_s8(a, b, 7);
}
-// CHECK-LABEL: @test_vst1_lane_s16(
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
-// CHECK: store i16 [[TMP3]], ptr %a, align 2
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst1_lane_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
+// CHECK-NEXT: store i16 [[TMP2]], ptr [[A]], align 2
+// CHECK-NEXT: ret void
+//
void test_vst1_lane_s16(int16_t * a, int16x4_t b) {
vst1_lane_s16(a, b, 3);
}
-// CHECK-LABEL: @test_vst1_lane_s32(
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
-// CHECK: store i32 [[TMP3]], ptr %a, align 4
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst1_lane_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+// CHECK-NEXT: store i32 [[TMP2]], ptr [[A]], align 4
+// CHECK-NEXT: ret void
+//
void test_vst1_lane_s32(int32_t * a, int32x2_t b) {
vst1_lane_s32(a, b, 1);
}
-// CHECK-LABEL: @test_vst1_lane_s64(
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[TMP3:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0
-// CHECK: store i64 [[TMP3]], ptr %a, align 4
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst1_lane_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0
+// CHECK-NEXT: store i64 [[TMP2]], ptr [[A]], align 4
+// CHECK-NEXT: ret void
+//
void test_vst1_lane_s64(int64_t * a, int64x1_t b) {
vst1_lane_s64(a, b, 0);
}
-// CHECK-LABEL: @test_vst1_lane_f16(
-// CHECK: [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
-// CHECK: [[TMP3:%.*]] = extractelement <4 x half> [[TMP2]], i32 3
-// CHECK: store half [[TMP3]], ptr %a, align 2
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst1_lane_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
+// CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x half> [[TMP1]], i32 3
+// CHECK-NEXT: store half [[TMP2]], ptr [[A]], align 2
+// CHECK-NEXT: ret void
+//
void test_vst1_lane_f16(float16_t * a, float16x4_t b) {
vst1_lane_f16(a, b, 3);
}
-// CHECK-LABEL: @test_vst1_lane_f32(
-// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
-// CHECK: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
-// CHECK: store float [[TMP3]], ptr %a, align 4
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst1_lane_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
+// CHECK-NEXT: store float [[TMP2]], ptr [[A]], align 4
+// CHECK-NEXT: ret void
+//
void test_vst1_lane_f32(float32_t * a, float32x2_t b) {
vst1_lane_f32(a, b, 1);
}
-// CHECK-LABEL: @test_vst1_lane_p8(
-// CHECK: [[TMP0:%.*]] = extractelement <8 x i8> %b, i32 7
-// CHECK: store i8 [[TMP0]], ptr %a, align 1
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst1_lane_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <8 x i8> [[B]], i32 7
+// CHECK-NEXT: store i8 [[TMP0]], ptr [[A]], align 1
+// CHECK-NEXT: ret void
+//
void test_vst1_lane_p8(poly8_t * a, poly8x8_t b) {
vst1_lane_p8(a, b, 7);
}
-// CHECK-LABEL: @test_vst1_lane_p16(
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
-// CHECK: store i16 [[TMP3]], ptr %a, align 2
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst1_lane_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
+// CHECK-NEXT: store i16 [[TMP2]], ptr [[A]], align 2
+// CHECK-NEXT: ret void
+//
void test_vst1_lane_p16(poly16_t * a, poly16x4_t b) {
vst1_lane_p16(a, b, 3);
}
-// CHECK-LABEL: @test_vst2q_u8(
-// CHECK: [[B:%.*]] = alloca %struct.uint8x16x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x16x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x16x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x16x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK: call void @llvm.arm.neon.vst2.p0.v16i8(ptr %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i32 1)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst2q_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT8X16X2_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT8X16X2_T]], align 16
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X16X2_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X16X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X16X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v16i8(ptr [[A]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 1)
+// CHECK-NEXT: ret void
+//
void test_vst2q_u8(uint8_t * a, uint8x16x2_t b) {
vst2q_u8(a, b);
}
-// CHECK-LABEL: @test_vst2q_u16(
-// CHECK: [[B:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
-// CHECK: call void @llvm.arm.neon.vst2.p0.v8i16(ptr %a, <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst2q_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT16X8X2_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT16X8X2_T]], align 16
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X8X2_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X8X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X8X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v8i16(ptr [[A]], <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], i32 2)
+// CHECK-NEXT: ret void
+//
void test_vst2q_u16(uint16_t * a, uint16x8x2_t b) {
vst2q_u16(a, b);
}
-// CHECK-LABEL: @test_vst2q_u32(
-// CHECK: [[B:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x4x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
-// CHECK: call void @llvm.arm.neon.vst2.p0.v4i32(ptr %a, <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst2q_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT32X4X2_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT32X4X2_T]], align 16
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X4X2_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X4X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X4X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x i32>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v4i32(ptr [[A]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], i32 4)
+// CHECK-NEXT: ret void
+//
void test_vst2q_u32(uint32_t * a, uint32x4x2_t b) {
vst2q_u32(a, b);
}
-// CHECK-LABEL: @test_vst2q_s8(
-// CHECK: [[B:%.*]] = alloca %struct.int8x16x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x16x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x16x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x16x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK: call void @llvm.arm.neon.vst2.p0.v16i8(ptr %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i32 1)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst2q_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT8X16X2_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT8X16X2_T]], align 16
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X16X2_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X16X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X16X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v16i8(ptr [[A]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 1)
+// CHECK-NEXT: ret void
+//
void test_vst2q_s8(int8_t * a, int8x16x2_t b) {
vst2q_s8(a, b);
}
-// CHECK-LABEL: @test_vst2q_s16(
-// CHECK: [[B:%.*]] = alloca %struct.int16x8x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
-// CHECK: call void @llvm.arm.neon.vst2.p0.v8i16(ptr %a, <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst2q_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT16X8X2_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT16X8X2_T]], align 16
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X8X2_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X8X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X8X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v8i16(ptr [[A]], <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], i32 2)
+// CHECK-NEXT: ret void
+//
void test_vst2q_s16(int16_t * a, int16x8x2_t b) {
vst2q_s16(a, b);
}
-// CHECK-LABEL: @test_vst2q_s32(
-// CHECK: [[B:%.*]] = alloca %struct.int32x4x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x4x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
-// CHECK: call void @llvm.arm.neon.vst2.p0.v4i32(ptr %a, <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst2q_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT32X4X2_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT32X4X2_T]], align 16
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X4X2_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X4X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X4X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x i32>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v4i32(ptr [[A]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], i32 4)
+// CHECK-NEXT: ret void
+//
void test_vst2q_s32(int32_t * a, int32x4x2_t b) {
vst2q_s32(a, b);
}
-// CHECK-LABEL: @test_vst2q_f16(
-// CHECK: [[B:%.*]] = alloca %struct.float16x8x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x half>
-// CHECK: call void @llvm.arm.neon.vst2.p0.v8f16(ptr %a, <8 x half> [[TMP8]], <8 x half> [[TMP9]], i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst2q_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT16X8X2_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X8X2_T]], align 16
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X8X2_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X8X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X8X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x half> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v8f16(ptr [[A]], <8 x half> [[TMP4]], <8 x half> [[TMP5]], i32 2)
+// CHECK-NEXT: ret void
+//
void test_vst2q_f16(float16_t * a, float16x8x2_t b) {
vst2q_f16(a, b);
}
-// CHECK-LABEL: @test_vst2q_f32(
-// CHECK: [[B:%.*]] = alloca %struct.float32x4x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x4x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
-// CHECK: call void @llvm.arm.neon.vst2.p0.v4f32(ptr %a, <4 x float> [[TMP8]], <4 x float> [[TMP9]], i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst2q_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT32X4X2_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X4X2_T]], align 16
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X4X2_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X4X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X4X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v4f32(ptr [[A]], <4 x float> [[TMP4]], <4 x float> [[TMP5]], i32 4)
+// CHECK-NEXT: ret void
+//
void test_vst2q_f32(float32_t * a, float32x4x2_t b) {
vst2q_f32(a, b);
}
-// CHECK-LABEL: @test_vst2q_p8(
-// CHECK: [[B:%.*]] = alloca %struct.poly8x16x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x16x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x16x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x16x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK: call void @llvm.arm.neon.vst2.p0.v16i8(ptr %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i32 1)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst2q_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY8X16X2_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY8X16X2_T]], align 16
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X16X2_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X16X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X16X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v16i8(ptr [[A]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 1)
+// CHECK-NEXT: ret void
+//
void test_vst2q_p8(poly8_t * a, poly8x16x2_t b) {
vst2q_p8(a, b);
}
-// CHECK-LABEL: @test_vst2q_p16(
-// CHECK: [[B:%.*]] = alloca %struct.poly16x8x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
-// CHECK: call void @llvm.arm.neon.vst2.p0.v8i16(ptr %a, <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst2q_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY16X8X2_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY16X8X2_T]], align 16
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X8X2_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X8X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X8X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v8i16(ptr [[A]], <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], i32 2)
+// CHECK-NEXT: ret void
+//
void test_vst2q_p16(poly16_t * a, poly16x8x2_t b) {
vst2q_p16(a, b);
}
-// CHECK-LABEL: @test_vst2_u8(
-// CHECK: [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: call void @llvm.arm.neon.vst2.p0.v8i8(ptr %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i32 1)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst2_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT8X8X2_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT8X8X2_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X2_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [2 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v8i8(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], i32 1)
+// CHECK-NEXT: ret void
+//
void test_vst2_u8(uint8_t * a, uint8x8x2_t b) {
vst2_u8(a, b);
}
-// CHECK-LABEL: @test_vst2_u16(
-// CHECK: [[B:%.*]] = alloca %struct.uint16x4x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x4x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
-// CHECK: call void @llvm.arm.neon.vst2.p0.v4i16(ptr %a, <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst2_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT16X4X2_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT16X4X2_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X4X2_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [2 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X4X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X4X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v4i16(ptr [[A]], <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], i32 2)
+// CHECK-NEXT: ret void
+//
void test_vst2_u16(uint16_t * a, uint16x4x2_t b) {
vst2_u16(a, b);
}
-// CHECK-LABEL: @test_vst2_u32(
-// CHECK: [[B:%.*]] = alloca %struct.uint32x2x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x2x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
-// CHECK: call void @llvm.arm.neon.vst2.p0.v2i32(ptr %a, <2 x i32> [[TMP8]], <2 x i32> [[TMP9]], i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst2_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT32X2X2_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT32X2X2_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X2X2_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [2 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X2X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X2X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v2i32(ptr [[A]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], i32 4)
+// CHECK-NEXT: ret void
+//
void test_vst2_u32(uint32_t * a, uint32x2x2_t b) {
vst2_u32(a, b);
}
-// CHECK-LABEL: @test_vst2_u64(
-// CHECK: [[B:%.*]] = alloca %struct.uint64x1x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x1x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x1x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x1x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
-// CHECK: call void @llvm.arm.neon.vst2.p0.v1i64(ptr %a, <1 x i64> [[TMP8]], <1 x i64> [[TMP9]], i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst2_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT64X1X2_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT64X1X2_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT64X1X2_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [2 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT64X1X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT64X1X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v1i64(ptr [[A]], <1 x i64> [[TMP4]], <1 x i64> [[TMP5]], i32 4)
+// CHECK-NEXT: ret void
+//
void test_vst2_u64(uint64_t * a, uint64x1x2_t b) {
vst2_u64(a, b);
}
-// CHECK-LABEL: @test_vst2_s8(
-// CHECK: [[B:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: call void @llvm.arm.neon.vst2.p0.v8i8(ptr %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i32 1)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst2_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT8X8X2_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT8X8X2_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X2_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [2 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v8i8(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], i32 1)
+// CHECK-NEXT: ret void
+//
void test_vst2_s8(int8_t * a, int8x8x2_t b) {
vst2_s8(a, b);
}
-// CHECK-LABEL: @test_vst2_s16(
-// CHECK: [[B:%.*]] = alloca %struct.int16x4x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x4x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
-// CHECK: call void @llvm.arm.neon.vst2.p0.v4i16(ptr %a, <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst2_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT16X4X2_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT16X4X2_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X4X2_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [2 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X4X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X4X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v4i16(ptr [[A]], <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], i32 2)
+// CHECK-NEXT: ret void
+//
void test_vst2_s16(int16_t * a, int16x4x2_t b) {
vst2_s16(a, b);
}
-// CHECK-LABEL: @test_vst2_s32(
-// CHECK: [[B:%.*]] = alloca %struct.int32x2x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x2x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
-// CHECK: call void @llvm.arm.neon.vst2.p0.v2i32(ptr %a, <2 x i32> [[TMP8]], <2 x i32> [[TMP9]], i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst2_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT32X2X2_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT32X2X2_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X2X2_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [2 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X2X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X2X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v2i32(ptr [[A]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], i32 4)
+// CHECK-NEXT: ret void
+//
void test_vst2_s32(int32_t * a, int32x2x2_t b) {
vst2_s32(a, b);
}
-// CHECK-LABEL: @test_vst2_s64(
-// CHECK: [[B:%.*]] = alloca %struct.int64x1x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x1x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x1x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x1x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
-// CHECK: call void @llvm.arm.neon.vst2.p0.v1i64(ptr %a, <1 x i64> [[TMP8]], <1 x i64> [[TMP9]], i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst2_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT64X1X2_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT64X1X2_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT64X1X2_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [2 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT64X1X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT64X1X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v1i64(ptr [[A]], <1 x i64> [[TMP4]], <1 x i64> [[TMP5]], i32 4)
+// CHECK-NEXT: ret void
+//
void test_vst2_s64(int64_t * a, int64x1x2_t b) {
vst2_s64(a, b);
}
-// CHECK-LABEL: @test_vst2_f16(
-// CHECK: [[B:%.*]] = alloca %struct.float16x4x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x4x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half>
-// CHECK: call void @llvm.arm.neon.vst2.p0.v4f16(ptr %a, <4 x half> [[TMP8]], <4 x half> [[TMP9]], i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst2_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT16X4X2_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X4X2_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X4X2_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [2 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X4X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X4X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x half> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v4f16(ptr [[A]], <4 x half> [[TMP4]], <4 x half> [[TMP5]], i32 2)
+// CHECK-NEXT: ret void
+//
void test_vst2_f16(float16_t * a, float16x4x2_t b) {
vst2_f16(a, b);
}
-// CHECK-LABEL: @test_vst2_f32(
-// CHECK: [[B:%.*]] = alloca %struct.float32x2x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x2x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x float>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
-// CHECK: call void @llvm.arm.neon.vst2.p0.v2f32(ptr %a, <2 x float> [[TMP8]], <2 x float> [[TMP9]], i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst2_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT32X2X2_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X2X2_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X2X2_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [2 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X2X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X2X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x float>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v2f32(ptr [[A]], <2 x float> [[TMP4]], <2 x float> [[TMP5]], i32 4)
+// CHECK-NEXT: ret void
+//
void test_vst2_f32(float32_t * a, float32x2x2_t b) {
vst2_f32(a, b);
}
-// CHECK-LABEL: @test_vst2_p8(
-// CHECK: [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: call void @llvm.arm.neon.vst2.p0.v8i8(ptr %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i32 1)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst2_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY8X8X2_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY8X8X2_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X2_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [2 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v8i8(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], i32 1)
+// CHECK-NEXT: ret void
+//
void test_vst2_p8(poly8_t * a, poly8x8x2_t b) {
vst2_p8(a, b);
}
-// CHECK-LABEL: @test_vst2_p16(
-// CHECK: [[B:%.*]] = alloca %struct.poly16x4x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x4x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
-// CHECK: call void @llvm.arm.neon.vst2.p0.v4i16(ptr %a, <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst2_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY16X4X2_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY16X4X2_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X4X2_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [2 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X4X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X4X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v4i16(ptr [[A]], <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], i32 2)
+// CHECK-NEXT: ret void
+//
void test_vst2_p16(poly16_t * a, poly16x4x2_t b) {
vst2_p16(a, b);
}
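A note before the lane tests: the lane forms store a single element from each of the two vectors rather than the full registers, and the trailing i32 operands of the generated llvm.arm.neon.vst2lane call are the lane index followed by the alignment in bytes, so "i32 7, i32 2" below means lane 7 with 2-byte alignment. A usage sketch, illustrative only (demo_vst2q_lane_u16 is hypothetical, not from the test file):

#include <arm_neon.h>

/* Illustrative only. vst2q_lane_u16 writes val.val[0][7] then
   val.val[1][7] (4 bytes total) to p; the lane must be a constant
   in [0,7] for a uint16x8x2_t. */
void demo_vst2q_lane_u16(uint16_t *p, uint16x8x2_t val) {
  vst2q_lane_u16(p, val, 7);
}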
-// CHECK-LABEL: @test_vst2q_lane_u16(
-// CHECK: [[B:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
-// CHECK: call void @llvm.arm.neon.vst2lane.p0.v8i16(ptr %a, <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i32 7, i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst2q_lane_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT16X8X2_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT16X8X2_T]], align 16
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X8X2_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X8X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X8X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2lane.p0.v8i16(ptr [[A]], <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], i32 7, i32 2)
+// CHECK-NEXT: ret void
+//
void test_vst2q_lane_u16(uint16_t * a, uint16x8x2_t b) {
vst2q_lane_u16(a, b, 7);
}
-// CHECK-LABEL: @test_vst2q_lane_u32(
-// CHECK: [[B:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x4x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
-// CHECK: call void @llvm.arm.neon.vst2lane.p0.v4i32(ptr %a, <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], i32 3, i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst2q_lane_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT32X4X2_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT32X4X2_T]], align 16
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X4X2_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X4X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X4X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x i32>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2lane.p0.v4i32(ptr [[A]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], i32 3, i32 4)
+// CHECK-NEXT: ret void
+//
void test_vst2q_lane_u32(uint32_t * a, uint32x4x2_t b) {
vst2q_lane_u32(a, b, 3);
}
-// CHECK-LABEL: @test_vst2q_lane_s16(
-// CHECK: [[B:%.*]] = alloca %struct.int16x8x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
-// CHECK: call void @llvm.arm.neon.vst2lane.p0.v8i16(ptr %a, <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i32 7, i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst2q_lane_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT16X8X2_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT16X8X2_T]], align 16
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X8X2_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X8X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X8X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2lane.p0.v8i16(ptr [[A]], <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], i32 7, i32 2)
+// CHECK-NEXT: ret void
+//
void test_vst2q_lane_s16(int16_t * a, int16x8x2_t b) {
vst2q_lane_s16(a, b, 7);
}
-// CHECK-LABEL: @test_vst2q_lane_s32(
-// CHECK: [[B:%.*]] = alloca %struct.int32x4x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x4x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
-// CHECK: call void @llvm.arm.neon.vst2lane.p0.v4i32(ptr %a, <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], i32 3, i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst2q_lane_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT32X4X2_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT32X4X2_T]], align 16
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X4X2_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X4X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X4X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x i32>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2lane.p0.v4i32(ptr [[A]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], i32 3, i32 4)
+// CHECK-NEXT: ret void
+//
void test_vst2q_lane_s32(int32_t * a, int32x4x2_t b) {
vst2q_lane_s32(a, b, 3);
}
-// CHECK-LABEL: @test_vst2q_lane_f16(
-// CHECK: [[B:%.*]] = alloca %struct.float16x8x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x half>
-// CHECK: call void @llvm.arm.neon.vst2lane.p0.v8f16(ptr %a, <8 x half> [[TMP8]], <8 x half> [[TMP9]], i32 7, i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst2q_lane_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT16X8X2_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X8X2_T]], align 16
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X8X2_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X8X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X8X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x half> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2lane.p0.v8f16(ptr [[A]], <8 x half> [[TMP4]], <8 x half> [[TMP5]], i32 7, i32 2)
+// CHECK-NEXT: ret void
+//
void test_vst2q_lane_f16(float16_t * a, float16x8x2_t b) {
vst2q_lane_f16(a, b, 7);
}
-// CHECK-LABEL: @test_vst2q_lane_f32(
-// CHECK: [[B:%.*]] = alloca %struct.float32x4x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x4x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
-// CHECK: call void @llvm.arm.neon.vst2lane.p0.v4f32(ptr %a, <4 x float> [[TMP8]], <4 x float> [[TMP9]], i32 3, i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst2q_lane_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT32X4X2_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X4X2_T]], align 16
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X4X2_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X4X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X4X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2lane.p0.v4f32(ptr [[A]], <4 x float> [[TMP4]], <4 x float> [[TMP5]], i32 3, i32 4)
+// CHECK-NEXT: ret void
+//
void test_vst2q_lane_f32(float32_t * a, float32x4x2_t b) {
vst2q_lane_f32(a, b, 3);
}
-// CHECK-LABEL: @test_vst2q_lane_p16(
-// CHECK: [[B:%.*]] = alloca %struct.poly16x8x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
-// CHECK: call void @llvm.arm.neon.vst2lane.p0.v8i16(ptr %a, <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i32 7, i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst2q_lane_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY16X8X2_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY16X8X2_T]], align 16
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X8X2_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X8X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X8X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2lane.p0.v8i16(ptr [[A]], <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], i32 7, i32 2)
+// CHECK-NEXT: ret void
+//
void test_vst2q_lane_p16(poly16_t * a, poly16x8x2_t b) {
vst2q_lane_p16(a, b, 7);
}
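For the d-register lane forms below, the valid lane range shrinks with the element count, which is why the tests pass lane 7 for 8-element vectors but lane 3 for 4-element ones. One more sketch, illustrative only (demo_vst2_lane_u8 is hypothetical, not from the test file):

#include <arm_neon.h>

/* Illustrative only. With 8x8-bit vectors the lane index may be 0..7;
   two bytes (one from each vector) are stored at p. */
void demo_vst2_lane_u8(uint8_t *p, uint8x8x2_t val) {
  vst2_lane_u8(p, val, 7);
}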
-// CHECK-LABEL: @test_vst2_lane_u8(
-// CHECK: [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: call void @llvm.arm.neon.vst2lane.p0.v8i8(ptr %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i32 7, i32 1)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst2_lane_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT8X8X2_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT8X8X2_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X2_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [2 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: call void @llvm.arm.neon.vst2lane.p0.v8i8(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], i32 7, i32 1)
+// CHECK-NEXT: ret void
+//
void test_vst2_lane_u8(uint8_t * a, uint8x8x2_t b) {
vst2_lane_u8(a, b, 7);
}
-// CHECK-LABEL: @test_vst2_lane_u16(
-// CHECK: [[B:%.*]] = alloca %struct.uint16x4x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x4x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
-// CHECK: call void @llvm.arm.neon.vst2lane.p0.v4i16(ptr %a, <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i32 3, i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst2_lane_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT16X4X2_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT16X4X2_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X4X2_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [2 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X4X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X4X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2lane.p0.v4i16(ptr [[A]], <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], i32 3, i32 2)
+// CHECK-NEXT: ret void
+//
void test_vst2_lane_u16(uint16_t * a, uint16x4x2_t b) {
vst2_lane_u16(a, b, 3);
}
-// CHECK-LABEL: @test_vst2_lane_u32(
-// CHECK: [[B:%.*]] = alloca %struct.uint32x2x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x2x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
-// CHECK: call void @llvm.arm.neon.vst2lane.p0.v2i32(ptr %a, <2 x i32> [[TMP8]], <2 x i32> [[TMP9]], i32 1, i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst2_lane_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT32X2X2_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT32X2X2_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X2X2_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [2 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X2X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X2X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2lane.p0.v2i32(ptr [[A]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], i32 1, i32 4)
+// CHECK-NEXT: ret void
+//
void test_vst2_lane_u32(uint32_t * a, uint32x2x2_t b) {
vst2_lane_u32(a, b, 1);
}
-// CHECK-LABEL: @test_vst2_lane_s8(
-// CHECK: [[B:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: call void @llvm.arm.neon.vst2lane.p0.v8i8(ptr %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i32 7, i32 1)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst2_lane_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT8X8X2_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT8X8X2_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X2_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [2 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: call void @llvm.arm.neon.vst2lane.p0.v8i8(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], i32 7, i32 1)
+// CHECK-NEXT: ret void
+//
void test_vst2_lane_s8(int8_t * a, int8x8x2_t b) {
vst2_lane_s8(a, b, 7);
}
-// CHECK-LABEL: @test_vst2_lane_s16(
-// CHECK: [[B:%.*]] = alloca %struct.int16x4x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x4x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
-// CHECK: call void @llvm.arm.neon.vst2lane.p0.v4i16(ptr %a, <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i32 3, i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst2_lane_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT16X4X2_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT16X4X2_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X4X2_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [2 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X4X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X4X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2lane.p0.v4i16(ptr [[A]], <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], i32 3, i32 2)
+// CHECK-NEXT: ret void
+//
void test_vst2_lane_s16(int16_t * a, int16x4x2_t b) {
vst2_lane_s16(a, b, 3);
}
-// CHECK-LABEL: @test_vst2_lane_s32(
-// CHECK: [[B:%.*]] = alloca %struct.int32x2x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x2x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
-// CHECK: call void @llvm.arm.neon.vst2lane.p0.v2i32(ptr %a, <2 x i32> [[TMP8]], <2 x i32> [[TMP9]], i32 1, i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst2_lane_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT32X2X2_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT32X2X2_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X2X2_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [2 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X2X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X2X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2lane.p0.v2i32(ptr [[A]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], i32 1, i32 4)
+// CHECK-NEXT: ret void
+//
void test_vst2_lane_s32(int32_t * a, int32x2x2_t b) {
vst2_lane_s32(a, b, 1);
}
-// CHECK-LABEL: @test_vst2_lane_f16(
-// CHECK: [[B:%.*]] = alloca %struct.float16x4x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x4x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half>
-// CHECK: call void @llvm.arm.neon.vst2lane.p0.v4f16(ptr %a, <4 x half> [[TMP8]], <4 x half> [[TMP9]], i32 3, i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst2_lane_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT16X4X2_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X4X2_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X4X2_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [2 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X4X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X4X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x half> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2lane.p0.v4f16(ptr [[A]], <4 x half> [[TMP4]], <4 x half> [[TMP5]], i32 3, i32 2)
+// CHECK-NEXT: ret void
+//
void test_vst2_lane_f16(float16_t * a, float16x4x2_t b) {
vst2_lane_f16(a, b, 3);
}
-// CHECK-LABEL: @test_vst2_lane_f32(
-// CHECK: [[B:%.*]] = alloca %struct.float32x2x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x2x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x float>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
-// CHECK: call void @llvm.arm.neon.vst2lane.p0.v2f32(ptr %a, <2 x float> [[TMP8]], <2 x float> [[TMP9]], i32 1, i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst2_lane_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT32X2X2_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X2X2_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X2X2_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [2 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X2X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X2X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x float>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2lane.p0.v2f32(ptr [[A]], <2 x float> [[TMP4]], <2 x float> [[TMP5]], i32 1, i32 4)
+// CHECK-NEXT: ret void
+//
void test_vst2_lane_f32(float32_t * a, float32x2x2_t b) {
vst2_lane_f32(a, b, 1);
}
-// CHECK-LABEL: @test_vst2_lane_p8(
-// CHECK: [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: call void @llvm.arm.neon.vst2lane.p0.v8i8(ptr %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i32 7, i32 1)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst2_lane_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY8X8X2_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY8X8X2_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X2_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [2 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: call void @llvm.arm.neon.vst2lane.p0.v8i8(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], i32 7, i32 1)
+// CHECK-NEXT: ret void
+//
void test_vst2_lane_p8(poly8_t * a, poly8x8x2_t b) {
vst2_lane_p8(a, b, 7);
}
-// CHECK-LABEL: @test_vst2_lane_p16(
-// CHECK: [[B:%.*]] = alloca %struct.poly16x4x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x4x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
-// CHECK: call void @llvm.arm.neon.vst2lane.p0.v4i16(ptr %a, <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i32 3, i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst2_lane_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY16X4X2_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY16X4X2_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X4X2_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [2 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X4X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X4X2_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2lane.p0.v4i16(ptr [[A]], <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], i32 3, i32 2)
+// CHECK-NEXT: ret void
+//
void test_vst2_lane_p16(poly16_t * a, poly16x4x2_t b) {
vst2_lane_p16(a, b, 3);
}
-// CHECK-LABEL: @test_vst3q_u8(
-// CHECK: [[B:%.*]] = alloca %struct.uint8x16x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x16x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP5:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
-// CHECK: call void @llvm.arm.neon.vst3.p0.v16i8(ptr %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i32 1)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst3q_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT8X16X3_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT8X16X3_T]], align 16
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X16X3_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [6 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X16X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X16X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X16X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v16i8(ptr [[A]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], i32 1)
+// CHECK-NEXT: ret void
+//
void test_vst3q_u8(uint8_t * a, uint8x16x3_t b) {
vst3q_u8(a, b);
}
-// CHECK-LABEL: @test_vst3q_u16(
-// CHECK: [[B:%.*]] = alloca %struct.uint16x8x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
-// CHECK: call void @llvm.arm.neon.vst3.p0.v8i16(ptr %a, <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst3q_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT16X8X3_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT16X8X3_T]], align 16
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X8X3_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [6 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X8X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X8X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X8X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x i16>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v8i16(ptr [[A]], <8 x i16> [[TMP6]], <8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i32 2)
+// CHECK-NEXT: ret void
+//
void test_vst3q_u16(uint16_t * a, uint16x8x3_t b) {
vst3q_u16(a, b);
}
-// CHECK-LABEL: @test_vst3q_u32(
-// CHECK: [[B:%.*]] = alloca %struct.uint32x4x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
-// CHECK: call void @llvm.arm.neon.vst3.p0.v4i32(ptr %a, <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst3q_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT32X4X3_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT32X4X3_T]], align 16
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X4X3_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [6 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X4X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X4X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X4X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x i32>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v4i32(ptr [[A]], <4 x i32> [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i32 4)
+// CHECK-NEXT: ret void
+//
void test_vst3q_u32(uint32_t * a, uint32x4x3_t b) {
vst3q_u32(a, b);
}
-// CHECK-LABEL: @test_vst3q_s8(
-// CHECK: [[B:%.*]] = alloca %struct.int8x16x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x16x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP5:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
-// CHECK: call void @llvm.arm.neon.vst3.p0.v16i8(ptr %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i32 1)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst3q_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT8X16X3_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT8X16X3_T]], align 16
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X16X3_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [6 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X16X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X16X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X16X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v16i8(ptr [[A]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], i32 1)
+// CHECK-NEXT: ret void
+//
void test_vst3q_s8(int8_t * a, int8x16x3_t b) {
vst3q_s8(a, b);
}
-// CHECK-LABEL: @test_vst3q_s16(
-// CHECK: [[B:%.*]] = alloca %struct.int16x8x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
-// CHECK: call void @llvm.arm.neon.vst3.p0.v8i16(ptr %a, <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst3q_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT16X8X3_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT16X8X3_T]], align 16
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X8X3_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [6 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X8X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X8X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X8X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x i16>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v8i16(ptr [[A]], <8 x i16> [[TMP6]], <8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i32 2)
+// CHECK-NEXT: ret void
+//
void test_vst3q_s16(int16_t * a, int16x8x3_t b) {
vst3q_s16(a, b);
}
-// CHECK-LABEL: @test_vst3q_s32(
-// CHECK: [[B:%.*]] = alloca %struct.int32x4x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
-// CHECK: call void @llvm.arm.neon.vst3.p0.v4i32(ptr %a, <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst3q_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT32X4X3_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT32X4X3_T]], align 16
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X4X3_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [6 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X4X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X4X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X4X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x i32>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v4i32(ptr [[A]], <4 x i32> [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i32 4)
+// CHECK-NEXT: ret void
+//
void test_vst3q_s32(int32_t * a, int32x4x3_t b) {
vst3q_s32(a, b);
}
-// CHECK-LABEL: @test_vst3q_f16(
-// CHECK: [[B:%.*]] = alloca %struct.float16x8x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <8 x half>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <16 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x half>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x half>
-// CHECK: call void @llvm.arm.neon.vst3.p0.v8f16(ptr %a, <8 x half> [[TMP10]], <8 x half> [[TMP11]], <8 x half> [[TMP12]], i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst3q_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT16X8X3_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X8X3_T]], align 16
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X8X3_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [6 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X8X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X8X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x half> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X8X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP4:%.*]] = load <8 x half>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v8f16(ptr [[A]], <8 x half> [[TMP6]], <8 x half> [[TMP7]], <8 x half> [[TMP8]], i32 2)
+// CHECK-NEXT: ret void
+//
void test_vst3q_f16(float16_t * a, float16x8x3_t b) {
vst3q_f16(a, b);
}
-// CHECK-LABEL: @test_vst3q_f32(
-// CHECK: [[B:%.*]] = alloca %struct.float32x4x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <4 x float>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <4 x float> [[TMP8]] to <16 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x float>
-// CHECK: call void @llvm.arm.neon.vst3.p0.v4f32(ptr %a, <4 x float> [[TMP10]], <4 x float> [[TMP11]], <4 x float> [[TMP12]], i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst3q_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT32X4X3_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X4X3_T]], align 16
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X4X3_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [6 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X4X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X4X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X4X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP4:%.*]] = load <4 x float>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v4f32(ptr [[A]], <4 x float> [[TMP6]], <4 x float> [[TMP7]], <4 x float> [[TMP8]], i32 4)
+// CHECK-NEXT: ret void
+//
void test_vst3q_f32(float32_t * a, float32x4x3_t b) {
vst3q_f32(a, b);
}
-// CHECK-LABEL: @test_vst3q_p8(
-// CHECK: [[B:%.*]] = alloca %struct.poly8x16x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x16x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP5:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
-// CHECK: call void @llvm.arm.neon.vst3.p0.v16i8(ptr %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i32 1)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst3q_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY8X16X3_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY8X16X3_T]], align 16
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X16X3_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [6 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X16X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X16X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X16X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v16i8(ptr [[A]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], i32 1)
+// CHECK-NEXT: ret void
+//
void test_vst3q_p8(poly8_t * a, poly8x16x3_t b) {
vst3q_p8(a, b);
}
-// CHECK-LABEL: @test_vst3q_p16(
-// CHECK: [[B:%.*]] = alloca %struct.poly16x8x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
-// CHECK: call void @llvm.arm.neon.vst3.p0.v8i16(ptr %a, <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst3q_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY16X8X3_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY16X8X3_T]], align 16
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X8X3_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [6 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X8X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X8X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X8X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x i16>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v8i16(ptr [[A]], <8 x i16> [[TMP6]], <8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i32 2)
+// CHECK-NEXT: ret void
+//
void test_vst3q_p16(poly16_t * a, poly16x8x3_t b) {
vst3q_p16(a, b);
}
-// CHECK-LABEL: @test_vst3_u8(
-// CHECK: [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK: call void @llvm.arm.neon.vst3.p0.v8i8(ptr %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i32 1)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst3_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT8X8X3_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT8X8X3_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X3_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [3 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v8i8(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], i32 1)
+// CHECK-NEXT: ret void
+//
void test_vst3_u8(uint8_t * a, uint8x8x3_t b) {
vst3_u8(a, b);
}
-// CHECK-LABEL: @test_vst3_u16(
-// CHECK: [[B:%.*]] = alloca %struct.uint16x4x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
-// CHECK: call void @llvm.arm.neon.vst3.p0.v4i16(ptr %a, <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst3_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT16X4X3_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT16X4X3_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X4X3_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [3 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X4X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X4X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X4X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v4i16(ptr [[A]], <4 x i16> [[TMP6]], <4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i32 2)
+// CHECK-NEXT: ret void
+//
void test_vst3_u16(uint16_t * a, uint16x4x3_t b) {
vst3_u16(a, b);
}
-// CHECK-LABEL: @test_vst3_u32(
-// CHECK: [[B:%.*]] = alloca %struct.uint32x2x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
-// CHECK: call void @llvm.arm.neon.vst3.p0.v2i32(ptr %a, <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst3_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT32X2X3_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT32X2X3_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X2X3_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [3 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X2X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X2X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X2X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v2i32(ptr [[A]], <2 x i32> [[TMP6]], <2 x i32> [[TMP7]], <2 x i32> [[TMP8]], i32 4)
+// CHECK-NEXT: ret void
+//
void test_vst3_u32(uint32_t * a, uint32x2x3_t b) {
vst3_u32(a, b);
}
-// CHECK-LABEL: @test_vst3_u64(
-// CHECK: [[B:%.*]] = alloca %struct.uint64x1x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x1x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x1x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x1x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint64x1x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
-// CHECK: call void @llvm.arm.neon.vst3.p0.v1i64(ptr %a, <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], <1 x i64> [[TMP12]], i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst3_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT64X1X3_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT64X1X3_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT64X1X3_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [3 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT64X1X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT64X1X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT64X1X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP4:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v1i64(ptr [[A]], <1 x i64> [[TMP6]], <1 x i64> [[TMP7]], <1 x i64> [[TMP8]], i32 4)
+// CHECK-NEXT: ret void
+//
void test_vst3_u64(uint64_t * a, uint64x1x3_t b) {
vst3_u64(a, b);
}
-// CHECK-LABEL: @test_vst3_s8(
-// CHECK: [[B:%.*]] = alloca %struct.int8x8x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK: call void @llvm.arm.neon.vst3.p0.v8i8(ptr %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i32 1)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst3_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT8X8X3_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT8X8X3_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X3_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [3 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v8i8(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], i32 1)
+// CHECK-NEXT: ret void
+//
void test_vst3_s8(int8_t * a, int8x8x3_t b) {
vst3_s8(a, b);
}
-// CHECK-LABEL: @test_vst3_s16(
-// CHECK: [[B:%.*]] = alloca %struct.int16x4x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
-// CHECK: call void @llvm.arm.neon.vst3.p0.v4i16(ptr %a, <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst3_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT16X4X3_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT16X4X3_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X4X3_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [3 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X4X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X4X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X4X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v4i16(ptr [[A]], <4 x i16> [[TMP6]], <4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i32 2)
+// CHECK-NEXT: ret void
+//
void test_vst3_s16(int16_t * a, int16x4x3_t b) {
vst3_s16(a, b);
}
-// CHECK-LABEL: @test_vst3_s32(
-// CHECK: [[B:%.*]] = alloca %struct.int32x2x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
-// CHECK: call void @llvm.arm.neon.vst3.p0.v2i32(ptr %a, <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst3_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT32X2X3_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT32X2X3_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X2X3_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [3 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X2X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X2X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X2X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v2i32(ptr [[A]], <2 x i32> [[TMP6]], <2 x i32> [[TMP7]], <2 x i32> [[TMP8]], i32 4)
+// CHECK-NEXT: ret void
+//
void test_vst3_s32(int32_t * a, int32x2x3_t b) {
vst3_s32(a, b);
}
-// CHECK-LABEL: @test_vst3_s64(
-// CHECK: [[B:%.*]] = alloca %struct.int64x1x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x1x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x1x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x1x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int64x1x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
-// CHECK: call void @llvm.arm.neon.vst3.p0.v1i64(ptr %a, <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], <1 x i64> [[TMP12]], i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst3_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT64X1X3_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT64X1X3_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT64X1X3_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [3 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT64X1X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT64X1X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_INT64X1X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP4:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v1i64(ptr [[A]], <1 x i64> [[TMP6]], <1 x i64> [[TMP7]], <1 x i64> [[TMP8]], i32 4)
+// CHECK-NEXT: ret void
+//
void test_vst3_s64(int64_t * a, int64x1x3_t b) {
vst3_s64(a, b);
}
-// CHECK-LABEL: @test_vst3_f16(
-// CHECK: [[B:%.*]] = alloca %struct.float16x4x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <4 x half>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <4 x half> [[TMP8]] to <8 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x half>
-// CHECK: call void @llvm.arm.neon.vst3.p0.v4f16(ptr %a, <4 x half> [[TMP10]], <4 x half> [[TMP11]], <4 x half> [[TMP12]], i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst3_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT16X4X3_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X4X3_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X4X3_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [3 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X4X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X4X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x half> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X4X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP4:%.*]] = load <4 x half>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v4f16(ptr [[A]], <4 x half> [[TMP6]], <4 x half> [[TMP7]], <4 x half> [[TMP8]], i32 2)
+// CHECK-NEXT: ret void
+//
void test_vst3_f16(float16_t * a, float16x4x3_t b) {
vst3_f16(a, b);
}
-// CHECK-LABEL: @test_vst3_f32(
-// CHECK: [[B:%.*]] = alloca %struct.float32x2x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <2 x float> [[TMP8]] to <8 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x float>
-// CHECK: call void @llvm.arm.neon.vst3.p0.v2f32(ptr %a, <2 x float> [[TMP10]], <2 x float> [[TMP11]], <2 x float> [[TMP12]], i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst3_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT32X2X3_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X2X3_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X2X3_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [3 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X2X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X2X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X2X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v2f32(ptr [[A]], <2 x float> [[TMP6]], <2 x float> [[TMP7]], <2 x float> [[TMP8]], i32 4)
+// CHECK-NEXT: ret void
+//
void test_vst3_f32(float32_t * a, float32x2x3_t b) {
vst3_f32(a, b);
}
-// CHECK-LABEL: @test_vst3_p8(
-// CHECK: [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK: call void @llvm.arm.neon.vst3.p0.v8i8(ptr %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i32 1)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst3_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY8X8X3_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY8X8X3_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X3_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [3 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v8i8(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], i32 1)
+// CHECK-NEXT: ret void
+//
void test_vst3_p8(poly8_t * a, poly8x8x3_t b) {
vst3_p8(a, b);
}
-// CHECK-LABEL: @test_vst3_p16(
-// CHECK: [[B:%.*]] = alloca %struct.poly16x4x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
-// CHECK: call void @llvm.arm.neon.vst3.p0.v4i16(ptr %a, <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst3_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY16X4X3_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY16X4X3_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X4X3_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [3 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X4X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X4X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X4X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v4i16(ptr [[A]], <4 x i16> [[TMP6]], <4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i32 2)
+// CHECK-NEXT: ret void
+//
void test_vst3_p16(poly16_t * a, poly16x4x3_t b) {
vst3_p16(a, b);
}
-// CHECK-LABEL: @test_vst3q_lane_u16(
-// CHECK: [[B:%.*]] = alloca %struct.uint16x8x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
-// CHECK: call void @llvm.arm.neon.vst3lane.p0.v8i16(ptr %a, <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i32 7, i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst3q_lane_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT16X8X3_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT16X8X3_T]], align 16
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X8X3_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [6 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X8X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X8X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X8X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x i16>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3lane.p0.v8i16(ptr [[A]], <8 x i16> [[TMP6]], <8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i32 7, i32 2)
+// CHECK-NEXT: ret void
+//
void test_vst3q_lane_u16(uint16_t * a, uint16x8x3_t b) {
vst3q_lane_u16(a, b, 7);
}
-// CHECK-LABEL: @test_vst3q_lane_u32(
-// CHECK: [[B:%.*]] = alloca %struct.uint32x4x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
-// CHECK: call void @llvm.arm.neon.vst3lane.p0.v4i32(ptr %a, <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], i32 3, i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst3q_lane_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT32X4X3_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT32X4X3_T]], align 16
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X4X3_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [6 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X4X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X4X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X4X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x i32>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3lane.p0.v4i32(ptr [[A]], <4 x i32> [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i32 3, i32 4)
+// CHECK-NEXT: ret void
+//
void test_vst3q_lane_u32(uint32_t * a, uint32x4x3_t b) {
vst3q_lane_u32(a, b, 3);
}
-// CHECK-LABEL: @test_vst3q_lane_s16(
-// CHECK: [[B:%.*]] = alloca %struct.int16x8x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
-// CHECK: call void @llvm.arm.neon.vst3lane.p0.v8i16(ptr %a, <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i32 7, i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst3q_lane_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT16X8X3_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT16X8X3_T]], align 16
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X8X3_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [6 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X8X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X8X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X8X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x i16>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3lane.p0.v8i16(ptr [[A]], <8 x i16> [[TMP6]], <8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i32 7, i32 2)
+// CHECK-NEXT: ret void
+//
void test_vst3q_lane_s16(int16_t * a, int16x8x3_t b) {
vst3q_lane_s16(a, b, 7);
}
-// CHECK-LABEL: @test_vst3q_lane_s32(
-// CHECK: [[B:%.*]] = alloca %struct.int32x4x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
-// CHECK: call void @llvm.arm.neon.vst3lane.p0.v4i32(ptr %a, <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], i32 3, i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst3q_lane_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT32X4X3_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT32X4X3_T]], align 16
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X4X3_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [6 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X4X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X4X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X4X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x i32>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3lane.p0.v4i32(ptr [[A]], <4 x i32> [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i32 3, i32 4)
+// CHECK-NEXT: ret void
+//
void test_vst3q_lane_s32(int32_t * a, int32x4x3_t b) {
vst3q_lane_s32(a, b, 3);
}
-// CHECK-LABEL: @test_vst3q_lane_f16(
-// CHECK: [[B:%.*]] = alloca %struct.float16x8x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <8 x half>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <16 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x half>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x half>
-// CHECK: call void @llvm.arm.neon.vst3lane.p0.v8f16(ptr %a, <8 x half> [[TMP10]], <8 x half> [[TMP11]], <8 x half> [[TMP12]], i32 7, i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst3q_lane_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT16X8X3_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X8X3_T]], align 16
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X8X3_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [6 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X8X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X8X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x half> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X8X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP4:%.*]] = load <8 x half>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3lane.p0.v8f16(ptr [[A]], <8 x half> [[TMP6]], <8 x half> [[TMP7]], <8 x half> [[TMP8]], i32 7, i32 2)
+// CHECK-NEXT: ret void
+//
void test_vst3q_lane_f16(float16_t * a, float16x8x3_t b) {
vst3q_lane_f16(a, b, 7);
}
-// CHECK-LABEL: @test_vst3q_lane_f32(
-// CHECK: [[B:%.*]] = alloca %struct.float32x4x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <4 x float>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <4 x float> [[TMP8]] to <16 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x float>
-// CHECK: call void @llvm.arm.neon.vst3lane.p0.v4f32(ptr %a, <4 x float> [[TMP10]], <4 x float> [[TMP11]], <4 x float> [[TMP12]], i32 3, i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst3q_lane_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT32X4X3_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X4X3_T]], align 16
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X4X3_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [6 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X4X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X4X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X4X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP4:%.*]] = load <4 x float>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3lane.p0.v4f32(ptr [[A]], <4 x float> [[TMP6]], <4 x float> [[TMP7]], <4 x float> [[TMP8]], i32 3, i32 4)
+// CHECK-NEXT: ret void
+//
void test_vst3q_lane_f32(float32_t * a, float32x4x3_t b) {
vst3q_lane_f32(a, b, 3);
}
-// CHECK-LABEL: @test_vst3q_lane_p16(
-// CHECK: [[B:%.*]] = alloca %struct.poly16x8x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
-// CHECK: call void @llvm.arm.neon.vst3lane.p0.v8i16(ptr %a, <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i32 7, i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst3q_lane_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY16X8X3_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY16X8X3_T]], align 16
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X8X3_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [6 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X8X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X8X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X8X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x i16>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3lane.p0.v8i16(ptr [[A]], <8 x i16> [[TMP6]], <8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i32 7, i32 2)
+// CHECK-NEXT: ret void
+//
void test_vst3q_lane_p16(poly16_t * a, poly16x8x3_t b) {
vst3q_lane_p16(a, b, 7);
}
-// CHECK-LABEL: @test_vst3_lane_u8(
-// CHECK: [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK: call void @llvm.arm.neon.vst3lane.p0.v8i8(ptr %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i32 7, i32 1)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst3_lane_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT8X8X3_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT8X8X3_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X3_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [3 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: call void @llvm.arm.neon.vst3lane.p0.v8i8(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], i32 7, i32 1)
+// CHECK-NEXT: ret void
+//
void test_vst3_lane_u8(uint8_t * a, uint8x8x3_t b) {
vst3_lane_u8(a, b, 7);
}
-// CHECK-LABEL: @test_vst3_lane_u16(
-// CHECK: [[B:%.*]] = alloca %struct.uint16x4x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
-// CHECK: call void @llvm.arm.neon.vst3lane.p0.v4i16(ptr %a, <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i32 3, i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst3_lane_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT16X4X3_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT16X4X3_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X4X3_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [3 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X4X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X4X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X4X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3lane.p0.v4i16(ptr [[A]], <4 x i16> [[TMP6]], <4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i32 3, i32 2)
+// CHECK-NEXT: ret void
+//
void test_vst3_lane_u16(uint16_t * a, uint16x4x3_t b) {
vst3_lane_u16(a, b, 3);
}
-// CHECK-LABEL: @test_vst3_lane_u32(
-// CHECK: [[B:%.*]] = alloca %struct.uint32x2x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
-// CHECK: call void @llvm.arm.neon.vst3lane.p0.v2i32(ptr %a, <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], i32 1, i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst3_lane_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT32X2X3_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT32X2X3_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X2X3_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [3 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X2X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X2X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X2X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3lane.p0.v2i32(ptr [[A]], <2 x i32> [[TMP6]], <2 x i32> [[TMP7]], <2 x i32> [[TMP8]], i32 1, i32 4)
+// CHECK-NEXT: ret void
+//
void test_vst3_lane_u32(uint32_t * a, uint32x2x3_t b) {
vst3_lane_u32(a, b, 1);
}
-// CHECK-LABEL: @test_vst3_lane_s8(
-// CHECK: [[B:%.*]] = alloca %struct.int8x8x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK: call void @llvm.arm.neon.vst3lane.p0.v8i8(ptr %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i32 7, i32 1)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst3_lane_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT8X8X3_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT8X8X3_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X3_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [3 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: call void @llvm.arm.neon.vst3lane.p0.v8i8(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], i32 7, i32 1)
+// CHECK-NEXT: ret void
+//
void test_vst3_lane_s8(int8_t * a, int8x8x3_t b) {
vst3_lane_s8(a, b, 7);
}
-// CHECK-LABEL: @test_vst3_lane_s16(
-// CHECK: [[B:%.*]] = alloca %struct.int16x4x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
-// CHECK: call void @llvm.arm.neon.vst3lane.p0.v4i16(ptr %a, <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i32 3, i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst3_lane_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT16X4X3_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT16X4X3_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X4X3_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [3 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X4X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X4X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X4X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3lane.p0.v4i16(ptr [[A]], <4 x i16> [[TMP6]], <4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i32 3, i32 2)
+// CHECK-NEXT: ret void
+//
void test_vst3_lane_s16(int16_t * a, int16x4x3_t b) {
vst3_lane_s16(a, b, 3);
}
-// CHECK-LABEL: @test_vst3_lane_s32(
-// CHECK: [[B:%.*]] = alloca %struct.int32x2x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
-// CHECK: call void @llvm.arm.neon.vst3lane.p0.v2i32(ptr %a, <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], i32 1, i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst3_lane_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT32X2X3_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT32X2X3_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X2X3_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [3 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X2X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X2X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X2X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3lane.p0.v2i32(ptr [[A]], <2 x i32> [[TMP6]], <2 x i32> [[TMP7]], <2 x i32> [[TMP8]], i32 1, i32 4)
+// CHECK-NEXT: ret void
+//
void test_vst3_lane_s32(int32_t * a, int32x2x3_t b) {
vst3_lane_s32(a, b, 1);
}
-// CHECK-LABEL: @test_vst3_lane_f16(
-// CHECK: [[B:%.*]] = alloca %struct.float16x4x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <4 x half>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <4 x half> [[TMP8]] to <8 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x half>
-// CHECK: call void @llvm.arm.neon.vst3lane.p0.v4f16(ptr %a, <4 x half> [[TMP10]], <4 x half> [[TMP11]], <4 x half> [[TMP12]], i32 3, i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst3_lane_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT16X4X3_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X4X3_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X4X3_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [3 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X4X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X4X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x half> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X4X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP4:%.*]] = load <4 x half>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3lane.p0.v4f16(ptr [[A]], <4 x half> [[TMP6]], <4 x half> [[TMP7]], <4 x half> [[TMP8]], i32 3, i32 2)
+// CHECK-NEXT: ret void
+//
void test_vst3_lane_f16(float16_t * a, float16x4x3_t b) {
vst3_lane_f16(a, b, 3);
}
-// CHECK-LABEL: @test_vst3_lane_f32(
-// CHECK: [[B:%.*]] = alloca %struct.float32x2x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <2 x float> [[TMP8]] to <8 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x float>
-// CHECK: call void @llvm.arm.neon.vst3lane.p0.v2f32(ptr %a, <2 x float> [[TMP10]], <2 x float> [[TMP11]], <2 x float> [[TMP12]], i32 1, i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst3_lane_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT32X2X3_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X2X3_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X2X3_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [3 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X2X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X2X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X2X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3lane.p0.v2f32(ptr [[A]], <2 x float> [[TMP6]], <2 x float> [[TMP7]], <2 x float> [[TMP8]], i32 1, i32 4)
+// CHECK-NEXT: ret void
+//
void test_vst3_lane_f32(float32_t * a, float32x2x3_t b) {
vst3_lane_f32(a, b, 1);
}
-// CHECK-LABEL: @test_vst3_lane_p8(
-// CHECK: [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK: call void @llvm.arm.neon.vst3lane.p0.v8i8(ptr %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i32 7, i32 1)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst3_lane_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY8X8X3_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY8X8X3_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X3_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [3 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: call void @llvm.arm.neon.vst3lane.p0.v8i8(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], i32 7, i32 1)
+// CHECK-NEXT: ret void
+//
void test_vst3_lane_p8(poly8_t * a, poly8x8x3_t b) {
vst3_lane_p8(a, b, 7);
}
-// CHECK-LABEL: @test_vst3_lane_p16(
-// CHECK: [[B:%.*]] = alloca %struct.poly16x4x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
-// CHECK: call void @llvm.arm.neon.vst3lane.p0.v4i16(ptr %a, <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i32 3, i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst3_lane_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY16X4X3_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY16X4X3_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X4X3_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [3 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X4X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X4X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X4X3_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3lane.p0.v4i16(ptr [[A]], <4 x i16> [[TMP6]], <4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i32 3, i32 2)
+// CHECK-NEXT: ret void
+//
void test_vst3_lane_p16(poly16_t * a, poly16x4x3_t b) {
vst3_lane_p16(a, b, 3);
}
-// CHECK-LABEL: @test_vst4q_u8(
-// CHECK: [[B:%.*]] = alloca %struct.uint8x16x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP5:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP6:%.*]] = load <16 x i8>, ptr [[ARRAYIDX6]], align 16
-// CHECK: call void @llvm.arm.neon.vst4.p0.v16i8(ptr %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], <16 x i8> [[TMP6]], i32 1)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst4q_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT8X16X4_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT8X16X4_T]], align 16
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X16X4_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [8 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X16X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X16X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X16X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X16X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL5]], i32 0, i32 3
+// CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v16i8(ptr [[A]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i32 1)
+// CHECK-NEXT: ret void
+//
void test_vst4q_u8(uint8_t * a, uint8x16x4_t b) {
vst4q_u8(a, b);
}
-// CHECK-LABEL: @test_vst4q_u16(
-// CHECK: [[B:%.*]] = alloca %struct.uint16x8x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP10:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
-// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
-// CHECK: call void @llvm.arm.neon.vst4.p0.v8i16(ptr %a, <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst4q_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT16X8X4_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT16X8X4_T]], align 16
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X8X4_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [8 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i32 0, i32 3
+// CHECK-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x i16>
+// CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v8i16(ptr [[A]], <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i32 2)
+// CHECK-NEXT: ret void
+//
void test_vst4q_u16(uint16_t * a, uint16x8x4_t b) {
vst4q_u16(a, b);
}
-// CHECK-LABEL: @test_vst4q_u32(
-// CHECK: [[B:%.*]] = alloca %struct.uint32x4x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP10:%.*]] = load <4 x i32>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
-// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x i32>
-// CHECK: call void @llvm.arm.neon.vst4.p0.v4i32(ptr %a, <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst4q_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT32X4X4_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT32X4X4_T]], align 16
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X4X4_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [8 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
+// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL5]], i32 0, i32 3
+// CHECK-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x i32>
+// CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
+// CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v4i32(ptr [[A]], <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], i32 4)
+// CHECK-NEXT: ret void
+//
void test_vst4q_u32(uint32_t * a, uint32x4x4_t b) {
vst4q_u32(a, b);
}
-// CHECK-LABEL: @test_vst4q_s8(
-// CHECK: [[B:%.*]] = alloca %struct.int8x16x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP5:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP6:%.*]] = load <16 x i8>, ptr [[ARRAYIDX6]], align 16
-// CHECK: call void @llvm.arm.neon.vst4.p0.v16i8(ptr %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], <16 x i8> [[TMP6]], i32 1)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst4q_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT8X16X4_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT8X16X4_T]], align 16
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X16X4_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [8 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X16X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X16X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X16X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X16X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL5]], i32 0, i32 3
+// CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v16i8(ptr [[A]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i32 1)
+// CHECK-NEXT: ret void
+//
void test_vst4q_s8(int8_t * a, int8x16x4_t b) {
vst4q_s8(a, b);
}
-// CHECK-LABEL: @test_vst4q_s16(
-// CHECK: [[B:%.*]] = alloca %struct.int16x8x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP10:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
-// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
-// CHECK: call void @llvm.arm.neon.vst4.p0.v8i16(ptr %a, <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst4q_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT16X8X4_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT16X8X4_T]], align 16
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X8X4_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [8 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i32 0, i32 3
+// CHECK-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x i16>
+// CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v8i16(ptr [[A]], <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i32 2)
+// CHECK-NEXT: ret void
+//
void test_vst4q_s16(int16_t * a, int16x8x4_t b) {
vst4q_s16(a, b);
}
-// CHECK-LABEL: @test_vst4q_s32(
-// CHECK: [[B:%.*]] = alloca %struct.int32x4x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP10:%.*]] = load <4 x i32>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
-// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x i32>
-// CHECK: call void @llvm.arm.neon.vst4.p0.v4i32(ptr %a, <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst4q_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT32X4X4_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT32X4X4_T]], align 16
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X4X4_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [8 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
+// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL5]], i32 0, i32 3
+// CHECK-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x i32>
+// CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
+// CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v4i32(ptr [[A]], <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], i32 4)
+// CHECK-NEXT: ret void
+//
void test_vst4q_s32(int32_t * a, int32x4x4_t b) {
vst4q_s32(a, b);
}
-// CHECK-LABEL: @test_vst4q_f16(
-// CHECK: [[B:%.*]] = alloca %struct.float16x8x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <8 x half>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP10:%.*]] = load <8 x half>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP11:%.*]] = bitcast <8 x half> [[TMP10]] to <16 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x half>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x half>
-// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x half>
-// CHECK: call void @llvm.arm.neon.vst4.p0.v8f16(ptr %a, <8 x half> [[TMP12]], <8 x half> [[TMP13]], <8 x half> [[TMP14]], <8 x half> [[TMP15]], i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst4q_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT16X8X4_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X8X4_T]], align 16
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X8X4_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [8 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x half> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP4:%.*]] = load <8 x half>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
+// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL5]], i32 0, i32 3
+// CHECK-NEXT: [[TMP6:%.*]] = load <8 x half>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half>
+// CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
+// CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x half>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v8f16(ptr [[A]], <8 x half> [[TMP8]], <8 x half> [[TMP9]], <8 x half> [[TMP10]], <8 x half> [[TMP11]], i32 2)
+// CHECK-NEXT: ret void
+//
void test_vst4q_f16(float16_t * a, float16x8x4_t b) {
vst4q_f16(a, b);
}
-// CHECK-LABEL: @test_vst4q_f32(
-// CHECK: [[B:%.*]] = alloca %struct.float32x4x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <4 x float>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <4 x float> [[TMP8]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP10:%.*]] = load <4 x float>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP11:%.*]] = bitcast <4 x float> [[TMP10]] to <16 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x float>
-// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x float>
-// CHECK: call void @llvm.arm.neon.vst4.p0.v4f32(ptr %a, <4 x float> [[TMP12]], <4 x float> [[TMP13]], <4 x float> [[TMP14]], <4 x float> [[TMP15]], i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst4q_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT32X4X4_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X4X4_T]], align 16
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X4X4_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [8 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP4:%.*]] = load <4 x float>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
+// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL5]], i32 0, i32 3
+// CHECK-NEXT: [[TMP6:%.*]] = load <4 x float>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float>
+// CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
+// CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v4f32(ptr [[A]], <4 x float> [[TMP8]], <4 x float> [[TMP9]], <4 x float> [[TMP10]], <4 x float> [[TMP11]], i32 4)
+// CHECK-NEXT: ret void
+//
void test_vst4q_f32(float32_t * a, float32x4x4_t b) {
vst4q_f32(a, b);
}
-// CHECK-LABEL: @test_vst4q_p8(
-// CHECK: [[B:%.*]] = alloca %struct.poly8x16x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP5:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP6:%.*]] = load <16 x i8>, ptr [[ARRAYIDX6]], align 16
-// CHECK: call void @llvm.arm.neon.vst4.p0.v16i8(ptr %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], <16 x i8> [[TMP6]], i32 1)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst4q_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY8X16X4_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY8X16X4_T]], align 16
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X16X4_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [8 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X16X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X16X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X16X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X16X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL5]], i32 0, i32 3
+// CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v16i8(ptr [[A]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i32 1)
+// CHECK-NEXT: ret void
+//
void test_vst4q_p8(poly8_t * a, poly8x16x4_t b) {
vst4q_p8(a, b);
}
-// CHECK-LABEL: @test_vst4q_p16(
-// CHECK: [[B:%.*]] = alloca %struct.poly16x8x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP10:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
-// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
-// CHECK: call void @llvm.arm.neon.vst4.p0.v8i16(ptr %a, <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst4q_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY16X8X4_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY16X8X4_T]], align 16
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X8X4_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [8 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i32 0, i32 3
+// CHECK-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x i16>
+// CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v8i16(ptr [[A]], <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i32 2)
+// CHECK-NEXT: ret void
+//
void test_vst4q_p16(poly16_t * a, poly16x8x4_t b) {
vst4q_p16(a, b);
}
-// CHECK-LABEL: @test_vst4_u8(
-// CHECK: [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
-// CHECK: call void @llvm.arm.neon.vst4.p0.v8i8(ptr %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i32 1)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst4_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT8X8X4_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT8X8X4_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X4_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i32 0, i32 3
+// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v8i8(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i32 1)
+// CHECK-NEXT: ret void
+//
void test_vst4_u8(uint8_t * a, uint8x8x4_t b) {
vst4_u8(a, b);
}
-// CHECK-LABEL: @test_vst4_u16(
-// CHECK: [[B:%.*]] = alloca %struct.uint16x4x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP10:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
-// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
-// CHECK: call void @llvm.arm.neon.vst4.p0.v4i16(ptr %a, <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst4_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT16X4X4_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT16X4X4_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X4X4_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i32 0, i32 3
+// CHECK-NEXT: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v4i16(ptr [[A]], <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i32 2)
+// CHECK-NEXT: ret void
+//
void test_vst4_u16(uint16_t * a, uint16x4x4_t b) {
vst4_u16(a, b);
}
-// CHECK-LABEL: @test_vst4_u32(
-// CHECK: [[B:%.*]] = alloca %struct.uint32x2x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP10:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP11:%.*]] = bitcast <2 x i32> [[TMP10]] to <8 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
-// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x i32>
-// CHECK: call void @llvm.arm.neon.vst4.p0.v2i32(ptr %a, <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst4_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT32X2X4_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT32X2X4_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X2X4_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
+// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL5]], i32 0, i32 3
+// CHECK-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
+// CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
+// CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v2i32(ptr [[A]], <2 x i32> [[TMP8]], <2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], i32 4)
+// CHECK-NEXT: ret void
+//
void test_vst4_u32(uint32_t * a, uint32x2x4_t b) {
vst4_u32(a, b);
}
-// CHECK-LABEL: @test_vst4_u64(
-// CHECK: [[B:%.*]] = alloca %struct.uint64x1x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP10:%.*]] = load <1 x i64>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
-// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
-// CHECK: call void @llvm.arm.neon.vst4.p0.v1i64(ptr %a, <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], <1 x i64> [[TMP15]], i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst4_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT64X1X4_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT64X1X4_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT64X1X4_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT64X1X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT64X1X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT64X1X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP4:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
+// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT64X1X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL5]], i32 0, i32 3
+// CHECK-NEXT: [[TMP6:%.*]] = load <1 x i64>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64>
+// CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
+// CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v1i64(ptr [[A]], <1 x i64> [[TMP8]], <1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], i32 4)
+// CHECK-NEXT: ret void
+//
void test_vst4_u64(uint64_t * a, uint64x1x4_t b) {
vst4_u64(a, b);
}
-// CHECK-LABEL: @test_vst4_s8(
-// CHECK: [[B:%.*]] = alloca %struct.int8x8x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
-// CHECK: call void @llvm.arm.neon.vst4.p0.v8i8(ptr %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i32 1)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst4_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT8X8X4_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT8X8X4_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X4_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i32 0, i32 3
+// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v8i8(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i32 1)
+// CHECK-NEXT: ret void
+//
void test_vst4_s8(int8_t * a, int8x8x4_t b) {
vst4_s8(a, b);
}
-// CHECK-LABEL: @test_vst4_s16(
-// CHECK: [[B:%.*]] = alloca %struct.int16x4x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP10:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
-// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
-// CHECK: call void @llvm.arm.neon.vst4.p0.v4i16(ptr %a, <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst4_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT16X4X4_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT16X4X4_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X4X4_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i32 0, i32 3
+// CHECK-NEXT: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v4i16(ptr [[A]], <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i32 2)
+// CHECK-NEXT: ret void
+//
void test_vst4_s16(int16_t * a, int16x4x4_t b) {
vst4_s16(a, b);
}
-// CHECK-LABEL: @test_vst4_s32(
-// CHECK: [[B:%.*]] = alloca %struct.int32x2x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP10:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP11:%.*]] = bitcast <2 x i32> [[TMP10]] to <8 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
-// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x i32>
-// CHECK: call void @llvm.arm.neon.vst4.p0.v2i32(ptr %a, <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst4_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT32X2X4_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT32X2X4_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X2X4_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
+// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL5]], i32 0, i32 3
+// CHECK-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
+// CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
+// CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v2i32(ptr [[A]], <2 x i32> [[TMP8]], <2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], i32 4)
+// CHECK-NEXT: ret void
+//
void test_vst4_s32(int32_t * a, int32x2x4_t b) {
vst4_s32(a, b);
}
-// CHECK-LABEL: @test_vst4_s64(
-// CHECK: [[B:%.*]] = alloca %struct.int64x1x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP10:%.*]] = load <1 x i64>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
-// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
-// CHECK: call void @llvm.arm.neon.vst4.p0.v1i64(ptr %a, <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], <1 x i64> [[TMP15]], i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst4_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT64X1X4_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT64X1X4_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT64X1X4_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT64X1X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT64X1X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_INT64X1X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP4:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
+// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_INT64X1X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL5]], i32 0, i32 3
+// CHECK-NEXT: [[TMP6:%.*]] = load <1 x i64>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64>
+// CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
+// CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v1i64(ptr [[A]], <1 x i64> [[TMP8]], <1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], i32 4)
+// CHECK-NEXT: ret void
+//
void test_vst4_s64(int64_t * a, int64x1x4_t b) {
vst4_s64(a, b);
}
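The 64-bit variants go through the same vst4 intrinsic with <1 x i64> operands. With a single lane per vector there is nothing to interleave, so the effect is a contiguous store of the four elements; a minimal sketch under the same illustrative-naming assumption:

#include <arm_neon.h>

/* out = { v.val[0][0], v.val[1][0], v.val[2][0], v.val[3][0] } */
void store4_s64_sketch(int64_t *out, int64x1x4_t v) {
  vst4_s64(out, v); /* 32 contiguous bytes; no element interleaving is possible */
}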
-// CHECK-LABEL: @test_vst4_f16(
-// CHECK: [[B:%.*]] = alloca %struct.float16x4x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <4 x half>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <4 x half> [[TMP8]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP10:%.*]] = load <4 x half>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP11:%.*]] = bitcast <4 x half> [[TMP10]] to <8 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x half>
-// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x half>
-// CHECK: call void @llvm.arm.neon.vst4.p0.v4f16(ptr %a, <4 x half> [[TMP12]], <4 x half> [[TMP13]], <4 x half> [[TMP14]], <4 x half> [[TMP15]], i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst4_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT16X4X4_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X4X4_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X4X4_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x half> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP4:%.*]] = load <4 x half>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
+// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL5]], i32 0, i32 3
+// CHECK-NEXT: [[TMP6:%.*]] = load <4 x half>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half>
+// CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
+// CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v4f16(ptr [[A]], <4 x half> [[TMP8]], <4 x half> [[TMP9]], <4 x half> [[TMP10]], <4 x half> [[TMP11]], i32 2)
+// CHECK-NEXT: ret void
+//
void test_vst4_f16(float16_t * a, float16x4x4_t b) {
vst4_f16(a, b);
}
-// CHECK-LABEL: @test_vst4_f32(
-// CHECK: [[B:%.*]] = alloca %struct.float32x2x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <2 x float> [[TMP8]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP10:%.*]] = load <2 x float>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP11:%.*]] = bitcast <2 x float> [[TMP10]] to <8 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x float>
-// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x float>
-// CHECK: call void @llvm.arm.neon.vst4.p0.v2f32(ptr %a, <2 x float> [[TMP12]], <2 x float> [[TMP13]], <2 x float> [[TMP14]], <2 x float> [[TMP15]], i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst4_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT32X2X4_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X2X4_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X2X4_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
+// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL5]], i32 0, i32 3
+// CHECK-NEXT: [[TMP6:%.*]] = load <2 x float>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float>
+// CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
+// CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v2f32(ptr [[A]], <2 x float> [[TMP8]], <2 x float> [[TMP9]], <2 x float> [[TMP10]], <2 x float> [[TMP11]], i32 4)
+// CHECK-NEXT: ret void
+//
void test_vst4_f32(float32_t * a, float32x2x4_t b) {
vst4_f32(a, b);
}
-// CHECK-LABEL: @test_vst4_p8(
-// CHECK: [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
-// CHECK: call void @llvm.arm.neon.vst4.p0.v8i8(ptr %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i32 1)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst4_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY8X8X4_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY8X8X4_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X4_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i32 0, i32 3
+// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v8i8(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i32 1)
+// CHECK-NEXT: ret void
+//
void test_vst4_p8(poly8_t * a, poly8x8x4_t b) {
vst4_p8(a, b);
}
-// CHECK-LABEL: @test_vst4_p16(
-// CHECK: [[B:%.*]] = alloca %struct.poly16x4x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP10:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
-// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
-// CHECK: call void @llvm.arm.neon.vst4.p0.v4i16(ptr %a, <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst4_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY16X4X4_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY16X4X4_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X4X4_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i32 0, i32 3
+// CHECK-NEXT: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v4i16(ptr [[A]], <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i32 2)
+// CHECK-NEXT: ret void
+//
void test_vst4_p16(poly16_t * a, poly16x4x4_t b) {
vst4_p16(a, b);
}
-// CHECK-LABEL: @test_vst4q_lane_u16(
-// CHECK: [[B:%.*]] = alloca %struct.uint16x8x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP10:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
-// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
-// CHECK: call void @llvm.arm.neon.vst4lane.p0.v8i16(ptr %a, <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i32 7, i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst4q_lane_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT16X8X4_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT16X8X4_T]], align 16
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X8X4_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [8 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i32 0, i32 3
+// CHECK-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x i16>
+// CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4lane.p0.v8i16(ptr [[A]], <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i32 7, i32 2)
+// CHECK-NEXT: ret void
+//
void test_vst4q_lane_u16(uint16_t * a, uint16x8x4_t b) {
vst4q_lane_u16(a, b, 7);
}
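The q-register lane variants checked from here on pass two trailing i32 operands to @llvm.arm.neon.vst4lane: the lane index first (7 above), then the alignment (2). Only the selected lane of each of the four vectors is written. A minimal sketch, again with an illustrative wrapper name:

#include <arm_neon.h>

/* out = { v.val[0][7], v.val[1][7], v.val[2][7], v.val[3][7] }  (8 bytes) */
void store4q_lane_u16_sketch(uint16_t *out, uint16x8x4_t v) {
  vst4q_lane_u16(out, v, 7); /* lane must be a compile-time constant in [0, 7] */
}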
-// CHECK-LABEL: @test_vst4q_lane_u32(
-// CHECK: [[B:%.*]] = alloca %struct.uint32x4x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP10:%.*]] = load <4 x i32>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
-// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x i32>
-// CHECK: call void @llvm.arm.neon.vst4lane.p0.v4i32(ptr %a, <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], i32 3, i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst4q_lane_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT32X4X4_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT32X4X4_T]], align 16
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X4X4_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [8 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
+// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL5]], i32 0, i32 3
+// CHECK-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x i32>
+// CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
+// CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4lane.p0.v4i32(ptr [[A]], <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], i32 3, i32 4)
+// CHECK-NEXT: ret void
+//
void test_vst4q_lane_u32(uint32_t * a, uint32x4x4_t b) {
vst4q_lane_u32(a, b, 3);
}
-// CHECK-LABEL: @test_vst4q_lane_s16(
-// CHECK: [[B:%.*]] = alloca %struct.int16x8x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP10:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
-// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
-// CHECK: call void @llvm.arm.neon.vst4lane.p0.v8i16(ptr %a, <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i32 7, i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst4q_lane_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT16X8X4_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT16X8X4_T]], align 16
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X8X4_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [8 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i32 0, i32 3
+// CHECK-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x i16>
+// CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4lane.p0.v8i16(ptr [[A]], <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i32 7, i32 2)
+// CHECK-NEXT: ret void
+//
void test_vst4q_lane_s16(int16_t * a, int16x8x4_t b) {
vst4q_lane_s16(a, b, 7);
}
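// Editor's note (an addition, not part of the generated patch): every
// q-register vst4 lane test in this stretch follows the same shape. The
// four-vector struct argument is passed as a coerced [8 x i64], spilled
// to an alloca, copied into the __s1 temporary with llvm.memcpy, and each
// member vector is loaded and round-tripped through <16 x i8> before the
// call. In the trailing operands of @llvm.arm.neon.vst4lane, the first
// i32 is the lane index and the second is the alignment in bytes, which
// matches the element size (i32 7, i32 2 for <8 x i16>; i32 3, i32 4 for
// <4 x i32>).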
-// CHECK-LABEL: @test_vst4q_lane_s32(
-// CHECK: [[B:%.*]] = alloca %struct.int32x4x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP10:%.*]] = load <4 x i32>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
-// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x i32>
-// CHECK: call void @llvm.arm.neon.vst4lane.p0.v4i32(ptr %a, <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], i32 3, i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst4q_lane_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT32X4X4_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT32X4X4_T]], align 16
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X4X4_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [8 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
+// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL5]], i32 0, i32 3
+// CHECK-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x i32>
+// CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
+// CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4lane.p0.v4i32(ptr [[A]], <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], i32 3, i32 4)
+// CHECK-NEXT: ret void
+//
void test_vst4q_lane_s32(int32_t * a, int32x4x4_t b) {
vst4q_lane_s32(a, b, 3);
}
-// CHECK-LABEL: @test_vst4q_lane_f16(
-// CHECK: [[B:%.*]] = alloca %struct.float16x8x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <8 x half>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP10:%.*]] = load <8 x half>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP11:%.*]] = bitcast <8 x half> [[TMP10]] to <16 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x half>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x half>
-// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x half>
-// CHECK: call void @llvm.arm.neon.vst4lane.p0.v8f16(ptr %a, <8 x half> [[TMP12]], <8 x half> [[TMP13]], <8 x half> [[TMP14]], <8 x half> [[TMP15]], i32 7, i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst4q_lane_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT16X8X4_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X8X4_T]], align 16
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X8X4_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [8 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x half> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP4:%.*]] = load <8 x half>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
+// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL5]], i32 0, i32 3
+// CHECK-NEXT: [[TMP6:%.*]] = load <8 x half>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half>
+// CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
+// CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x half>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4lane.p0.v8f16(ptr [[A]], <8 x half> [[TMP8]], <8 x half> [[TMP9]], <8 x half> [[TMP10]], <8 x half> [[TMP11]], i32 7, i32 2)
+// CHECK-NEXT: ret void
+//
void test_vst4q_lane_f16(float16_t * a, float16x8x4_t b) {
vst4q_lane_f16(a, b, 7);
}
-// CHECK-LABEL: @test_vst4q_lane_f32(
-// CHECK: [[B:%.*]] = alloca %struct.float32x4x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <4 x float>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <4 x float> [[TMP8]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP10:%.*]] = load <4 x float>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP11:%.*]] = bitcast <4 x float> [[TMP10]] to <16 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x float>
-// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x float>
-// CHECK: call void @llvm.arm.neon.vst4lane.p0.v4f32(ptr %a, <4 x float> [[TMP12]], <4 x float> [[TMP13]], <4 x float> [[TMP14]], <4 x float> [[TMP15]], i32 3, i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst4q_lane_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT32X4X4_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X4X4_T]], align 16
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X4X4_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [8 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP4:%.*]] = load <4 x float>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
+// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL5]], i32 0, i32 3
+// CHECK-NEXT: [[TMP6:%.*]] = load <4 x float>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float>
+// CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
+// CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4lane.p0.v4f32(ptr [[A]], <4 x float> [[TMP8]], <4 x float> [[TMP9]], <4 x float> [[TMP10]], <4 x float> [[TMP11]], i32 3, i32 4)
+// CHECK-NEXT: ret void
+//
void test_vst4q_lane_f32(float32_t * a, float32x4x4_t b) {
vst4q_lane_f32(a, b, 3);
}
-// CHECK-LABEL: @test_vst4q_lane_p16(
-// CHECK: [[B:%.*]] = alloca %struct.poly16x8x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP10:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
-// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
-// CHECK: call void @llvm.arm.neon.vst4lane.p0.v8i16(ptr %a, <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i32 7, i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst4q_lane_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY16X8X4_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY16X8X4_T]], align 16
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X8X4_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [8 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i32 0, i32 3
+// CHECK-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x i16>
+// CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4lane.p0.v8i16(ptr [[A]], <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i32 7, i32 2)
+// CHECK-NEXT: ret void
+//
void test_vst4q_lane_p16(poly16_t * a, poly16x8x4_t b) {
vst4q_lane_p16(a, b, 7);
}
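// Editor's note (an addition): the d-register variants below repeat the
// pattern with half-width containers: the xx4_t struct arrives as a
// coerced [4 x i64], stored with alignment 8 and copied with a 32-byte
// memcpy, the 8-bit cases skip the bitcasts entirely, and the intrinsic's
// alignment operand again tracks the element size (i32 1 for i8, i32 2
// for i16, i32 4 for i32 and float).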
-// CHECK-LABEL: @test_vst4_lane_u8(
-// CHECK: [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
-// CHECK: call void @llvm.arm.neon.vst4lane.p0.v8i8(ptr %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i32 7, i32 1)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst4_lane_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT8X8X4_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT8X8X4_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X4_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i32 0, i32 3
+// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: call void @llvm.arm.neon.vst4lane.p0.v8i8(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i32 7, i32 1)
+// CHECK-NEXT: ret void
+//
void test_vst4_lane_u8(uint8_t * a, uint8x8x4_t b) {
vst4_lane_u8(a, b, 7);
}
-// CHECK-LABEL: @test_vst4_lane_u16(
-// CHECK: [[B:%.*]] = alloca %struct.uint16x4x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP10:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
-// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
-// CHECK: call void @llvm.arm.neon.vst4lane.p0.v4i16(ptr %a, <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i32 3, i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst4_lane_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT16X4X4_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT16X4X4_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X4X4_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i32 0, i32 3
+// CHECK-NEXT: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4lane.p0.v4i16(ptr [[A]], <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i32 3, i32 2)
+// CHECK-NEXT: ret void
+//
void test_vst4_lane_u16(uint16_t * a, uint16x4x4_t b) {
vst4_lane_u16(a, b, 3);
}
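// A minimal reference sketch (editor's addition; the helper name
// vst4_lane_u16_ref is hypothetical, and only the public arm_neon.h API
// is assumed): the store verified by the checks above for
// vst4_lane_u16(a, b, 3) writes lane 3 of each member vector to four
// consecutive halfwords, i.e. it is observably equivalent to:

#include <arm_neon.h>

static void vst4_lane_u16_ref(uint16_t *a, uint16x4x4_t b) {
  // The same lane of each of the four vectors, stored interleaved
  // at a[0..3]; the lane index must be a compile-time constant.
  a[0] = vget_lane_u16(b.val[0], 3);
  a[1] = vget_lane_u16(b.val[1], 3);
  a[2] = vget_lane_u16(b.val[2], 3);
  a[3] = vget_lane_u16(b.val[3], 3);
}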
-// CHECK-LABEL: @test_vst4_lane_u32(
-// CHECK: [[B:%.*]] = alloca %struct.uint32x2x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP10:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP11:%.*]] = bitcast <2 x i32> [[TMP10]] to <8 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
-// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x i32>
-// CHECK: call void @llvm.arm.neon.vst4lane.p0.v2i32(ptr %a, <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], i32 1, i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst4_lane_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT32X2X4_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT32X2X4_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X2X4_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
+// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL5]], i32 0, i32 3
+// CHECK-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
+// CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
+// CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4lane.p0.v2i32(ptr [[A]], <2 x i32> [[TMP8]], <2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], i32 1, i32 4)
+// CHECK-NEXT: ret void
+//
void test_vst4_lane_u32(uint32_t * a, uint32x2x4_t b) {
vst4_lane_u32(a, b, 1);
}
-// CHECK-LABEL: @test_vst4_lane_s8(
-// CHECK: [[B:%.*]] = alloca %struct.int8x8x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
-// CHECK: call void @llvm.arm.neon.vst4lane.p0.v8i8(ptr %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i32 7, i32 1)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst4_lane_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT8X8X4_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT8X8X4_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X4_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i32 0, i32 3
+// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: call void @llvm.arm.neon.vst4lane.p0.v8i8(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i32 7, i32 1)
+// CHECK-NEXT: ret void
+//
void test_vst4_lane_s8(int8_t * a, int8x8x4_t b) {
vst4_lane_s8(a, b, 7);
}
-// CHECK-LABEL: @test_vst4_lane_s16(
-// CHECK: [[B:%.*]] = alloca %struct.int16x4x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP10:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
-// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
-// CHECK: call void @llvm.arm.neon.vst4lane.p0.v4i16(ptr %a, <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i32 3, i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst4_lane_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT16X4X4_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT16X4X4_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X4X4_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i32 0, i32 3
+// CHECK-NEXT: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4lane.p0.v4i16(ptr [[A]], <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i32 3, i32 2)
+// CHECK-NEXT: ret void
+//
void test_vst4_lane_s16(int16_t * a, int16x4x4_t b) {
vst4_lane_s16(a, b, 3);
}
-// CHECK-LABEL: @test_vst4_lane_s32(
-// CHECK: [[B:%.*]] = alloca %struct.int32x2x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP10:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP11:%.*]] = bitcast <2 x i32> [[TMP10]] to <8 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
-// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x i32>
-// CHECK: call void @llvm.arm.neon.vst4lane.p0.v2i32(ptr %a, <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], i32 1, i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst4_lane_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT32X2X4_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT32X2X4_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X2X4_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
+// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL5]], i32 0, i32 3
+// CHECK-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
+// CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
+// CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4lane.p0.v2i32(ptr [[A]], <2 x i32> [[TMP8]], <2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], i32 1, i32 4)
+// CHECK-NEXT: ret void
+//
void test_vst4_lane_s32(int32_t * a, int32x2x4_t b) {
vst4_lane_s32(a, b, 1);
}
-// CHECK-LABEL: @test_vst4_lane_f16(
-// CHECK: [[B:%.*]] = alloca %struct.float16x4x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <4 x half>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <4 x half> [[TMP8]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP10:%.*]] = load <4 x half>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP11:%.*]] = bitcast <4 x half> [[TMP10]] to <8 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x half>
-// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x half>
-// CHECK: call void @llvm.arm.neon.vst4lane.p0.v4f16(ptr %a, <4 x half> [[TMP12]], <4 x half> [[TMP13]], <4 x half> [[TMP14]], <4 x half> [[TMP15]], i32 3, i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst4_lane_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT16X4X4_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X4X4_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X4X4_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x half> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP4:%.*]] = load <4 x half>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
+// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL5]], i32 0, i32 3
+// CHECK-NEXT: [[TMP6:%.*]] = load <4 x half>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half>
+// CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
+// CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4lane.p0.v4f16(ptr [[A]], <4 x half> [[TMP8]], <4 x half> [[TMP9]], <4 x half> [[TMP10]], <4 x half> [[TMP11]], i32 3, i32 2)
+// CHECK-NEXT: ret void
+//
void test_vst4_lane_f16(float16_t * a, float16x4x4_t b) {
vst4_lane_f16(a, b, 3);
}
-// CHECK-LABEL: @test_vst4_lane_f32(
-// CHECK: [[B:%.*]] = alloca %struct.float32x2x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <2 x float> [[TMP8]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP10:%.*]] = load <2 x float>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP11:%.*]] = bitcast <2 x float> [[TMP10]] to <8 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x float>
-// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x float>
-// CHECK: call void @llvm.arm.neon.vst4lane.p0.v2f32(ptr %a, <2 x float> [[TMP12]], <2 x float> [[TMP13]], <2 x float> [[TMP14]], <2 x float> [[TMP15]], i32 1, i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst4_lane_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT32X2X4_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X2X4_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X2X4_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
+// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL5]], i32 0, i32 3
+// CHECK-NEXT: [[TMP6:%.*]] = load <2 x float>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float>
+// CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
+// CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4lane.p0.v2f32(ptr [[A]], <2 x float> [[TMP8]], <2 x float> [[TMP9]], <2 x float> [[TMP10]], <2 x float> [[TMP11]], i32 1, i32 4)
+// CHECK-NEXT: ret void
+//
void test_vst4_lane_f32(float32_t * a, float32x2x4_t b) {
vst4_lane_f32(a, b, 1);
}
-// CHECK-LABEL: @test_vst4_lane_p8(
-// CHECK: [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
-// CHECK: call void @llvm.arm.neon.vst4lane.p0.v8i8(ptr %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i32 7, i32 1)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst4_lane_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY8X8X4_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY8X8X4_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X4_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i32 0, i32 3
+// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: call void @llvm.arm.neon.vst4lane.p0.v8i8(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i32 7, i32 1)
+// CHECK-NEXT: ret void
+//
void test_vst4_lane_p8(poly8_t * a, poly8x8x4_t b) {
vst4_lane_p8(a, b, 7);
}
-// CHECK-LABEL: @test_vst4_lane_p16(
-// CHECK: [[B:%.*]] = alloca %struct.poly16x4x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP10:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
-// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
-// CHECK: call void @llvm.arm.neon.vst4lane.p0.v4i16(ptr %a, <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i32 3, i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst4_lane_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY16X4X4_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY16X4X4_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X4X4_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2
+// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X4X4_T]], ptr [[__S1]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i32 0, i32 3
+// CHECK-NEXT: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4lane.p0.v4i16(ptr [[A]], <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i32 3, i32 2)
+// CHECK-NEXT: ret void
+//
void test_vst4_lane_p16(poly16_t * a, poly16x4x4_t b) {
vst4_lane_p16(a, b, 3);
}
-// CHECK-LABEL: @test_vsub_s8(
-// CHECK: [[SUB_I:%.*]] = sub <8 x i8> %a, %b
-// CHECK: ret <8 x i8> [[SUB_I]]
+// CHECK-LABEL: define <8 x i8> @test_vsub_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i8> [[A]], [[B]]
+// CHECK-NEXT: ret <8 x i8> [[SUB_I]]
+//
int8x8_t test_vsub_s8(int8x8_t a, int8x8_t b) {
return vsub_s8(a, b);
}
-// CHECK-LABEL: @test_vsub_s16(
-// CHECK: [[SUB_I:%.*]] = sub <4 x i16> %a, %b
-// CHECK: ret <4 x i16> [[SUB_I]]
+// CHECK-LABEL: define <4 x i16> @test_vsub_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i16> [[A]], [[B]]
+// CHECK-NEXT: ret <4 x i16> [[SUB_I]]
+//
int16x4_t test_vsub_s16(int16x4_t a, int16x4_t b) {
return vsub_s16(a, b);
}
-// CHECK-LABEL: @test_vsub_s32(
-// CHECK: [[SUB_I:%.*]] = sub <2 x i32> %a, %b
-// CHECK: ret <2 x i32> [[SUB_I]]
+// CHECK-LABEL: define <2 x i32> @test_vsub_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i32> [[A]], [[B]]
+// CHECK-NEXT: ret <2 x i32> [[SUB_I]]
+//
int32x2_t test_vsub_s32(int32x2_t a, int32x2_t b) {
return vsub_s32(a, b);
}
-// CHECK-LABEL: @test_vsub_s64(
-// CHECK: [[SUB_I:%.*]] = sub <1 x i64> %a, %b
-// CHECK: ret <1 x i64> [[SUB_I]]
+// CHECK-LABEL: define <1 x i64> @test_vsub_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <1 x i64> [[A]], [[B]]
+// CHECK-NEXT: ret <1 x i64> [[SUB_I]]
+//
int64x1_t test_vsub_s64(int64x1_t a, int64x1_t b) {
return vsub_s64(a, b);
}
-// CHECK-LABEL: @test_vsub_f32(
-// CHECK: [[SUB_I:%.*]] = fsub <2 x float> %a, %b
-// CHECK: ret <2 x float> [[SUB_I]]
+// CHECK-LABEL: define <2 x float> @test_vsub_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = fsub <2 x float> [[A]], [[B]]
+// CHECK-NEXT: ret <2 x float> [[SUB_I]]
+//
float32x2_t test_vsub_f32(float32x2_t a, float32x2_t b) {
return vsub_f32(a, b);
}
-// CHECK-LABEL: @test_vsub_u8(
-// CHECK: [[SUB_I:%.*]] = sub <8 x i8> %a, %b
-// CHECK: ret <8 x i8> [[SUB_I]]
+// CHECK-LABEL: define <8 x i8> @test_vsub_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i8> [[A]], [[B]]
+// CHECK-NEXT: ret <8 x i8> [[SUB_I]]
+//
uint8x8_t test_vsub_u8(uint8x8_t a, uint8x8_t b) {
return vsub_u8(a, b);
}
-// CHECK-LABEL: @test_vsub_u16(
-// CHECK: [[SUB_I:%.*]] = sub <4 x i16> %a, %b
-// CHECK: ret <4 x i16> [[SUB_I]]
+// CHECK-LABEL: define <4 x i16> @test_vsub_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i16> [[A]], [[B]]
+// CHECK-NEXT: ret <4 x i16> [[SUB_I]]
+//
uint16x4_t test_vsub_u16(uint16x4_t a, uint16x4_t b) {
return vsub_u16(a, b);
}
-// CHECK-LABEL: @test_vsub_u32(
-// CHECK: [[SUB_I:%.*]] = sub <2 x i32> %a, %b
-// CHECK: ret <2 x i32> [[SUB_I]]
+// CHECK-LABEL: define <2 x i32> @test_vsub_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i32> [[A]], [[B]]
+// CHECK-NEXT: ret <2 x i32> [[SUB_I]]
+//
uint32x2_t test_vsub_u32(uint32x2_t a, uint32x2_t b) {
return vsub_u32(a, b);
}
-// CHECK-LABEL: @test_vsub_u64(
-// CHECK: [[SUB_I:%.*]] = sub <1 x i64> %a, %b
-// CHECK: ret <1 x i64> [[SUB_I]]
+// CHECK-LABEL: define <1 x i64> @test_vsub_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <1 x i64> [[A]], [[B]]
+// CHECK-NEXT: ret <1 x i64> [[SUB_I]]
+//
uint64x1_t test_vsub_u64(uint64x1_t a, uint64x1_t b) {
return vsub_u64(a, b);
}
-// CHECK-LABEL: @test_vsubq_s8(
-// CHECK: [[SUB_I:%.*]] = sub <16 x i8> %a, %b
-// CHECK: ret <16 x i8> [[SUB_I]]
+// CHECK-LABEL: define <16 x i8> @test_vsubq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <16 x i8> [[A]], [[B]]
+// CHECK-NEXT: ret <16 x i8> [[SUB_I]]
+//
int8x16_t test_vsubq_s8(int8x16_t a, int8x16_t b) {
return vsubq_s8(a, b);
}
-// CHECK-LABEL: @test_vsubq_s16(
-// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, %b
-// CHECK: ret <8 x i16> [[SUB_I]]
+// CHECK-LABEL: define <8 x i16> @test_vsubq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> [[A]], [[B]]
+// CHECK-NEXT: ret <8 x i16> [[SUB_I]]
+//
int16x8_t test_vsubq_s16(int16x8_t a, int16x8_t b) {
return vsubq_s16(a, b);
}
-// CHECK-LABEL: @test_vsubq_s32(
-// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, %b
-// CHECK: ret <4 x i32> [[SUB_I]]
+// CHECK-LABEL: define <4 x i32> @test_vsubq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A]], [[B]]
+// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
+//
int32x4_t test_vsubq_s32(int32x4_t a, int32x4_t b) {
return vsubq_s32(a, b);
}
-// CHECK-LABEL: @test_vsubq_s64(
-// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, %b
-// CHECK: ret <2 x i64> [[SUB_I]]
+// CHECK-LABEL: define <2 x i64> @test_vsubq_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[A]], [[B]]
+// CHECK-NEXT: ret <2 x i64> [[SUB_I]]
+//
int64x2_t test_vsubq_s64(int64x2_t a, int64x2_t b) {
return vsubq_s64(a, b);
}
-// CHECK-LABEL: @test_vsubq_f32(
-// CHECK: [[SUB_I:%.*]] = fsub <4 x float> %a, %b
-// CHECK: ret <4 x float> [[SUB_I]]
+// CHECK-LABEL: define <4 x float> @test_vsubq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = fsub <4 x float> [[A]], [[B]]
+// CHECK-NEXT: ret <4 x float> [[SUB_I]]
+//
float32x4_t test_vsubq_f32(float32x4_t a, float32x4_t b) {
return vsubq_f32(a, b);
}
-// CHECK-LABEL: @test_vsubq_u8(
-// CHECK: [[SUB_I:%.*]] = sub <16 x i8> %a, %b
-// CHECK: ret <16 x i8> [[SUB_I]]
+// CHECK-LABEL: define <16 x i8> @test_vsubq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <16 x i8> [[A]], [[B]]
+// CHECK-NEXT: ret <16 x i8> [[SUB_I]]
+//
uint8x16_t test_vsubq_u8(uint8x16_t a, uint8x16_t b) {
return vsubq_u8(a, b);
}
-// CHECK-LABEL: @test_vsubq_u16(
-// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, %b
-// CHECK: ret <8 x i16> [[SUB_I]]
+// CHECK-LABEL: define <8 x i16> @test_vsubq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> [[A]], [[B]]
+// CHECK-NEXT: ret <8 x i16> [[SUB_I]]
+//
uint16x8_t test_vsubq_u16(uint16x8_t a, uint16x8_t b) {
return vsubq_u16(a, b);
}
-// CHECK-LABEL: @test_vsubq_u32(
-// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, %b
-// CHECK: ret <4 x i32> [[SUB_I]]
+// CHECK-LABEL: define <4 x i32> @test_vsubq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A]], [[B]]
+// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
+//
uint32x4_t test_vsubq_u32(uint32x4_t a, uint32x4_t b) {
return vsubq_u32(a, b);
}
-// CHECK-LABEL: @test_vsubq_u64(
-// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, %b
-// CHECK: ret <2 x i64> [[SUB_I]]
+// CHECK-LABEL: define <2 x i64> @test_vsubq_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[A]], [[B]]
+// CHECK-NEXT: ret <2 x i64> [[SUB_I]]
+//
uint64x2_t test_vsubq_u64(uint64x2_t a, uint64x2_t b) {
return vsubq_u64(a, b);
}
-// CHECK-LABEL: @test_vsubhn_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VSUBHN_I:%.*]] = sub <8 x i16> %a, %b
-// CHECK: [[VSUBHN1_I:%.*]] = lshr <8 x i16> [[VSUBHN_I]], splat (i16 8)
-// CHECK: [[VSUBHN2_I:%.*]] = trunc <8 x i16> [[VSUBHN1_I]] to <8 x i8>
-// CHECK: ret <8 x i8> [[VSUBHN2_I]]
+// CHECK-LABEL: define <8 x i8> @test_vsubhn_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VSUBHN_I:%.*]] = sub <8 x i16> [[A]], [[B]]
+// CHECK-NEXT: [[VSUBHN1_I:%.*]] = lshr <8 x i16> [[VSUBHN_I]], splat (i16 8)
+// CHECK-NEXT: [[VSUBHN2_I:%.*]] = trunc <8 x i16> [[VSUBHN1_I]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[VSUBHN2_I]]
+//
int8x8_t test_vsubhn_s16(int16x8_t a, int16x8_t b) {
return vsubhn_s16(a, b);
}
-// CHECK-LABEL: @test_vsubhn_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VSUBHN_I:%.*]] = sub <4 x i32> %a, %b
-// CHECK: [[VSUBHN1_I:%.*]] = lshr <4 x i32> [[VSUBHN_I]], splat (i32 16)
-// CHECK: [[VSUBHN2_I:%.*]] = trunc <4 x i32> [[VSUBHN1_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[VSUBHN2_I]]
+// CHECK-LABEL: define <4 x i16> @test_vsubhn_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VSUBHN_I:%.*]] = sub <4 x i32> [[A]], [[B]]
+// CHECK-NEXT: [[VSUBHN1_I:%.*]] = lshr <4 x i32> [[VSUBHN_I]], splat (i32 16)
+// CHECK-NEXT: [[VSUBHN2_I:%.*]] = trunc <4 x i32> [[VSUBHN1_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[VSUBHN2_I]]
+//
int16x4_t test_vsubhn_s32(int32x4_t a, int32x4_t b) {
return vsubhn_s32(a, b);
}
-// CHECK-LABEL: @test_vsubhn_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VSUBHN_I:%.*]] = sub <2 x i64> %a, %b
-// CHECK: [[VSUBHN1_I:%.*]] = lshr <2 x i64> [[VSUBHN_I]], splat (i64 32)
-// CHECK: [[VSUBHN2_I:%.*]] = trunc <2 x i64> [[VSUBHN1_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[VSUBHN2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vsubhn_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VSUBHN_I:%.*]] = sub <2 x i64> [[A]], [[B]]
+// CHECK-NEXT: [[VSUBHN1_I:%.*]] = lshr <2 x i64> [[VSUBHN_I]], splat (i64 32)
+// CHECK-NEXT: [[VSUBHN2_I:%.*]] = trunc <2 x i64> [[VSUBHN1_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[VSUBHN2_I]]
+//
int32x2_t test_vsubhn_s64(int64x2_t a, int64x2_t b) {
return vsubhn_s64(a, b);
}
-// CHECK-LABEL: @test_vsubhn_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VSUBHN_I:%.*]] = sub <8 x i16> %a, %b
-// CHECK: [[VSUBHN1_I:%.*]] = lshr <8 x i16> [[VSUBHN_I]], splat (i16 8)
-// CHECK: [[VSUBHN2_I:%.*]] = trunc <8 x i16> [[VSUBHN1_I]] to <8 x i8>
-// CHECK: ret <8 x i8> [[VSUBHN2_I]]
+// CHECK-LABEL: define <8 x i8> @test_vsubhn_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VSUBHN_I:%.*]] = sub <8 x i16> [[A]], [[B]]
+// CHECK-NEXT: [[VSUBHN1_I:%.*]] = lshr <8 x i16> [[VSUBHN_I]], splat (i16 8)
+// CHECK-NEXT: [[VSUBHN2_I:%.*]] = trunc <8 x i16> [[VSUBHN1_I]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[VSUBHN2_I]]
+//
uint8x8_t test_vsubhn_u16(uint16x8_t a, uint16x8_t b) {
return vsubhn_u16(a, b);
}
-// CHECK-LABEL: @test_vsubhn_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VSUBHN_I:%.*]] = sub <4 x i32> %a, %b
-// CHECK: [[VSUBHN1_I:%.*]] = lshr <4 x i32> [[VSUBHN_I]], splat (i32 16)
-// CHECK: [[VSUBHN2_I:%.*]] = trunc <4 x i32> [[VSUBHN1_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[VSUBHN2_I]]
+// CHECK-LABEL: define <4 x i16> @test_vsubhn_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VSUBHN_I:%.*]] = sub <4 x i32> [[A]], [[B]]
+// CHECK-NEXT: [[VSUBHN1_I:%.*]] = lshr <4 x i32> [[VSUBHN_I]], splat (i32 16)
+// CHECK-NEXT: [[VSUBHN2_I:%.*]] = trunc <4 x i32> [[VSUBHN1_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[VSUBHN2_I]]
+//
uint16x4_t test_vsubhn_u32(uint32x4_t a, uint32x4_t b) {
return vsubhn_u32(a, b);
}
-// CHECK-LABEL: @test_vsubhn_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VSUBHN_I:%.*]] = sub <2 x i64> %a, %b
-// CHECK: [[VSUBHN1_I:%.*]] = lshr <2 x i64> [[VSUBHN_I]], splat (i64 32)
-// CHECK: [[VSUBHN2_I:%.*]] = trunc <2 x i64> [[VSUBHN1_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[VSUBHN2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vsubhn_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VSUBHN_I:%.*]] = sub <2 x i64> [[A]], [[B]]
+// CHECK-NEXT: [[VSUBHN1_I:%.*]] = lshr <2 x i64> [[VSUBHN_I]], splat (i64 32)
+// CHECK-NEXT: [[VSUBHN2_I:%.*]] = trunc <2 x i64> [[VSUBHN1_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[VSUBHN2_I]]
+//
uint32x2_t test_vsubhn_u64(uint64x2_t a, uint64x2_t b) {
return vsubhn_u64(a, b);
}
-// CHECK-LABEL: @test_vsubl_s8(
-// CHECK: [[VMOVL_I_I:%.*]] = sext <8 x i8> %a to <8 x i16>
-// CHECK: [[VMOVL_I4_I:%.*]] = sext <8 x i8> %b to <8 x i16>
-// CHECK: [[SUB_I:%.*]] = sub <8 x i16> [[VMOVL_I_I]], [[VMOVL_I4_I]]
-// CHECK: ret <8 x i16> [[SUB_I]]
+// CHECK-LABEL: define <8 x i16> @test_vsubl_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I4_I:%.*]] = sext <8 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = sext <8 x i8> [[B]] to <8 x i16>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> [[VMOVL_I4_I]], [[VMOVL_I_I]]
+// CHECK-NEXT: ret <8 x i16> [[SUB_I]]
+//
int16x8_t test_vsubl_s8(int8x8_t a, int8x8_t b) {
return vsubl_s8(a, b);
}
-// CHECK-LABEL: @test_vsubl_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = sext <4 x i16> %a to <4 x i32>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VMOVL_I4_I:%.*]] = sext <4 x i16> %b to <4 x i32>
-// CHECK: [[SUB_I:%.*]] = sub <4 x i32> [[VMOVL_I_I]], [[VMOVL_I4_I]]
-// CHECK: ret <4 x i32> [[SUB_I]]
+// CHECK-LABEL: define <4 x i32> @test_vsubl_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VMOVL_I4_I:%.*]] = sext <4 x i16> [[A]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = sext <4 x i16> [[B]] to <4 x i32>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[VMOVL_I4_I]], [[VMOVL_I_I]]
+// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
+//
int32x4_t test_vsubl_s16(int16x4_t a, int16x4_t b) {
return vsubl_s16(a, b);
}
-// CHECK-LABEL: @test_vsubl_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = sext <2 x i32> %a to <2 x i64>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VMOVL_I4_I:%.*]] = sext <2 x i32> %b to <2 x i64>
-// CHECK: [[SUB_I:%.*]] = sub <2 x i64> [[VMOVL_I_I]], [[VMOVL_I4_I]]
-// CHECK: ret <2 x i64> [[SUB_I]]
+// CHECK-LABEL: define <2 x i64> @test_vsubl_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VMOVL_I4_I:%.*]] = sext <2 x i32> [[A]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = sext <2 x i32> [[B]] to <2 x i64>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[VMOVL_I4_I]], [[VMOVL_I_I]]
+// CHECK-NEXT: ret <2 x i64> [[SUB_I]]
+//
int64x2_t test_vsubl_s32(int32x2_t a, int32x2_t b) {
return vsubl_s32(a, b);
}
-// CHECK-LABEL: @test_vsubl_u8(
-// CHECK: [[VMOVL_I_I:%.*]] = zext <8 x i8> %a to <8 x i16>
-// CHECK: [[VMOVL_I4_I:%.*]] = zext <8 x i8> %b to <8 x i16>
-// CHECK: [[SUB_I:%.*]] = sub <8 x i16> [[VMOVL_I_I]], [[VMOVL_I4_I]]
-// CHECK: ret <8 x i16> [[SUB_I]]
+// CHECK-LABEL: define <8 x i16> @test_vsubl_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I4_I:%.*]] = zext <8 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = zext <8 x i8> [[B]] to <8 x i16>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> [[VMOVL_I4_I]], [[VMOVL_I_I]]
+// CHECK-NEXT: ret <8 x i16> [[SUB_I]]
+//
uint16x8_t test_vsubl_u8(uint8x8_t a, uint8x8_t b) {
return vsubl_u8(a, b);
}
-// CHECK-LABEL: @test_vsubl_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> %a to <4 x i32>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VMOVL_I4_I:%.*]] = zext <4 x i16> %b to <4 x i32>
-// CHECK: [[SUB_I:%.*]] = sub <4 x i32> [[VMOVL_I_I]], [[VMOVL_I4_I]]
-// CHECK: ret <4 x i32> [[SUB_I]]
+// CHECK-LABEL: define <4 x i32> @test_vsubl_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VMOVL_I4_I:%.*]] = zext <4 x i16> [[A]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = zext <4 x i16> [[B]] to <4 x i32>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[VMOVL_I4_I]], [[VMOVL_I_I]]
+// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
+//
uint32x4_t test_vsubl_u16(uint16x4_t a, uint16x4_t b) {
return vsubl_u16(a, b);
}
-// CHECK-LABEL: @test_vsubl_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> %a to <2 x i64>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VMOVL_I4_I:%.*]] = zext <2 x i32> %b to <2 x i64>
-// CHECK: [[SUB_I:%.*]] = sub <2 x i64> [[VMOVL_I_I]], [[VMOVL_I4_I]]
-// CHECK: ret <2 x i64> [[SUB_I]]
+// CHECK-LABEL: define <2 x i64> @test_vsubl_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VMOVL_I4_I:%.*]] = zext <2 x i32> [[A]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = zext <2 x i32> [[B]] to <2 x i64>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[VMOVL_I4_I]], [[VMOVL_I_I]]
+// CHECK-NEXT: ret <2 x i64> [[SUB_I]]
+//
uint64x2_t test_vsubl_u32(uint32x2_t a, uint32x2_t b) {
return vsubl_u32(a, b);
}
-// CHECK-LABEL: @test_vsubw_s8(
-// CHECK: [[VMOVL_I_I:%.*]] = sext <8 x i8> %b to <8 x i16>
-// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[VMOVL_I_I]]
-// CHECK: ret <8 x i16> [[SUB_I]]
+// CHECK-LABEL: define <8 x i16> @test_vsubw_s8(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = sext <8 x i8> [[B]] to <8 x i16>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> [[A]], [[VMOVL_I_I]]
+// CHECK-NEXT: ret <8 x i16> [[SUB_I]]
+//
int16x8_t test_vsubw_s8(int16x8_t a, int8x8_t b) {
return vsubw_s8(a, b);
}
-// CHECK-LABEL: @test_vsubw_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = sext <4 x i16> %b to <4 x i32>
-// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMOVL_I_I]]
-// CHECK: ret <4 x i32> [[SUB_I]]
+// CHECK-LABEL: define <4 x i32> @test_vsubw_s16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = sext <4 x i16> [[B]] to <4 x i32>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A]], [[VMOVL_I_I]]
+// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
+//
int32x4_t test_vsubw_s16(int32x4_t a, int16x4_t b) {
return vsubw_s16(a, b);
}
-// CHECK-LABEL: @test_vsubw_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = sext <2 x i32> %b to <2 x i64>
-// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMOVL_I_I]]
-// CHECK: ret <2 x i64> [[SUB_I]]
+// CHECK-LABEL: define <2 x i64> @test_vsubw_s32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = sext <2 x i32> [[B]] to <2 x i64>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[A]], [[VMOVL_I_I]]
+// CHECK-NEXT: ret <2 x i64> [[SUB_I]]
+//
int64x2_t test_vsubw_s32(int64x2_t a, int32x2_t b) {
return vsubw_s32(a, b);
}
-// CHECK-LABEL: @test_vsubw_u8(
-// CHECK: [[VMOVL_I_I:%.*]] = zext <8 x i8> %b to <8 x i16>
-// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[VMOVL_I_I]]
-// CHECK: ret <8 x i16> [[SUB_I]]
+// CHECK-LABEL: define <8 x i16> @test_vsubw_u8(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = zext <8 x i8> [[B]] to <8 x i16>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> [[A]], [[VMOVL_I_I]]
+// CHECK-NEXT: ret <8 x i16> [[SUB_I]]
+//
uint16x8_t test_vsubw_u8(uint16x8_t a, uint8x8_t b) {
return vsubw_u8(a, b);
}
-// CHECK-LABEL: @test_vsubw_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> %b to <4 x i32>
-// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMOVL_I_I]]
-// CHECK: ret <4 x i32> [[SUB_I]]
+// CHECK-LABEL: define <4 x i32> @test_vsubw_u16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = zext <4 x i16> [[B]] to <4 x i32>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A]], [[VMOVL_I_I]]
+// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
+//
uint32x4_t test_vsubw_u16(uint32x4_t a, uint16x4_t b) {
return vsubw_u16(a, b);
}
-// CHECK-LABEL: @test_vsubw_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> %b to <2 x i64>
-// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMOVL_I_I]]
-// CHECK: ret <2 x i64> [[SUB_I]]
+// CHECK-LABEL: define <2 x i64> @test_vsubw_u32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = zext <2 x i32> [[B]] to <2 x i64>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[A]], [[VMOVL_I_I]]
+// CHECK-NEXT: ret <2 x i64> [[SUB_I]]
+//
uint64x2_t test_vsubw_u32(uint64x2_t a, uint32x2_t b) {
return vsubw_u32(a, b);
}
-// CHECK-LABEL: @test_vtbl1_u8(
-// CHECK: [[VTBL1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VTBL1_I]]
+// CHECK-LABEL: define <8 x i8> @test_vtbl1_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VTBL1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VTBL1_I]]
+//
uint8x8_t test_vtbl1_u8(uint8x8_t a, uint8x8_t b) {
return vtbl1_u8(a, b);
}
-// CHECK-LABEL: @test_vtbl1_s8(
-// CHECK: [[VTBL1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VTBL1_I]]
+// CHECK-LABEL: define <8 x i8> @test_vtbl1_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VTBL1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VTBL1_I]]
+//
int8x8_t test_vtbl1_s8(int8x8_t a, int8x8_t b) {
return vtbl1_s8(a, b);
}
-// CHECK-LABEL: @test_vtbl1_p8(
-// CHECK: [[VTBL1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VTBL1_I]]
+// CHECK-LABEL: define <8 x i8> @test_vtbl1_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VTBL1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VTBL1_I]]
+//
poly8x8_t test_vtbl1_p8(poly8x8_t a, uint8x8_t b) {
return vtbl1_p8(a, b);
}
-// CHECK-LABEL: @test_vtbl2_u8(
-// CHECK: [[__P0_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK: [[A:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[A]], i32 0, i32 0
-// CHECK: store [2 x i64] [[A]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[A]], i32 0, i32 0
-// CHECK: [[TMP2:%.*]] = load [2 x i64], ptr [[COERCE_DIVE1]], align 8
-// CHECK: store [2 x i64] [[TMP2]], ptr [[__P0_I]], align 8
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[__P0_I]], align 8
-// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 1
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
-// CHECK: [[VTBL2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VTBL2_I]]
+// CHECK-LABEL: define <8 x i8> @test_vtbl2_u8(
+// CHECK-SAME: [2 x i64] [[A_COERCE:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__P0_I:%.*]] = alloca [[STRUCT_UINT8X8X2_T:%.*]], align 8
+// CHECK-NEXT: [[A:%.*]] = alloca [[STRUCT_UINT8X8X2_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X2_T]], ptr [[A]], i32 0, i32 0
+// CHECK-NEXT: store [2 x i64] [[A_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X2_T]], ptr [[A]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load [2 x i64], ptr [[COERCE_DIVE1]], align 8
+// CHECK-NEXT: store [2 x i64] [[TMP0]], ptr [[__P0_I]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[__P0_I]], align 8
+// CHECK-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
+// CHECK-NEXT: [[VTBL2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VTBL2_I]]
+//
uint8x8_t test_vtbl2_u8(uint8x8x2_t a, uint8x8_t b) {
return vtbl2_u8(a, b);
}
-// CHECK-LABEL: @test_vtbl2_s8(
-// CHECK: [[__P0_I:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK: [[A:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[A]], i32 0, i32 0
-// CHECK: store [2 x i64] [[A]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[A]], i32 0, i32 0
-// CHECK: [[TMP2:%.*]] = load [2 x i64], ptr [[COERCE_DIVE1]], align 8
-// CHECK: store [2 x i64] [[TMP2]], ptr [[__P0_I]], align 8
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[__P0_I]], align 8
-// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 1
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
-// CHECK: [[VTBL2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VTBL2_I]]
+// CHECK-LABEL: define <8 x i8> @test_vtbl2_s8(
+// CHECK-SAME: [2 x i64] [[A_COERCE:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__P0_I:%.*]] = alloca [[STRUCT_INT8X8X2_T:%.*]], align 8
+// CHECK-NEXT: [[A:%.*]] = alloca [[STRUCT_INT8X8X2_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X2_T]], ptr [[A]], i32 0, i32 0
+// CHECK-NEXT: store [2 x i64] [[A_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X2_T]], ptr [[A]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load [2 x i64], ptr [[COERCE_DIVE1]], align 8
+// CHECK-NEXT: store [2 x i64] [[TMP0]], ptr [[__P0_I]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[__P0_I]], align 8
+// CHECK-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
+// CHECK-NEXT: [[VTBL2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VTBL2_I]]
+//
int8x8_t test_vtbl2_s8(int8x8x2_t a, int8x8_t b) {
return vtbl2_s8(a, b);
}
-// CHECK-LABEL: @test_vtbl2_p8(
-// CHECK: [[__P0_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK: [[A:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[A]], i32 0, i32 0
-// CHECK: store [2 x i64] [[A]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[A]], i32 0, i32 0
-// CHECK: [[TMP2:%.*]] = load [2 x i64], ptr [[COERCE_DIVE1]], align 8
-// CHECK: store [2 x i64] [[TMP2]], ptr [[__P0_I]], align 8
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[__P0_I]], align 8
-// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 1
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
-// CHECK: [[VTBL2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VTBL2_I]]
+// CHECK-LABEL: define <8 x i8> @test_vtbl2_p8(
+// CHECK-SAME: [2 x i64] [[A_COERCE:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__P0_I:%.*]] = alloca [[STRUCT_POLY8X8X2_T:%.*]], align 8
+// CHECK-NEXT: [[A:%.*]] = alloca [[STRUCT_POLY8X8X2_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X2_T]], ptr [[A]], i32 0, i32 0
+// CHECK-NEXT: store [2 x i64] [[A_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X2_T]], ptr [[A]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load [2 x i64], ptr [[COERCE_DIVE1]], align 8
+// CHECK-NEXT: store [2 x i64] [[TMP0]], ptr [[__P0_I]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[__P0_I]], align 8
+// CHECK-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
+// CHECK-NEXT: [[VTBL2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VTBL2_I]]
+//
poly8x8_t test_vtbl2_p8(poly8x8x2_t a, uint8x8_t b) {
return vtbl2_p8(a, b);
}
-// CHECK-LABEL: @test_vtbl3_u8(
-// CHECK: [[__P0_I:%.*]] = alloca %struct.uint8x8x3_t, align 8
-// CHECK: [[A:%.*]] = alloca %struct.uint8x8x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[A]], i32 0, i32 0
-// CHECK: store [3 x i64] [[A]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[A]], i32 0, i32 0
-// CHECK: [[TMP2:%.*]] = load [3 x i64], ptr [[COERCE_DIVE1]], align 8
-// CHECK: store [3 x i64] [[TMP2]], ptr [[__P0_I]], align 8
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[__P0_I]], align 8
-// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 1
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
-// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 2
-// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4_I]], align 8
-// CHECK: [[VTBL3_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl3(<8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VTBL3_I]]
+// CHECK-LABEL: define <8 x i8> @test_vtbl3_u8(
+// CHECK-SAME: [3 x i64] [[A_COERCE:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__P0_I:%.*]] = alloca [[STRUCT_UINT8X8X3_T:%.*]], align 8
+// CHECK-NEXT: [[A:%.*]] = alloca [[STRUCT_UINT8X8X3_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X3_T]], ptr [[A]], i32 0, i32 0
+// CHECK-NEXT: store [3 x i64] [[A_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X3_T]], ptr [[A]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load [3 x i64], ptr [[COERCE_DIVE1]], align 8
+// CHECK-NEXT: store [3 x i64] [[TMP0]], ptr [[__P0_I]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[__P0_I]], align 8
+// CHECK-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
+// CHECK-NEXT: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 2
+// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4_I]], align 8
+// CHECK-NEXT: [[VTBL3_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl3(<8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VTBL3_I]]
+//
uint8x8_t test_vtbl3_u8(uint8x8x3_t a, uint8x8_t b) {
return vtbl3_u8(a, b);
}
-// CHECK-LABEL: @test_vtbl3_s8(
-// CHECK: [[__P0_I:%.*]] = alloca %struct.int8x8x3_t, align 8
-// CHECK: [[A:%.*]] = alloca %struct.int8x8x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[A]], i32 0, i32 0
-// CHECK: store [3 x i64] [[A]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[A]], i32 0, i32 0
-// CHECK: [[TMP2:%.*]] = load [3 x i64], ptr [[COERCE_DIVE1]], align 8
-// CHECK: store [3 x i64] [[TMP2]], ptr [[__P0_I]], align 8
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[__P0_I]], align 8
-// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 1
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
-// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 2
-// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4_I]], align 8
-// CHECK: [[VTBL3_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl3(<8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VTBL3_I]]
+// CHECK-LABEL: define <8 x i8> @test_vtbl3_s8(
+// CHECK-SAME: [3 x i64] [[A_COERCE:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__P0_I:%.*]] = alloca [[STRUCT_INT8X8X3_T:%.*]], align 8
+// CHECK-NEXT: [[A:%.*]] = alloca [[STRUCT_INT8X8X3_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X3_T]], ptr [[A]], i32 0, i32 0
+// CHECK-NEXT: store [3 x i64] [[A_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X3_T]], ptr [[A]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load [3 x i64], ptr [[COERCE_DIVE1]], align 8
+// CHECK-NEXT: store [3 x i64] [[TMP0]], ptr [[__P0_I]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[__P0_I]], align 8
+// CHECK-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
+// CHECK-NEXT: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 2
+// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4_I]], align 8
+// CHECK-NEXT: [[VTBL3_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl3(<8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VTBL3_I]]
+//
int8x8_t test_vtbl3_s8(int8x8x3_t a, int8x8_t b) {
return vtbl3_s8(a, b);
}
-// CHECK-LABEL: @test_vtbl3_p8(
-// CHECK: [[__P0_I:%.*]] = alloca %struct.poly8x8x3_t, align 8
-// CHECK: [[A:%.*]] = alloca %struct.poly8x8x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[A]], i32 0, i32 0
-// CHECK: store [3 x i64] [[A]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[A]], i32 0, i32 0
-// CHECK: [[TMP2:%.*]] = load [3 x i64], ptr [[COERCE_DIVE1]], align 8
-// CHECK: store [3 x i64] [[TMP2]], ptr [[__P0_I]], align 8
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[__P0_I]], align 8
-// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 1
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
-// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 2
-// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4_I]], align 8
-// CHECK: [[VTBL3_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl3(<8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VTBL3_I]]
+// CHECK-LABEL: define <8 x i8> @test_vtbl3_p8(
+// CHECK-SAME: [3 x i64] [[A_COERCE:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__P0_I:%.*]] = alloca [[STRUCT_POLY8X8X3_T:%.*]], align 8
+// CHECK-NEXT: [[A:%.*]] = alloca [[STRUCT_POLY8X8X3_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X3_T]], ptr [[A]], i32 0, i32 0
+// CHECK-NEXT: store [3 x i64] [[A_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X3_T]], ptr [[A]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load [3 x i64], ptr [[COERCE_DIVE1]], align 8
+// CHECK-NEXT: store [3 x i64] [[TMP0]], ptr [[__P0_I]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[__P0_I]], align 8
+// CHECK-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
+// CHECK-NEXT: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 2
+// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4_I]], align 8
+// CHECK-NEXT: [[VTBL3_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl3(<8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VTBL3_I]]
+//
poly8x8_t test_vtbl3_p8(poly8x8x3_t a, uint8x8_t b) {
return vtbl3_p8(a, b);
}
-// CHECK-LABEL: @test_vtbl4_u8(
-// CHECK: [[__P0_I:%.*]] = alloca %struct.uint8x8x4_t, align 8
-// CHECK: [[A:%.*]] = alloca %struct.uint8x8x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[A]], i32 0, i32 0
-// CHECK: store [4 x i64] [[A]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[A]], i32 0, i32 0
-// CHECK: [[TMP2:%.*]] = load [4 x i64], ptr [[COERCE_DIVE1]], align 8
-// CHECK: store [4 x i64] [[TMP2]], ptr [[__P0_I]], align 8
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[__P0_I]], align 8
-// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 1
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
-// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 2
-// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4_I]], align 8
-// CHECK: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 3
-// CHECK: [[TMP7:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6_I]], align 8
-// CHECK: [[VTBL4_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VTBL4_I]]
+// CHECK-LABEL: define <8 x i8> @test_vtbl4_u8(
+// CHECK-SAME: [4 x i64] [[A_COERCE:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__P0_I:%.*]] = alloca [[STRUCT_UINT8X8X4_T:%.*]], align 8
+// CHECK-NEXT: [[A:%.*]] = alloca [[STRUCT_UINT8X8X4_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X4_T]], ptr [[A]], i32 0, i32 0
+// CHECK-NEXT: store [4 x i64] [[A_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X4_T]], ptr [[A]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load [4 x i64], ptr [[COERCE_DIVE1]], align 8
+// CHECK-NEXT: store [4 x i64] [[TMP0]], ptr [[__P0_I]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[__P0_I]], align 8
+// CHECK-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
+// CHECK-NEXT: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 2
+// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4_I]], align 8
+// CHECK-NEXT: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 3
+// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6_I]], align 8
+// CHECK-NEXT: [[VTBL4_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VTBL4_I]]
+//
uint8x8_t test_vtbl4_u8(uint8x8x4_t a, uint8x8_t b) {
return vtbl4_u8(a, b);
}
-// CHECK-LABEL: @test_vtbl4_s8(
-// CHECK: [[__P0_I:%.*]] = alloca %struct.int8x8x4_t, align 8
-// CHECK: [[A:%.*]] = alloca %struct.int8x8x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[A]], i32 0, i32 0
-// CHECK: store [4 x i64] [[A]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[A]], i32 0, i32 0
-// CHECK: [[TMP2:%.*]] = load [4 x i64], ptr [[COERCE_DIVE1]], align 8
-// CHECK: store [4 x i64] [[TMP2]], ptr [[__P0_I]], align 8
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[__P0_I]], align 8
-// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 1
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
-// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 2
-// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4_I]], align 8
-// CHECK: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 3
-// CHECK: [[TMP7:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6_I]], align 8
-// CHECK: [[VTBL4_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VTBL4_I]]
+// CHECK-LABEL: define <8 x i8> @test_vtbl4_s8(
+// CHECK-SAME: [4 x i64] [[A_COERCE:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__P0_I:%.*]] = alloca [[STRUCT_INT8X8X4_T:%.*]], align 8
+// CHECK-NEXT: [[A:%.*]] = alloca [[STRUCT_INT8X8X4_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X4_T]], ptr [[A]], i32 0, i32 0
+// CHECK-NEXT: store [4 x i64] [[A_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X4_T]], ptr [[A]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load [4 x i64], ptr [[COERCE_DIVE1]], align 8
+// CHECK-NEXT: store [4 x i64] [[TMP0]], ptr [[__P0_I]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[__P0_I]], align 8
+// CHECK-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
+// CHECK-NEXT: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 2
+// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4_I]], align 8
+// CHECK-NEXT: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 3
+// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6_I]], align 8
+// CHECK-NEXT: [[VTBL4_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VTBL4_I]]
+//
int8x8_t test_vtbl4_s8(int8x8x4_t a, int8x8_t b) {
return vtbl4_s8(a, b);
}
-// CHECK-LABEL: @test_vtbl4_p8(
-// CHECK: [[__P0_I:%.*]] = alloca %struct.poly8x8x4_t, align 8
-// CHECK: [[A:%.*]] = alloca %struct.poly8x8x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[A]], i32 0, i32 0
-// CHECK: store [4 x i64] [[A]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[A]], i32 0, i32 0
-// CHECK: [[TMP2:%.*]] = load [4 x i64], ptr [[COERCE_DIVE1]], align 8
-// CHECK: store [4 x i64] [[TMP2]], ptr [[__P0_I]], align 8
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[__P0_I]], align 8
-// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 1
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
-// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 2
-// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4_I]], align 8
-// CHECK: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 3
-// CHECK: [[TMP7:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6_I]], align 8
-// CHECK: [[VTBL4_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VTBL4_I]]
+// CHECK-LABEL: define <8 x i8> @test_vtbl4_p8(
+// CHECK-SAME: [4 x i64] [[A_COERCE:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__P0_I:%.*]] = alloca [[STRUCT_POLY8X8X4_T:%.*]], align 8
+// CHECK-NEXT: [[A:%.*]] = alloca [[STRUCT_POLY8X8X4_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X4_T]], ptr [[A]], i32 0, i32 0
+// CHECK-NEXT: store [4 x i64] [[A_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X4_T]], ptr [[A]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load [4 x i64], ptr [[COERCE_DIVE1]], align 8
+// CHECK-NEXT: store [4 x i64] [[TMP0]], ptr [[__P0_I]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[__P0_I]], align 8
+// CHECK-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
+// CHECK-NEXT: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 2
+// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4_I]], align 8
+// CHECK-NEXT: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 3
+// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6_I]], align 8
+// CHECK-NEXT: [[VTBL4_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VTBL4_I]]
+//
poly8x8_t test_vtbl4_p8(poly8x8x4_t a, uint8x8_t b) {
return vtbl4_p8(a, b);
}
-// CHECK-LABEL: @test_vtbx1_u8(
-// CHECK: [[VTBX1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx1(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c)
-// CHECK: ret <8 x i8> [[VTBX1_I]]
+// CHECK-LABEL: define <8 x i8> @test_vtbx1_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VTBX1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx1(<8 x i8> [[A]], <8 x i8> [[B]], <8 x i8> [[C]])
+// CHECK-NEXT: ret <8 x i8> [[VTBX1_I]]
+//
uint8x8_t test_vtbx1_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) {
return vtbx1_u8(a, b, c);
}
-// CHECK-LABEL: @test_vtbx1_s8(
-// CHECK: [[VTBX1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx1(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c)
-// CHECK: ret <8 x i8> [[VTBX1_I]]
+// CHECK-LABEL: define <8 x i8> @test_vtbx1_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VTBX1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx1(<8 x i8> [[A]], <8 x i8> [[B]], <8 x i8> [[C]])
+// CHECK-NEXT: ret <8 x i8> [[VTBX1_I]]
+//
int8x8_t test_vtbx1_s8(int8x8_t a, int8x8_t b, int8x8_t c) {
return vtbx1_s8(a, b, c);
}
-// CHECK-LABEL: @test_vtbx1_p8(
-// CHECK: [[VTBX1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx1(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c)
-// CHECK: ret <8 x i8> [[VTBX1_I]]
+// CHECK-LABEL: define <8 x i8> @test_vtbx1_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VTBX1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx1(<8 x i8> [[A]], <8 x i8> [[B]], <8 x i8> [[C]])
+// CHECK-NEXT: ret <8 x i8> [[VTBX1_I]]
+//
poly8x8_t test_vtbx1_p8(poly8x8_t a, poly8x8_t b, uint8x8_t c) {
return vtbx1_p8(a, b, c);
}
-// CHECK-LABEL: @test_vtbx2_u8(
-// CHECK: [[__P1_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: [[TMP2:%.*]] = load [2 x i64], ptr [[COERCE_DIVE1]], align 8
-// CHECK: store [2 x i64] [[TMP2]], ptr [[__P1_I]], align 8
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[__P1_I]], align 8
-// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 1
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
-// CHECK: [[VTBX2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx2(<8 x i8> %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> %c)
-// CHECK: ret <8 x i8> [[VTBX2_I]]
+// CHECK-LABEL: define <8 x i8> @test_vtbx2_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__P1_I:%.*]] = alloca [[STRUCT_UINT8X8X2_T:%.*]], align 8
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT8X8X2_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X2_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [2 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X2_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load [2 x i64], ptr [[COERCE_DIVE1]], align 8
+// CHECK-NEXT: store [2 x i64] [[TMP0]], ptr [[__P1_I]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[__P1_I]], align 8
+// CHECK-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
+// CHECK-NEXT: [[VTBX2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx2(<8 x i8> [[A]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[C]])
+// CHECK-NEXT: ret <8 x i8> [[VTBX2_I]]
+//
uint8x8_t test_vtbx2_u8(uint8x8_t a, uint8x8x2_t b, uint8x8_t c) {
return vtbx2_u8(a, b, c);
}
-// CHECK-LABEL: @test_vtbx2_s8(
-// CHECK: [[__P1_I:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: [[TMP2:%.*]] = load [2 x i64], ptr [[COERCE_DIVE1]], align 8
-// CHECK: store [2 x i64] [[TMP2]], ptr [[__P1_I]], align 8
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[__P1_I]], align 8
-// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 1
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
-// CHECK: [[VTBX2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx2(<8 x i8> %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> %c)
-// CHECK: ret <8 x i8> [[VTBX2_I]]
+// CHECK-LABEL: define <8 x i8> @test_vtbx2_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__P1_I:%.*]] = alloca [[STRUCT_INT8X8X2_T:%.*]], align 8
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT8X8X2_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X2_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [2 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X2_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load [2 x i64], ptr [[COERCE_DIVE1]], align 8
+// CHECK-NEXT: store [2 x i64] [[TMP0]], ptr [[__P1_I]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[__P1_I]], align 8
+// CHECK-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
+// CHECK-NEXT: [[VTBX2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx2(<8 x i8> [[A]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[C]])
+// CHECK-NEXT: ret <8 x i8> [[VTBX2_I]]
+//
int8x8_t test_vtbx2_s8(int8x8_t a, int8x8x2_t b, int8x8_t c) {
return vtbx2_s8(a, b, c);
}
-// CHECK-LABEL: @test_vtbx2_p8(
-// CHECK: [[__P1_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: [[TMP2:%.*]] = load [2 x i64], ptr [[COERCE_DIVE1]], align 8
-// CHECK: store [2 x i64] [[TMP2]], ptr [[__P1_I]], align 8
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[__P1_I]], align 8
-// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 1
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
-// CHECK: [[VTBX2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx2(<8 x i8> %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> %c)
-// CHECK: ret <8 x i8> [[VTBX2_I]]
+// CHECK-LABEL: define <8 x i8> @test_vtbx2_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__P1_I:%.*]] = alloca [[STRUCT_POLY8X8X2_T:%.*]], align 8
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY8X8X2_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X2_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [2 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X2_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load [2 x i64], ptr [[COERCE_DIVE1]], align 8
+// CHECK-NEXT: store [2 x i64] [[TMP0]], ptr [[__P1_I]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[__P1_I]], align 8
+// CHECK-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
+// CHECK-NEXT: [[VTBX2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx2(<8 x i8> [[A]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[C]])
+// CHECK-NEXT: ret <8 x i8> [[VTBX2_I]]
+//
poly8x8_t test_vtbx2_p8(poly8x8_t a, poly8x8x2_t b, uint8x8_t c) {
return vtbx2_p8(a, b, c);
}
-// CHECK-LABEL: @test_vtbx3_u8(
-// CHECK: [[__P1_I:%.*]] = alloca %struct.uint8x8x3_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: [[TMP2:%.*]] = load [3 x i64], ptr [[COERCE_DIVE1]], align 8
-// CHECK: store [3 x i64] [[TMP2]], ptr [[__P1_I]], align 8
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[__P1_I]], align 8
-// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 1
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
-// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 2
-// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4_I]], align 8
-// CHECK: [[VTBX3_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx3(<8 x i8> %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> %c)
-// CHECK: ret <8 x i8> [[VTBX3_I]]
+// CHECK-LABEL: define <8 x i8> @test_vtbx3_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__P1_I:%.*]] = alloca [[STRUCT_UINT8X8X3_T:%.*]], align 8
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT8X8X3_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X3_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [3 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X3_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load [3 x i64], ptr [[COERCE_DIVE1]], align 8
+// CHECK-NEXT: store [3 x i64] [[TMP0]], ptr [[__P1_I]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[__P1_I]], align 8
+// CHECK-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
+// CHECK-NEXT: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 2
+// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4_I]], align 8
+// CHECK-NEXT: [[VTBX3_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx3(<8 x i8> [[A]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[C]])
+// CHECK-NEXT: ret <8 x i8> [[VTBX3_I]]
+//
uint8x8_t test_vtbx3_u8(uint8x8_t a, uint8x8x3_t b, uint8x8_t c) {
return vtbx3_u8(a, b, c);
}
-// CHECK-LABEL: @test_vtbx3_s8(
-// CHECK: [[__P1_I:%.*]] = alloca %struct.int8x8x3_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.int8x8x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: [[TMP2:%.*]] = load [3 x i64], ptr [[COERCE_DIVE1]], align 8
-// CHECK: store [3 x i64] [[TMP2]], ptr [[__P1_I]], align 8
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[__P1_I]], align 8
-// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 1
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
-// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 2
-// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4_I]], align 8
-// CHECK: [[VTBX3_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx3(<8 x i8> %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> %c)
-// CHECK: ret <8 x i8> [[VTBX3_I]]
+// CHECK-LABEL: define <8 x i8> @test_vtbx3_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__P1_I:%.*]] = alloca [[STRUCT_INT8X8X3_T:%.*]], align 8
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT8X8X3_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X3_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [3 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X3_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load [3 x i64], ptr [[COERCE_DIVE1]], align 8
+// CHECK-NEXT: store [3 x i64] [[TMP0]], ptr [[__P1_I]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[__P1_I]], align 8
+// CHECK-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
+// CHECK-NEXT: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 2
+// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4_I]], align 8
+// CHECK-NEXT: [[VTBX3_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx3(<8 x i8> [[A]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[C]])
+// CHECK-NEXT: ret <8 x i8> [[VTBX3_I]]
+//
int8x8_t test_vtbx3_s8(int8x8_t a, int8x8x3_t b, int8x8_t c) {
return vtbx3_s8(a, b, c);
}
-// CHECK-LABEL: @test_vtbx3_p8(
-// CHECK: [[__P1_I:%.*]] = alloca %struct.poly8x8x3_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: [[TMP2:%.*]] = load [3 x i64], ptr [[COERCE_DIVE1]], align 8
-// CHECK: store [3 x i64] [[TMP2]], ptr [[__P1_I]], align 8
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[__P1_I]], align 8
-// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 1
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
-// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 2
-// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4_I]], align 8
-// CHECK: [[VTBX3_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx3(<8 x i8> %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> %c)
-// CHECK: ret <8 x i8> [[VTBX3_I]]
+// CHECK-LABEL: define <8 x i8> @test_vtbx3_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__P1_I:%.*]] = alloca [[STRUCT_POLY8X8X3_T:%.*]], align 8
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY8X8X3_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X3_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [3 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X3_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load [3 x i64], ptr [[COERCE_DIVE1]], align 8
+// CHECK-NEXT: store [3 x i64] [[TMP0]], ptr [[__P1_I]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[__P1_I]], align 8
+// CHECK-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
+// CHECK-NEXT: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 2
+// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4_I]], align 8
+// CHECK-NEXT: [[VTBX3_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx3(<8 x i8> [[A]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[C]])
+// CHECK-NEXT: ret <8 x i8> [[VTBX3_I]]
+//
poly8x8_t test_vtbx3_p8(poly8x8_t a, poly8x8x3_t b, uint8x8_t c) {
return vtbx3_p8(a, b, c);
}
-// CHECK-LABEL: @test_vtbx4_u8(
-// CHECK: [[__P1_I:%.*]] = alloca %struct.uint8x8x4_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: [[TMP2:%.*]] = load [4 x i64], ptr [[COERCE_DIVE1]], align 8
-// CHECK: store [4 x i64] [[TMP2]], ptr [[__P1_I]], align 8
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[__P1_I]], align 8
-// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 1
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
-// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 2
-// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4_I]], align 8
-// CHECK: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 3
-// CHECK: [[TMP7:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6_I]], align 8
-// CHECK: [[VTBX4_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx4(<8 x i8> %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], <8 x i8> %c)
-// CHECK: ret <8 x i8> [[VTBX4_I]]
+// CHECK-LABEL: define <8 x i8> @test_vtbx4_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__P1_I:%.*]] = alloca [[STRUCT_UINT8X8X4_T:%.*]], align 8
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT8X8X4_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X4_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X4_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load [4 x i64], ptr [[COERCE_DIVE1]], align 8
+// CHECK-NEXT: store [4 x i64] [[TMP0]], ptr [[__P1_I]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[__P1_I]], align 8
+// CHECK-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
+// CHECK-NEXT: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 2
+// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4_I]], align 8
+// CHECK-NEXT: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 3
+// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6_I]], align 8
+// CHECK-NEXT: [[VTBX4_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx4(<8 x i8> [[A]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[C]])
+// CHECK-NEXT: ret <8 x i8> [[VTBX4_I]]
+//
uint8x8_t test_vtbx4_u8(uint8x8_t a, uint8x8x4_t b, uint8x8_t c) {
return vtbx4_u8(a, b, c);
}
-// CHECK-LABEL: @test_vtbx4_s8(
-// CHECK: [[__P1_I:%.*]] = alloca %struct.int8x8x4_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.int8x8x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: [[TMP2:%.*]] = load [4 x i64], ptr [[COERCE_DIVE1]], align 8
-// CHECK: store [4 x i64] [[TMP2]], ptr [[__P1_I]], align 8
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[__P1_I]], align 8
-// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 1
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
-// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 2
-// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4_I]], align 8
-// CHECK: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 3
-// CHECK: [[TMP7:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6_I]], align 8
-// CHECK: [[VTBX4_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx4(<8 x i8> %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], <8 x i8> %c)
-// CHECK: ret <8 x i8> [[VTBX4_I]]
+// CHECK-LABEL: define <8 x i8> @test_vtbx4_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__P1_I:%.*]] = alloca [[STRUCT_INT8X8X4_T:%.*]], align 8
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT8X8X4_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X4_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X4_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load [4 x i64], ptr [[COERCE_DIVE1]], align 8
+// CHECK-NEXT: store [4 x i64] [[TMP0]], ptr [[__P1_I]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[__P1_I]], align 8
+// CHECK-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
+// CHECK-NEXT: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 2
+// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4_I]], align 8
+// CHECK-NEXT: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 3
+// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6_I]], align 8
+// CHECK-NEXT: [[VTBX4_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx4(<8 x i8> [[A]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[C]])
+// CHECK-NEXT: ret <8 x i8> [[VTBX4_I]]
+//
int8x8_t test_vtbx4_s8(int8x8_t a, int8x8x4_t b, int8x8_t c) {
return vtbx4_s8(a, b, c);
}
-// CHECK-LABEL: @test_vtbx4_p8(
-// CHECK: [[__P1_I:%.*]] = alloca %struct.poly8x8x4_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: [[TMP2:%.*]] = load [4 x i64], ptr [[COERCE_DIVE1]], align 8
-// CHECK: store [4 x i64] [[TMP2]], ptr [[__P1_I]], align 8
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[__P1_I]], align 8
-// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 1
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
-// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 2
-// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4_I]], align 8
-// CHECK: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 3
-// CHECK: [[TMP7:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6_I]], align 8
-// CHECK: [[VTBX4_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx4(<8 x i8> %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], <8 x i8> %c)
-// CHECK: ret <8 x i8> [[VTBX4_I]]
+// CHECK-LABEL: define <8 x i8> @test_vtbx4_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__P1_I:%.*]] = alloca [[STRUCT_POLY8X8X4_T:%.*]], align 8
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY8X8X4_T]], align 8
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X4_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X4_T]], ptr [[B]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load [4 x i64], ptr [[COERCE_DIVE1]], align 8
+// CHECK-NEXT: store [4 x i64] [[TMP0]], ptr [[__P1_I]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[__P1_I]], align 8
+// CHECK-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 1
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
+// CHECK-NEXT: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 2
+// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4_I]], align 8
+// CHECK-NEXT: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 3
+// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6_I]], align 8
+// CHECK-NEXT: [[VTBX4_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx4(<8 x i8> [[A]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[C]])
+// CHECK-NEXT: ret <8 x i8> [[VTBX4_I]]
+//
poly8x8_t test_vtbx4_p8(poly8x8_t a, poly8x8x4_t b, uint8x8_t c) {
return vtbx4_p8(a, b, c);
}
-// CHECK: @test_vtrn_s8({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-// CHECK: store <8 x i8> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-// CHECK: store <8 x i8> [[VTRN1_I]], ptr [[TMP2]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vtrn_s8(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT8X8X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META3:![0-9]+]])
+// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK-NEXT: store <8 x i8> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META3]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <8 x i8>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+// CHECK-NEXT: store <8 x i8> [[VTRN1_I]], ptr [[TMP0]], align 4, !alias.scope [[META3]]
+// CHECK-NEXT: ret void
+//
int8x8x2_t test_vtrn_s8(int8x8_t a, int8x8_t b) {
return vtrn_s8(a, b);
}
-// CHECK: @test_vtrn_s16({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-// CHECK: store <4 x i16> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-// CHECK: store <4 x i16> [[VTRN1_I]], ptr [[TMP4]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vtrn_s16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT16X4X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META6:![0-9]+]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK-NEXT: store <4 x i16> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META6]]
+// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <4 x i16>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK-NEXT: store <4 x i16> [[VTRN1_I]], ptr [[TMP2]], align 4, !alias.scope [[META6]]
+// CHECK-NEXT: ret void
+//
int16x4x2_t test_vtrn_s16(int16x4_t a, int16x4_t b) {
return vtrn_s16(a, b);
}
-// CHECK: @test_vtrn_s32({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VTRN_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
-// CHECK: store <2 x i32> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VTRN1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
-// CHECK: store <2 x i32> [[VTRN1_I]], ptr [[TMP4]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vtrn_s32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT32X2X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META9:![0-9]+]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> <i32 0, i32 2>
+// CHECK-NEXT: store <2 x i32> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META9]]
+// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <2 x i32>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> <i32 1, i32 3>
+// CHECK-NEXT: store <2 x i32> [[VTRN1_I]], ptr [[TMP2]], align 4, !alias.scope [[META9]]
+// CHECK-NEXT: ret void
+//
int32x2x2_t test_vtrn_s32(int32x2_t a, int32x2_t b) {
return vtrn_s32(a, b);
}
-// CHECK: @test_vtrn_u8({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-// CHECK: store <8 x i8> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-// CHECK: store <8 x i8> [[VTRN1_I]], ptr [[TMP2]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vtrn_u8(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT8X8X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META12:![0-9]+]])
+// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK-NEXT: store <8 x i8> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META12]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <8 x i8>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+// CHECK-NEXT: store <8 x i8> [[VTRN1_I]], ptr [[TMP0]], align 4, !alias.scope [[META12]]
+// CHECK-NEXT: ret void
+//
uint8x8x2_t test_vtrn_u8(uint8x8_t a, uint8x8_t b) {
return vtrn_u8(a, b);
}
-// CHECK: @test_vtrn_u16({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-// CHECK: store <4 x i16> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-// CHECK: store <4 x i16> [[VTRN1_I]], ptr [[TMP4]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vtrn_u16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT16X4X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META15:![0-9]+]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK-NEXT: store <4 x i16> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META15]]
+// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <4 x i16>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK-NEXT: store <4 x i16> [[VTRN1_I]], ptr [[TMP2]], align 4, !alias.scope [[META15]]
+// CHECK-NEXT: ret void
+//
uint16x4x2_t test_vtrn_u16(uint16x4_t a, uint16x4_t b) {
return vtrn_u16(a, b);
}
-// CHECK: @test_vtrn_u32({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VTRN_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
-// CHECK: store <2 x i32> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VTRN1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
-// CHECK: store <2 x i32> [[VTRN1_I]], ptr [[TMP4]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vtrn_u32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT32X2X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META18:![0-9]+]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> <i32 0, i32 2>
+// CHECK-NEXT: store <2 x i32> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META18]]
+// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <2 x i32>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> <i32 1, i32 3>
+// CHECK-NEXT: store <2 x i32> [[VTRN1_I]], ptr [[TMP2]], align 4, !alias.scope [[META18]]
+// CHECK-NEXT: ret void
+//
uint32x2x2_t test_vtrn_u32(uint32x2_t a, uint32x2_t b) {
return vtrn_u32(a, b);
}
-// CHECK: @test_vtrn_f32({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %b to <8 x i8>
-// CHECK: [[VTRN_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 2>
-// CHECK: store <2 x float> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x float>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VTRN1_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3>
-// CHECK: store <2 x float> [[VTRN1_I]], ptr [[TMP4]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vtrn_f32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT32X2X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META21:![0-9]+]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[B]], <2 x i32> <i32 0, i32 2>
+// CHECK-NEXT: store <2 x float> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META21]]
+// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <2 x float>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[B]], <2 x i32> <i32 1, i32 3>
+// CHECK-NEXT: store <2 x float> [[VTRN1_I]], ptr [[TMP2]], align 4, !alias.scope [[META21]]
+// CHECK-NEXT: ret void
+//
float32x2x2_t test_vtrn_f32(float32x2_t a, float32x2_t b) {
return vtrn_f32(a, b);
}
-// CHECK: @test_vtrn_p8({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-// CHECK: store <8 x i8> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-// CHECK: store <8 x i8> [[VTRN1_I]], ptr [[TMP2]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vtrn_p8(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY8X8X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META24:![0-9]+]])
+// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK-NEXT: store <8 x i8> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META24]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <8 x i8>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+// CHECK-NEXT: store <8 x i8> [[VTRN1_I]], ptr [[TMP0]], align 4, !alias.scope [[META24]]
+// CHECK-NEXT: ret void
+//
poly8x8x2_t test_vtrn_p8(poly8x8_t a, poly8x8_t b) {
return vtrn_p8(a, b);
}
-// CHECK: @test_vtrn_p16({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-// CHECK: store <4 x i16> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-// CHECK: store <4 x i16> [[VTRN1_I]], ptr [[TMP4]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vtrn_p16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY16X4X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META27:![0-9]+]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK-NEXT: store <4 x i16> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META27]]
+// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <4 x i16>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK-NEXT: store <4 x i16> [[VTRN1_I]], ptr [[TMP2]], align 4, !alias.scope [[META27]]
+// CHECK-NEXT: ret void
+//
poly16x4x2_t test_vtrn_p16(poly16x4_t a, poly16x4_t b) {
return vtrn_p16(a, b);
}
-// CHECK: @test_vtrnq_s8({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[VTRN_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
-// CHECK: store <16 x i8> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VTRN1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
-// CHECK: store <16 x i8> [[VTRN1_I]], ptr [[TMP2]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vtrnq_s8(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT8X16X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META30:![0-9]+]])
+// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+// CHECK-NEXT: store <16 x i8> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META30]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <16 x i8>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
+// CHECK-NEXT: store <16 x i8> [[VTRN1_I]], ptr [[TMP0]], align 4, !alias.scope [[META30]]
+// CHECK-NEXT: ret void
+//
int8x16x2_t test_vtrnq_s8(int8x16_t a, int8x16_t b) {
return vtrnq_s8(a, b);
}
-// CHECK: @test_vtrnq_s16({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-// CHECK: store <8 x i16> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-// CHECK: store <8 x i16> [[VTRN1_I]], ptr [[TMP4]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vtrnq_s16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT16X8X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META33:![0-9]+]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK-NEXT: store <8 x i16> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META33]]
+// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <8 x i16>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+// CHECK-NEXT: store <8 x i16> [[VTRN1_I]], ptr [[TMP2]], align 4, !alias.scope [[META33]]
+// CHECK-NEXT: ret void
+//
int16x8x2_t test_vtrnq_s16(int16x8_t a, int16x8_t b) {
return vtrnq_s16(a, b);
}
-// CHECK: @test_vtrnq_s32({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-// CHECK: store <4 x i32> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-// CHECK: store <4 x i32> [[VTRN1_I]], ptr [[TMP4]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vtrnq_s32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT32X4X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META36:![0-9]+]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK-NEXT: store <4 x i32> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META36]]
+// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <4 x i32>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK-NEXT: store <4 x i32> [[VTRN1_I]], ptr [[TMP2]], align 4, !alias.scope [[META36]]
+// CHECK-NEXT: ret void
+//
int32x4x2_t test_vtrnq_s32(int32x4_t a, int32x4_t b) {
return vtrnq_s32(a, b);
}
-// CHECK: @test_vtrnq_u8({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[VTRN_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
-// CHECK: store <16 x i8> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VTRN1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
-// CHECK: store <16 x i8> [[VTRN1_I]], ptr [[TMP2]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vtrnq_u8(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT8X16X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META39:![0-9]+]])
+// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+// CHECK-NEXT: store <16 x i8> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META39]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <16 x i8>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
+// CHECK-NEXT: store <16 x i8> [[VTRN1_I]], ptr [[TMP0]], align 4, !alias.scope [[META39]]
+// CHECK-NEXT: ret void
+//
uint8x16x2_t test_vtrnq_u8(uint8x16_t a, uint8x16_t b) {
return vtrnq_u8(a, b);
}
-// CHECK: @test_vtrnq_u16({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-// CHECK: store <8 x i16> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-// CHECK: store <8 x i16> [[VTRN1_I]], ptr [[TMP4]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vtrnq_u16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT16X8X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META42:![0-9]+]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK-NEXT: store <8 x i16> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META42]]
+// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <8 x i16>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+// CHECK-NEXT: store <8 x i16> [[VTRN1_I]], ptr [[TMP2]], align 4, !alias.scope [[META42]]
+// CHECK-NEXT: ret void
+//
uint16x8x2_t test_vtrnq_u16(uint16x8_t a, uint16x8_t b) {
return vtrnq_u16(a, b);
}
-// CHECK: @test_vtrnq_u32({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-// CHECK: store <4 x i32> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-// CHECK: store <4 x i32> [[VTRN1_I]], ptr [[TMP4]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vtrnq_u32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT32X4X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META45:![0-9]+]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK-NEXT: store <4 x i32> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META45]]
+// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <4 x i32>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK-NEXT: store <4 x i32> [[VTRN1_I]], ptr [[TMP2]], align 4, !alias.scope [[META45]]
+// CHECK-NEXT: ret void
+//
uint32x4x2_t test_vtrnq_u32(uint32x4_t a, uint32x4_t b) {
return vtrnq_u32(a, b);
}
-// CHECK: @test_vtrnq_f32({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %b to <16 x i8>
-// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-// CHECK: store <4 x float> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x float>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-// CHECK: store <4 x float> [[VTRN1_I]], ptr [[TMP4]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vtrnq_f32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT32X4X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META48:![0-9]+]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK-NEXT: store <4 x float> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META48]]
+// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <4 x float>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK-NEXT: store <4 x float> [[VTRN1_I]], ptr [[TMP2]], align 4, !alias.scope [[META48]]
+// CHECK-NEXT: ret void
+//
float32x4x2_t test_vtrnq_f32(float32x4_t a, float32x4_t b) {
return vtrnq_f32(a, b);
}
-// CHECK: @test_vtrnq_p8({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[VTRN_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
-// CHECK: store <16 x i8> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VTRN1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
-// CHECK: store <16 x i8> [[VTRN1_I]], ptr [[TMP2]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vtrnq_p8(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY8X16X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META51:![0-9]+]])
+// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+// CHECK-NEXT: store <16 x i8> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META51]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <16 x i8>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
+// CHECK-NEXT: store <16 x i8> [[VTRN1_I]], ptr [[TMP0]], align 4, !alias.scope [[META51]]
+// CHECK-NEXT: ret void
+//
poly8x16x2_t test_vtrnq_p8(poly8x16_t a, poly8x16_t b) {
return vtrnq_p8(a, b);
}
-// CHECK: @test_vtrnq_p16({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-// CHECK: store <8 x i16> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-// CHECK: store <8 x i16> [[VTRN1_I]], ptr [[TMP4]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vtrnq_p16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY16X8X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META54:![0-9]+]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK-NEXT: store <8 x i16> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META54]]
+// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <8 x i16>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+// CHECK-NEXT: store <8 x i16> [[VTRN1_I]], ptr [[TMP2]], align 4, !alias.scope [[META54]]
+// CHECK-NEXT: ret void
+//
poly16x8x2_t test_vtrnq_p16(poly16x8_t a, poly16x8_t b) {
return vtrnq_p16(a, b);
}
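All of the struct-returning tests in this file share the lowering visible above: the x2_t aggregate is returned indirectly through the sret pointer, both stores carry the same !alias.scope metadata, and the second vector is stored one whole vector past the first. A portable sketch of that layout (float32x4x2_model is an illustrative stand-in, not the arm_neon.h typedef):

#include <stdalign.h>
#include <stddef.h>
#include <stdio.h>

/* Stand-in for float32x4x2_t: two 16-byte, 16-byte-aligned vectors
 * back to back.  The "getelementptr ... i32 1" in the checks is the
 * 16-byte step from val[0] to val[1]. */
typedef struct {
    alignas(16) float val[2][4];
} float32x4x2_model;

int main(void) {
    /* Prints 16: the offset the GEP at index 1 advances by. */
    printf("%zu\n", offsetof(float32x4x2_model, val[1]));
    return 0;
}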
-// CHECK-LABEL: @test_vtst_s8(
-// CHECK: [[TMP0:%.*]] = and <8 x i8> %a, %b
-// CHECK: [[TMP1:%.*]] = icmp ne <8 x i8> [[TMP0]], zeroinitializer
-// CHECK: [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i8>
-// CHECK: ret <8 x i8> [[VTST_I]]
+// CHECK-LABEL: define <8 x i8> @test_vtst_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = and <8 x i8> [[A]], [[B]]
+// CHECK-NEXT: [[TMP1:%.*]] = icmp ne <8 x i8> [[TMP0]], zeroinitializer
+// CHECK-NEXT: [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[VTST_I]]
+//
uint8x8_t test_vtst_s8(int8x8_t a, int8x8_t b) {
return vtst_s8(a, b);
}
-// CHECK-LABEL: @test_vtst_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = and <4 x i16> %a, %b
-// CHECK: [[TMP3:%.*]] = icmp ne <4 x i16> [[TMP2]], zeroinitializer
-// CHECK: [[VTST_I:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i16>
-// CHECK: ret <4 x i16> [[VTST_I]]
+// CHECK-LABEL: define <4 x i16> @test_vtst_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = and <4 x i16> [[A]], [[B]]
+// CHECK-NEXT: [[TMP3:%.*]] = icmp ne <4 x i16> [[TMP2]], zeroinitializer
+// CHECK-NEXT: [[VTST_I:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[VTST_I]]
+//
uint16x4_t test_vtst_s16(int16x4_t a, int16x4_t b) {
return vtst_s16(a, b);
}
-// CHECK-LABEL: @test_vtst_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = and <2 x i32> %a, %b
-// CHECK: [[TMP3:%.*]] = icmp ne <2 x i32> [[TMP2]], zeroinitializer
-// CHECK: [[VTST_I:%.*]] = sext <2 x i1> [[TMP3]] to <2 x i32>
-// CHECK: ret <2 x i32> [[VTST_I]]
+// CHECK-LABEL: define <2 x i32> @test_vtst_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = and <2 x i32> [[A]], [[B]]
+// CHECK-NEXT: [[TMP3:%.*]] = icmp ne <2 x i32> [[TMP2]], zeroinitializer
+// CHECK-NEXT: [[VTST_I:%.*]] = sext <2 x i1> [[TMP3]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[VTST_I]]
+//
uint32x2_t test_vtst_s32(int32x2_t a, int32x2_t b) {
return vtst_s32(a, b);
}
-// CHECK-LABEL: @test_vtst_u8(
-// CHECK: [[TMP0:%.*]] = and <8 x i8> %a, %b
-// CHECK: [[TMP1:%.*]] = icmp ne <8 x i8> [[TMP0]], zeroinitializer
-// CHECK: [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i8>
-// CHECK: ret <8 x i8> [[VTST_I]]
+// CHECK-LABEL: define <8 x i8> @test_vtst_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = and <8 x i8> [[A]], [[B]]
+// CHECK-NEXT: [[TMP1:%.*]] = icmp ne <8 x i8> [[TMP0]], zeroinitializer
+// CHECK-NEXT: [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[VTST_I]]
+//
uint8x8_t test_vtst_u8(uint8x8_t a, uint8x8_t b) {
return vtst_u8(a, b);
}
-// CHECK-LABEL: @test_vtst_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = and <4 x i16> %a, %b
-// CHECK: [[TMP3:%.*]] = icmp ne <4 x i16> [[TMP2]], zeroinitializer
-// CHECK: [[VTST_I:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i16>
-// CHECK: ret <4 x i16> [[VTST_I]]
+// CHECK-LABEL: define <4 x i16> @test_vtst_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = and <4 x i16> [[A]], [[B]]
+// CHECK-NEXT: [[TMP3:%.*]] = icmp ne <4 x i16> [[TMP2]], zeroinitializer
+// CHECK-NEXT: [[VTST_I:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[VTST_I]]
+//
uint16x4_t test_vtst_u16(uint16x4_t a, uint16x4_t b) {
return vtst_u16(a, b);
}
-// CHECK-LABEL: @test_vtst_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = and <2 x i32> %a, %b
-// CHECK: [[TMP3:%.*]] = icmp ne <2 x i32> [[TMP2]], zeroinitializer
-// CHECK: [[VTST_I:%.*]] = sext <2 x i1> [[TMP3]] to <2 x i32>
-// CHECK: ret <2 x i32> [[VTST_I]]
+// CHECK-LABEL: define <2 x i32> @test_vtst_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = and <2 x i32> [[A]], [[B]]
+// CHECK-NEXT: [[TMP3:%.*]] = icmp ne <2 x i32> [[TMP2]], zeroinitializer
+// CHECK-NEXT: [[VTST_I:%.*]] = sext <2 x i1> [[TMP3]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[VTST_I]]
+//
uint32x2_t test_vtst_u32(uint32x2_t a, uint32x2_t b) {
return vtst_u32(a, b);
}
-// CHECK-LABEL: @test_vtst_p8(
-// CHECK: [[TMP0:%.*]] = and <8 x i8> %a, %b
-// CHECK: [[TMP1:%.*]] = icmp ne <8 x i8> [[TMP0]], zeroinitializer
-// CHECK: [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i8>
-// CHECK: ret <8 x i8> [[VTST_I]]
+// CHECK-LABEL: define <8 x i8> @test_vtst_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = and <8 x i8> [[A]], [[B]]
+// CHECK-NEXT: [[TMP1:%.*]] = icmp ne <8 x i8> [[TMP0]], zeroinitializer
+// CHECK-NEXT: [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[VTST_I]]
+//
uint8x8_t test_vtst_p8(poly8x8_t a, poly8x8_t b) {
return vtst_p8(a, b);
}
-// CHECK-LABEL: @test_vtst_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = and <4 x i16> %a, %b
-// CHECK: [[TMP3:%.*]] = icmp ne <4 x i16> [[TMP2]], zeroinitializer
-// CHECK: [[VTST_I:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i16>
-// CHECK: ret <4 x i16> [[VTST_I]]
+// CHECK-LABEL: define <4 x i16> @test_vtst_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = and <4 x i16> [[A]], [[B]]
+// CHECK-NEXT: [[TMP3:%.*]] = icmp ne <4 x i16> [[TMP2]], zeroinitializer
+// CHECK-NEXT: [[VTST_I:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[VTST_I]]
+//
uint16x4_t test_vtst_p16(poly16x4_t a, poly16x4_t b) {
return vtst_p16(a, b);
}
-// CHECK-LABEL: @test_vtstq_s8(
-// CHECK: [[TMP0:%.*]] = and <16 x i8> %a, %b
-// CHECK: [[TMP1:%.*]] = icmp ne <16 x i8> [[TMP0]], zeroinitializer
-// CHECK: [[VTST_I:%.*]] = sext <16 x i1> [[TMP1]] to <16 x i8>
-// CHECK: ret <16 x i8> [[VTST_I]]
+// CHECK-LABEL: define <16 x i8> @test_vtstq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = and <16 x i8> [[A]], [[B]]
+// CHECK-NEXT: [[TMP1:%.*]] = icmp ne <16 x i8> [[TMP0]], zeroinitializer
+// CHECK-NEXT: [[VTST_I:%.*]] = sext <16 x i1> [[TMP1]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[VTST_I]]
+//
uint8x16_t test_vtstq_s8(int8x16_t a, int8x16_t b) {
return vtstq_s8(a, b);
}
-// CHECK-LABEL: @test_vtstq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = and <8 x i16> %a, %b
-// CHECK: [[TMP3:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer
-// CHECK: [[VTST_I:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i16>
-// CHECK: ret <8 x i16> [[VTST_I]]
+// CHECK-LABEL: define <8 x i16> @test_vtstq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = and <8 x i16> [[A]], [[B]]
+// CHECK-NEXT: [[TMP3:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer
+// CHECK-NEXT: [[VTST_I:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[VTST_I]]
+//
uint16x8_t test_vtstq_s16(int16x8_t a, int16x8_t b) {
return vtstq_s16(a, b);
}
-// CHECK-LABEL: @test_vtstq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = and <4 x i32> %a, %b
-// CHECK: [[TMP3:%.*]] = icmp ne <4 x i32> [[TMP2]], zeroinitializer
-// CHECK: [[VTST_I:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i32>
-// CHECK: ret <4 x i32> [[VTST_I]]
+// CHECK-LABEL: define <4 x i32> @test_vtstq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = and <4 x i32> [[A]], [[B]]
+// CHECK-NEXT: [[TMP3:%.*]] = icmp ne <4 x i32> [[TMP2]], zeroinitializer
+// CHECK-NEXT: [[VTST_I:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[VTST_I]]
+//
uint32x4_t test_vtstq_s32(int32x4_t a, int32x4_t b) {
return vtstq_s32(a, b);
}
-// CHECK-LABEL: @test_vtstq_u8(
-// CHECK: [[TMP0:%.*]] = and <16 x i8> %a, %b
-// CHECK: [[TMP1:%.*]] = icmp ne <16 x i8> [[TMP0]], zeroinitializer
-// CHECK: [[VTST_I:%.*]] = sext <16 x i1> [[TMP1]] to <16 x i8>
-// CHECK: ret <16 x i8> [[VTST_I]]
+// CHECK-LABEL: define <16 x i8> @test_vtstq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = and <16 x i8> [[A]], [[B]]
+// CHECK-NEXT: [[TMP1:%.*]] = icmp ne <16 x i8> [[TMP0]], zeroinitializer
+// CHECK-NEXT: [[VTST_I:%.*]] = sext <16 x i1> [[TMP1]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[VTST_I]]
+//
uint8x16_t test_vtstq_u8(uint8x16_t a, uint8x16_t b) {
return vtstq_u8(a, b);
}
-// CHECK-LABEL: @test_vtstq_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = and <8 x i16> %a, %b
-// CHECK: [[TMP3:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer
-// CHECK: [[VTST_I:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i16>
-// CHECK: ret <8 x i16> [[VTST_I]]
+// CHECK-LABEL: define <8 x i16> @test_vtstq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = and <8 x i16> [[A]], [[B]]
+// CHECK-NEXT: [[TMP3:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer
+// CHECK-NEXT: [[VTST_I:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[VTST_I]]
+//
uint16x8_t test_vtstq_u16(uint16x8_t a, uint16x8_t b) {
return vtstq_u16(a, b);
}
-// CHECK-LABEL: @test_vtstq_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = and <4 x i32> %a, %b
-// CHECK: [[TMP3:%.*]] = icmp ne <4 x i32> [[TMP2]], zeroinitializer
-// CHECK: [[VTST_I:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i32>
-// CHECK: ret <4 x i32> [[VTST_I]]
+// CHECK-LABEL: define <4 x i32> @test_vtstq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = and <4 x i32> [[A]], [[B]]
+// CHECK-NEXT: [[TMP3:%.*]] = icmp ne <4 x i32> [[TMP2]], zeroinitializer
+// CHECK-NEXT: [[VTST_I:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[VTST_I]]
+//
uint32x4_t test_vtstq_u32(uint32x4_t a, uint32x4_t b) {
return vtstq_u32(a, b);
}
-// CHECK-LABEL: @test_vtstq_p8(
-// CHECK: [[TMP0:%.*]] = and <16 x i8> %a, %b
-// CHECK: [[TMP1:%.*]] = icmp ne <16 x i8> [[TMP0]], zeroinitializer
-// CHECK: [[VTST_I:%.*]] = sext <16 x i1> [[TMP1]] to <16 x i8>
-// CHECK: ret <16 x i8> [[VTST_I]]
+// CHECK-LABEL: define <16 x i8> @test_vtstq_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = and <16 x i8> [[A]], [[B]]
+// CHECK-NEXT: [[TMP1:%.*]] = icmp ne <16 x i8> [[TMP0]], zeroinitializer
+// CHECK-NEXT: [[VTST_I:%.*]] = sext <16 x i1> [[TMP1]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[VTST_I]]
+//
uint8x16_t test_vtstq_p8(poly8x16_t a, poly8x16_t b) {
return vtstq_p8(a, b);
}
-// CHECK-LABEL: @test_vtstq_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = and <8 x i16> %a, %b
-// CHECK: [[TMP3:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer
-// CHECK: [[VTST_I:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i16>
-// CHECK: ret <8 x i16> [[VTST_I]]
+// CHECK-LABEL: define <8 x i16> @test_vtstq_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = and <8 x i16> [[A]], [[B]]
+// CHECK-NEXT: [[TMP3:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer
+// CHECK-NEXT: [[VTST_I:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[VTST_I]]
+//
uint16x8_t test_vtstq_p16(poly16x8_t a, poly16x8_t b) {
return vtstq_p16(a, b);
}
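The vtst checks are the same three-instruction pattern for every element type: AND the inputs, compare the result against zero, and sign-extend the lane-wise i1 so each lane becomes all-ones or all-zeros. A scalar model of the 8x8 case (vtst_u8_ref is a hypothetical name):

#include <stdint.h>
#include <stdio.h>

/* Scalar model of vtst_u8/vtst_s8/vtst_p8: lane i is 0xFF when
 * (a[i] & b[i]) has any bit set, else 0x00 -- the and / icmp ne /
 * sext sequence in the checks above. */
static void vtst_u8_ref(const uint8_t a[8], const uint8_t b[8],
                        uint8_t out[8]) {
    for (int i = 0; i < 8; ++i)
        out[i] = (a[i] & b[i]) != 0 ? 0xFF : 0x00;
}

int main(void) {
    uint8_t a[8] = {1, 2, 4, 8, 0, 3, 0x80, 0xFF};
    uint8_t b[8] = {1, 1, 4, 7, 9, 0, 0x80, 0x00};
    uint8_t r[8];
    vtst_u8_ref(a, b, r);
    for (int i = 0; i < 8; ++i)
        printf("%02x ", r[i]); /* ff 00 ff 00 00 00 ff 00 */
    printf("\n");
    return 0;
}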
-// CHECK: @test_vuzp_s8({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-// CHECK: store <8 x i8> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-// CHECK: store <8 x i8> [[VUZP1_I]], ptr [[TMP2]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vuzp_s8(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT8X8X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META57:![0-9]+]])
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK-NEXT: store <8 x i8> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META57]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <8 x i8>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK-NEXT: store <8 x i8> [[VUZP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META57]]
+// CHECK-NEXT: ret void
+//
int8x8x2_t test_vuzp_s8(int8x8_t a, int8x8_t b) {
return vuzp_s8(a, b);
}
-// CHECK: @test_vuzp_s16({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-// CHECK: store <4 x i16> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-// CHECK: store <4 x i16> [[VUZP1_I]], ptr [[TMP4]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vuzp_s16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT16X4X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META60:![0-9]+]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK-NEXT: store <4 x i16> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META60]]
+// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <4 x i16>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK-NEXT: store <4 x i16> [[VUZP1_I]], ptr [[TMP2]], align 4, !alias.scope [[META60]]
+// CHECK-NEXT: ret void
+//
int16x4x2_t test_vuzp_s16(int16x4_t a, int16x4_t b) {
return vuzp_s16(a, b);
}
-// CHECK: @test_vuzp_s32({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VUZP_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
-// CHECK: store <2 x i32> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
-// CHECK: store <2 x i32> [[VUZP1_I]], ptr [[TMP4]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vuzp_s32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT32X2X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META63:![0-9]+]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> <i32 0, i32 2>
+// CHECK-NEXT: store <2 x i32> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META63]]
+// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <2 x i32>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> <i32 1, i32 3>
+// CHECK-NEXT: store <2 x i32> [[VUZP1_I]], ptr [[TMP2]], align 4, !alias.scope [[META63]]
+// CHECK-NEXT: ret void
+//
int32x2x2_t test_vuzp_s32(int32x2_t a, int32x2_t b) {
return vuzp_s32(a, b);
}
-// CHECK: @test_vuzp_u8({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-// CHECK: store <8 x i8> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-// CHECK: store <8 x i8> [[VUZP1_I]], ptr [[TMP2]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vuzp_u8(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT8X8X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META66:![0-9]+]])
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK-NEXT: store <8 x i8> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META66]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <8 x i8>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK-NEXT: store <8 x i8> [[VUZP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META66]]
+// CHECK-NEXT: ret void
+//
uint8x8x2_t test_vuzp_u8(uint8x8_t a, uint8x8_t b) {
return vuzp_u8(a, b);
}
-// CHECK: @test_vuzp_u16({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-// CHECK: store <4 x i16> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-// CHECK: store <4 x i16> [[VUZP1_I]], ptr [[TMP4]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vuzp_u16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT16X4X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META69:![0-9]+]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK-NEXT: store <4 x i16> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META69]]
+// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <4 x i16>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK-NEXT: store <4 x i16> [[VUZP1_I]], ptr [[TMP2]], align 4, !alias.scope [[META69]]
+// CHECK-NEXT: ret void
+//
uint16x4x2_t test_vuzp_u16(uint16x4_t a, uint16x4_t b) {
return vuzp_u16(a, b);
}
-// CHECK: @test_vuzp_u32({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VUZP_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
-// CHECK: store <2 x i32> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
-// CHECK: store <2 x i32> [[VUZP1_I]], ptr [[TMP4]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vuzp_u32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT32X2X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META72:![0-9]+]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> <i32 0, i32 2>
+// CHECK-NEXT: store <2 x i32> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META72]]
+// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <2 x i32>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> <i32 1, i32 3>
+// CHECK-NEXT: store <2 x i32> [[VUZP1_I]], ptr [[TMP2]], align 4, !alias.scope [[META72]]
+// CHECK-NEXT: ret void
+//
uint32x2x2_t test_vuzp_u32(uint32x2_t a, uint32x2_t b) {
return vuzp_u32(a, b);
}
-// CHECK: @test_vuzp_f32({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %b to <8 x i8>
-// CHECK: [[VUZP_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 2>
-// CHECK: store <2 x float> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x float>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3>
-// CHECK: store <2 x float> [[VUZP1_I]], ptr [[TMP4]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vuzp_f32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT32X2X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META75:![0-9]+]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[B]], <2 x i32> <i32 0, i32 2>
+// CHECK-NEXT: store <2 x float> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META75]]
+// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <2 x float>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[B]], <2 x i32> <i32 1, i32 3>
+// CHECK-NEXT: store <2 x float> [[VUZP1_I]], ptr [[TMP2]], align 4, !alias.scope [[META75]]
+// CHECK-NEXT: ret void
+//
float32x2x2_t test_vuzp_f32(float32x2_t a, float32x2_t b) {
return vuzp_f32(a, b);
}
-// CHECK: @test_vuzp_p8({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-// CHECK: store <8 x i8> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-// CHECK: store <8 x i8> [[VUZP1_I]], ptr [[TMP2]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vuzp_p8(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY8X8X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META78:![0-9]+]])
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK-NEXT: store <8 x i8> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META78]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <8 x i8>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK-NEXT: store <8 x i8> [[VUZP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META78]]
+// CHECK-NEXT: ret void
+//
poly8x8x2_t test_vuzp_p8(poly8x8_t a, poly8x8_t b) {
return vuzp_p8(a, b);
}
-// CHECK: @test_vuzp_p16({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-// CHECK: store <4 x i16> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-// CHECK: store <4 x i16> [[VUZP1_I]], ptr [[TMP4]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vuzp_p16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY16X4X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META81:![0-9]+]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK-NEXT: store <4 x i16> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META81]]
+// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <4 x i16>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK-NEXT: store <4 x i16> [[VUZP1_I]], ptr [[TMP2]], align 4, !alias.scope [[META81]]
+// CHECK-NEXT: ret void
+//
poly16x4x2_t test_vuzp_p16(poly16x4_t a, poly16x4_t b) {
return vuzp_p16(a, b);
}
-// CHECK: @test_vuzpq_s8({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[VUZP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
-// CHECK: store <16 x i8> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
-// CHECK: store <16 x i8> [[VUZP1_I]], ptr [[TMP2]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vuzpq_s8(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT8X16X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META84:![0-9]+]])
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+// CHECK-NEXT: store <16 x i8> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META84]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <16 x i8>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+// CHECK-NEXT: store <16 x i8> [[VUZP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META84]]
+// CHECK-NEXT: ret void
+//
int8x16x2_t test_vuzpq_s8(int8x16_t a, int8x16_t b) {
return vuzpq_s8(a, b);
}
-// CHECK: @test_vuzpq_s16({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-// CHECK: store <8 x i16> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-// CHECK: store <8 x i16> [[VUZP1_I]], ptr [[TMP4]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vuzpq_s16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT16X8X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META87:![0-9]+]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK-NEXT: store <8 x i16> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META87]]
+// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <8 x i16>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK-NEXT: store <8 x i16> [[VUZP1_I]], ptr [[TMP2]], align 4, !alias.scope [[META87]]
+// CHECK-NEXT: ret void
+//
int16x8x2_t test_vuzpq_s16(int16x8_t a, int16x8_t b) {
return vuzpq_s16(a, b);
}
-// CHECK: @test_vuzpq_s32({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-// CHECK: store <4 x i32> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-// CHECK: store <4 x i32> [[VUZP1_I]], ptr [[TMP4]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vuzpq_s32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT32X4X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META90:![0-9]+]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK-NEXT: store <4 x i32> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META90]]
+// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <4 x i32>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK-NEXT: store <4 x i32> [[VUZP1_I]], ptr [[TMP2]], align 4, !alias.scope [[META90]]
+// CHECK-NEXT: ret void
+//
int32x4x2_t test_vuzpq_s32(int32x4_t a, int32x4_t b) {
return vuzpq_s32(a, b);
}
-// CHECK: @test_vuzpq_u8({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[VUZP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
-// CHECK: store <16 x i8> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
-// CHECK: store <16 x i8> [[VUZP1_I]], ptr [[TMP2]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vuzpq_u8(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT8X16X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META93:![0-9]+]])
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+// CHECK-NEXT: store <16 x i8> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META93]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <16 x i8>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+// CHECK-NEXT: store <16 x i8> [[VUZP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META93]]
+// CHECK-NEXT: ret void
+//
uint8x16x2_t test_vuzpq_u8(uint8x16_t a, uint8x16_t b) {
return vuzpq_u8(a, b);
}
-// CHECK: @test_vuzpq_u16({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-// CHECK: store <8 x i16> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-// CHECK: store <8 x i16> [[VUZP1_I]], ptr [[TMP4]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vuzpq_u16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT16X8X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META96:![0-9]+]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK-NEXT: store <8 x i16> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META96]]
+// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <8 x i16>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK-NEXT: store <8 x i16> [[VUZP1_I]], ptr [[TMP2]], align 4, !alias.scope [[META96]]
+// CHECK-NEXT: ret void
+//
uint16x8x2_t test_vuzpq_u16(uint16x8_t a, uint16x8_t b) {
return vuzpq_u16(a, b);
}
-// CHECK: @test_vuzpq_u32({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-// CHECK: store <4 x i32> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-// CHECK: store <4 x i32> [[VUZP1_I]], ptr [[TMP4]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vuzpq_u32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT32X4X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META99:![0-9]+]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK-NEXT: store <4 x i32> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META99]]
+// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <4 x i32>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK-NEXT: store <4 x i32> [[VUZP1_I]], ptr [[TMP2]], align 4, !alias.scope [[META99]]
+// CHECK-NEXT: ret void
+//
uint32x4x2_t test_vuzpq_u32(uint32x4_t a, uint32x4_t b) {
return vuzpq_u32(a, b);
}
-// CHECK: @test_vuzpq_f32({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %b to <16 x i8>
-// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-// CHECK: store <4 x float> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x float>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-// CHECK: store <4 x float> [[VUZP1_I]], ptr [[TMP4]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vuzpq_f32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT32X4X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META102:![0-9]+]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK-NEXT: store <4 x float> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META102]]
+// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <4 x float>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK-NEXT: store <4 x float> [[VUZP1_I]], ptr [[TMP2]], align 4, !alias.scope [[META102]]
+// CHECK-NEXT: ret void
+//
float32x4x2_t test_vuzpq_f32(float32x4_t a, float32x4_t b) {
return vuzpq_f32(a, b);
}
-// CHECK: @test_vuzpq_p8({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[VUZP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
-// CHECK: store <16 x i8> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
-// CHECK: store <16 x i8> [[VUZP1_I]], ptr [[TMP2]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vuzpq_p8(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY8X16X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META105:![0-9]+]])
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+// CHECK-NEXT: store <16 x i8> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META105]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <16 x i8>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+// CHECK-NEXT: store <16 x i8> [[VUZP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META105]]
+// CHECK-NEXT: ret void
+//
poly8x16x2_t test_vuzpq_p8(poly8x16_t a, poly8x16_t b) {
return vuzpq_p8(a, b);
}
-// CHECK: @test_vuzpq_p16({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-// CHECK: store <8 x i16> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-// CHECK: store <8 x i16> [[VUZP1_I]], ptr [[TMP4]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vuzpq_p16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY16X8X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META108:![0-9]+]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK-NEXT: store <8 x i16> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META108]]
+// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <8 x i16>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK-NEXT: store <8 x i16> [[VUZP1_I]], ptr [[TMP2]], align 4, !alias.scope [[META108]]
+// CHECK-NEXT: ret void
+//
poly16x8x2_t test_vuzpq_p16(poly16x8_t a, poly16x8_t b) {
return vuzpq_p16(a, b);
}
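vuzp de-interleaves: reading the concatenation a:b left to right, val[0] takes the even-indexed lanes and val[1] the odd-indexed ones, which is exactly the <0,2,4,...> and <1,3,5,...> masks checked above. A scalar sketch (vuzp_s8_ref is a hypothetical name):

#include <stdint.h>
#include <stdio.h>

/* Scalar model of vuzp_s8: de-interleave the 16-lane concatenation
 * a:b into even lanes (out[0]) and odd lanes (out[1]). */
static void vuzp_s8_ref(const int8_t a[8], const int8_t b[8],
                        int8_t out[2][8]) {
    int8_t cat[16];
    for (int i = 0; i < 8; ++i) { cat[i] = a[i]; cat[8 + i] = b[i]; }
    for (int i = 0; i < 8; ++i) {
        out[0][i] = cat[2 * i];     /* mask <0,2,4,6,8,10,12,14>  */
        out[1][i] = cat[2 * i + 1]; /* mask <1,3,5,7,9,11,13,15>  */
    }
}

int main(void) {
    int8_t a[8] = {0, 1, 2, 3, 4, 5, 6, 7};
    int8_t b[8] = {8, 9, 10, 11, 12, 13, 14, 15};
    int8_t r[2][8];
    vuzp_s8_ref(a, b, r);
    for (int j = 0; j < 2; ++j) {
        for (int i = 0; i < 8; ++i) printf("%d ", r[j][i]);
        printf("\n"); /* 0 2 4 6 8 10 12 14 / 1 3 5 7 9 11 13 15 */
    }
    return 0;
}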
-// CHECK: @test_vzip_s8({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
-// CHECK: store <8 x i8> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-// CHECK: store <8 x i8> [[VZIP1_I]], ptr [[TMP2]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vzip_s8(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT8X8X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META111:![0-9]+]])
+// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+// CHECK-NEXT: store <8 x i8> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META111]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <8 x i8>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK-NEXT: store <8 x i8> [[VZIP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META111]]
+// CHECK-NEXT: ret void
+//
int8x8x2_t test_vzip_s8(int8x8_t a, int8x8_t b) {
return vzip_s8(a, b);
}
-// CHECK: @test_vzip_s16({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
-// CHECK: store <4 x i16> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
-// CHECK: store <4 x i16> [[VZIP1_I]], ptr [[TMP4]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vzip_s16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT16X4X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META114:![0-9]+]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK-NEXT: store <4 x i16> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META114]]
+// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <4 x i16>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK-NEXT: store <4 x i16> [[VZIP1_I]], ptr [[TMP2]], align 4, !alias.scope [[META114]]
+// CHECK-NEXT: ret void
+//
int16x4x2_t test_vzip_s16(int16x4_t a, int16x4_t b) {
return vzip_s16(a, b);
}
-// CHECK: @test_vzip_s32({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VZIP_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
-// CHECK: store <2 x i32> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VZIP1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
-// CHECK: store <2 x i32> [[VZIP1_I]], ptr [[TMP4]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vzip_s32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT32X2X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META117:![0-9]+]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> <i32 0, i32 2>
+// CHECK-NEXT: store <2 x i32> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META117]]
+// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <2 x i32>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> <i32 1, i32 3>
+// CHECK-NEXT: store <2 x i32> [[VZIP1_I]], ptr [[TMP2]], align 4, !alias.scope [[META117]]
+// CHECK-NEXT: ret void
+//
int32x2x2_t test_vzip_s32(int32x2_t a, int32x2_t b) {
return vzip_s32(a, b);
}
-// CHECK: @test_vzip_u8({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
-// CHECK: store <8 x i8> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-// CHECK: store <8 x i8> [[VZIP1_I]], ptr [[TMP2]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vzip_u8(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT8X8X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META120:![0-9]+]])
+// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+// CHECK-NEXT: store <8 x i8> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META120]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <8 x i8>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK-NEXT: store <8 x i8> [[VZIP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META120]]
+// CHECK-NEXT: ret void
+//
uint8x8x2_t test_vzip_u8(uint8x8_t a, uint8x8_t b) {
return vzip_u8(a, b);
}
-// CHECK: @test_vzip_u16({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
-// CHECK: store <4 x i16> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
-// CHECK: store <4 x i16> [[VZIP1_I]], ptr [[TMP4]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vzip_u16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT16X4X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META123:![0-9]+]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK-NEXT: store <4 x i16> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META123]]
+// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <4 x i16>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK-NEXT: store <4 x i16> [[VZIP1_I]], ptr [[TMP2]], align 4, !alias.scope [[META123]]
+// CHECK-NEXT: ret void
+//
uint16x4x2_t test_vzip_u16(uint16x4_t a, uint16x4_t b) {
return vzip_u16(a, b);
}
-// CHECK: @test_vzip_u32({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VZIP_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
-// CHECK: store <2 x i32> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VZIP1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
-// CHECK: store <2 x i32> [[VZIP1_I]], ptr [[TMP4]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vzip_u32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT32X2X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META126:![0-9]+]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> <i32 0, i32 2>
+// CHECK-NEXT: store <2 x i32> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META126]]
+// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <2 x i32>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> <i32 1, i32 3>
+// CHECK-NEXT: store <2 x i32> [[VZIP1_I]], ptr [[TMP2]], align 4, !alias.scope [[META126]]
+// CHECK-NEXT: ret void
+//
uint32x2x2_t test_vzip_u32(uint32x2_t a, uint32x2_t b) {
return vzip_u32(a, b);
}
-// CHECK: @test_vzip_f32({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %b to <8 x i8>
-// CHECK: [[VZIP_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 2>
-// CHECK: store <2 x float> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x float>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VZIP1_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3>
-// CHECK: store <2 x float> [[VZIP1_I]], ptr [[TMP4]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vzip_f32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT32X2X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META129:![0-9]+]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[B]], <2 x i32> <i32 0, i32 2>
+// CHECK-NEXT: store <2 x float> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META129]]
+// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <2 x float>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[B]], <2 x i32> <i32 1, i32 3>
+// CHECK-NEXT: store <2 x float> [[VZIP1_I]], ptr [[TMP2]], align 4, !alias.scope [[META129]]
+// CHECK-NEXT: ret void
+//
float32x2x2_t test_vzip_f32(float32x2_t a, float32x2_t b) {
return vzip_f32(a, b);
}
-// CHECK: @test_vzip_p8({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
-// CHECK: store <8 x i8> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-// CHECK: store <8 x i8> [[VZIP1_I]], ptr [[TMP2]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vzip_p8(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY8X8X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META132:![0-9]+]])
+// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+// CHECK-NEXT: store <8 x i8> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META132]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <8 x i8>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK-NEXT: store <8 x i8> [[VZIP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META132]]
+// CHECK-NEXT: ret void
+//
poly8x8x2_t test_vzip_p8(poly8x8_t a, poly8x8_t b) {
return vzip_p8(a, b);
}
-// CHECK: @test_vzip_p16({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
-// CHECK: store <4 x i16> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
-// CHECK: store <4 x i16> [[VZIP1_I]], ptr [[TMP4]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vzip_p16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY16X4X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META135:![0-9]+]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK-NEXT: store <4 x i16> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META135]]
+// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <4 x i16>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK-NEXT: store <4 x i16> [[VZIP1_I]], ptr [[TMP2]], align 4, !alias.scope [[META135]]
+// CHECK-NEXT: ret void
+//
poly16x4x2_t test_vzip_p16(poly16x4_t a, poly16x4_t b) {
return vzip_p16(a, b);
}
-// CHECK: @test_vzipq_s8({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[VZIP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
-// CHECK: store <16 x i8> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VZIP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
-// CHECK: store <16 x i8> [[VZIP1_I]], ptr [[TMP2]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vzipq_s8(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT8X16X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META138:![0-9]+]])
+// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+// CHECK-NEXT: store <16 x i8> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META138]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <16 x i8>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+// CHECK-NEXT: store <16 x i8> [[VZIP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META138]]
+// CHECK-NEXT: ret void
+//
int8x16x2_t test_vzipq_s8(int8x16_t a, int8x16_t b) {
return vzipq_s8(a, b);
}
-// CHECK: @test_vzipq_s16({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
-// CHECK: store <8 x i16> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-// CHECK: store <8 x i16> [[VZIP1_I]], ptr [[TMP4]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vzipq_s16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT16X8X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META141:![0-9]+]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+// CHECK-NEXT: store <8 x i16> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META141]]
+// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <8 x i16>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK-NEXT: store <8 x i16> [[VZIP1_I]], ptr [[TMP2]], align 4, !alias.scope [[META141]]
+// CHECK-NEXT: ret void
+//
int16x8x2_t test_vzipq_s16(int16x8_t a, int16x8_t b) {
return vzipq_s16(a, b);
}
-// CHECK: @test_vzipq_s32({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
-// CHECK: store <4 x i32> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
-// CHECK: store <4 x i32> [[VZIP1_I]], ptr [[TMP4]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vzipq_s32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT32X4X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META144:![0-9]+]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK-NEXT: store <4 x i32> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META144]]
+// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <4 x i32>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK-NEXT: store <4 x i32> [[VZIP1_I]], ptr [[TMP2]], align 4, !alias.scope [[META144]]
+// CHECK-NEXT: ret void
+//
int32x4x2_t test_vzipq_s32(int32x4_t a, int32x4_t b) {
return vzipq_s32(a, b);
}
-// CHECK: @test_vzipq_u8({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[VZIP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
-// CHECK: store <16 x i8> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VZIP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
-// CHECK: store <16 x i8> [[VZIP1_I]], ptr [[TMP2]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vzipq_u8(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT8X16X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META147:![0-9]+]])
+// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+// CHECK-NEXT: store <16 x i8> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META147]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <16 x i8>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+// CHECK-NEXT: store <16 x i8> [[VZIP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META147]]
+// CHECK-NEXT: ret void
+//
uint8x16x2_t test_vzipq_u8(uint8x16_t a, uint8x16_t b) {
return vzipq_u8(a, b);
}
-// CHECK: @test_vzipq_u16({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
-// CHECK: store <8 x i16> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-// CHECK: store <8 x i16> [[VZIP1_I]], ptr [[TMP4]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vzipq_u16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT16X8X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META150:![0-9]+]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+// CHECK-NEXT: store <8 x i16> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META150]]
+// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <8 x i16>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK-NEXT: store <8 x i16> [[VZIP1_I]], ptr [[TMP2]], align 4, !alias.scope [[META150]]
+// CHECK-NEXT: ret void
+//
uint16x8x2_t test_vzipq_u16(uint16x8_t a, uint16x8_t b) {
return vzipq_u16(a, b);
}
-// CHECK: @test_vzipq_u32({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
-// CHECK: store <4 x i32> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
-// CHECK: store <4 x i32> [[VZIP1_I]], ptr [[TMP4]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vzipq_u32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT32X4X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META153:![0-9]+]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK-NEXT: store <4 x i32> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META153]]
+// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <4 x i32>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK-NEXT: store <4 x i32> [[VZIP1_I]], ptr [[TMP2]], align 4, !alias.scope [[META153]]
+// CHECK-NEXT: ret void
+//
uint32x4x2_t test_vzipq_u32(uint32x4_t a, uint32x4_t b) {
return vzipq_u32(a, b);
}
-// CHECK: @test_vzipq_f32({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %b to <16 x i8>
-// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
-// CHECK: store <4 x float> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x float>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
-// CHECK: store <4 x float> [[VZIP1_I]], ptr [[TMP4]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vzipq_f32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT32X4X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META156:![0-9]+]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK-NEXT: store <4 x float> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META156]]
+// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <4 x float>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK-NEXT: store <4 x float> [[VZIP1_I]], ptr [[TMP2]], align 4, !alias.scope [[META156]]
+// CHECK-NEXT: ret void
+//
float32x4x2_t test_vzipq_f32(float32x4_t a, float32x4_t b) {
return vzipq_f32(a, b);
}
-// CHECK: @test_vzipq_p8({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[VZIP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
-// CHECK: store <16 x i8> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VZIP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
-// CHECK: store <16 x i8> [[VZIP1_I]], ptr [[TMP2]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vzipq_p8(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY8X16X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META159:![0-9]+]])
+// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+// CHECK-NEXT: store <16 x i8> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META159]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <16 x i8>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+// CHECK-NEXT: store <16 x i8> [[VZIP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META159]]
+// CHECK-NEXT: ret void
+//
poly8x16x2_t test_vzipq_p8(poly8x16_t a, poly8x16_t b) {
return vzipq_p8(a, b);
}
-// CHECK: @test_vzipq_p16({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
-// CHECK: store <8 x i16> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-// CHECK: store <8 x i16> [[VZIP1_I]], ptr [[TMP4]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vzipq_p16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY16X8X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META162:![0-9]+]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+// CHECK-NEXT: store <8 x i16> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META162]]
+// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <8 x i16>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK-NEXT: store <8 x i16> [[VZIP1_I]], ptr [[TMP2]], align 4, !alias.scope [[META162]]
+// CHECK-NEXT: ret void
+//
poly16x8x2_t test_vzipq_p16(poly16x8_t a, poly16x8_t b) {
return vzipq_p16(a, b);
}
+//.
+// CHECK: [[META3]] = !{[[META4:![0-9]+]]}
+// CHECK: [[META4]] = distinct !{[[META4]], [[META5:![0-9]+]], !"vtrn_s8: %agg.result"}
+// CHECK: [[META5]] = distinct !{[[META5]], !"vtrn_s8"}
+// CHECK: [[META6]] = !{[[META7:![0-9]+]]}
+// CHECK: [[META7]] = distinct !{[[META7]], [[META8:![0-9]+]], !"vtrn_s16: %agg.result"}
+// CHECK: [[META8]] = distinct !{[[META8]], !"vtrn_s16"}
+// CHECK: [[META9]] = !{[[META10:![0-9]+]]}
+// CHECK: [[META10]] = distinct !{[[META10]], [[META11:![0-9]+]], !"vtrn_s32: %agg.result"}
+// CHECK: [[META11]] = distinct !{[[META11]], !"vtrn_s32"}
+// CHECK: [[META12]] = !{[[META13:![0-9]+]]}
+// CHECK: [[META13]] = distinct !{[[META13]], [[META14:![0-9]+]], !"vtrn_u8: %agg.result"}
+// CHECK: [[META14]] = distinct !{[[META14]], !"vtrn_u8"}
+// CHECK: [[META15]] = !{[[META16:![0-9]+]]}
+// CHECK: [[META16]] = distinct !{[[META16]], [[META17:![0-9]+]], !"vtrn_u16: %agg.result"}
+// CHECK: [[META17]] = distinct !{[[META17]], !"vtrn_u16"}
+// CHECK: [[META18]] = !{[[META19:![0-9]+]]}
+// CHECK: [[META19]] = distinct !{[[META19]], [[META20:![0-9]+]], !"vtrn_u32: %agg.result"}
+// CHECK: [[META20]] = distinct !{[[META20]], !"vtrn_u32"}
+// CHECK: [[META21]] = !{[[META22:![0-9]+]]}
+// CHECK: [[META22]] = distinct !{[[META22]], [[META23:![0-9]+]], !"vtrn_f32: %agg.result"}
+// CHECK: [[META23]] = distinct !{[[META23]], !"vtrn_f32"}
+// CHECK: [[META24]] = !{[[META25:![0-9]+]]}
+// CHECK: [[META25]] = distinct !{[[META25]], [[META26:![0-9]+]], !"vtrn_p8: %agg.result"}
+// CHECK: [[META26]] = distinct !{[[META26]], !"vtrn_p8"}
+// CHECK: [[META27]] = !{[[META28:![0-9]+]]}
+// CHECK: [[META28]] = distinct !{[[META28]], [[META29:![0-9]+]], !"vtrn_p16: %agg.result"}
+// CHECK: [[META29]] = distinct !{[[META29]], !"vtrn_p16"}
+// CHECK: [[META30]] = !{[[META31:![0-9]+]]}
+// CHECK: [[META31]] = distinct !{[[META31]], [[META32:![0-9]+]], !"vtrnq_s8: %agg.result"}
+// CHECK: [[META32]] = distinct !{[[META32]], !"vtrnq_s8"}
+// CHECK: [[META33]] = !{[[META34:![0-9]+]]}
+// CHECK: [[META34]] = distinct !{[[META34]], [[META35:![0-9]+]], !"vtrnq_s16: %agg.result"}
+// CHECK: [[META35]] = distinct !{[[META35]], !"vtrnq_s16"}
+// CHECK: [[META36]] = !{[[META37:![0-9]+]]}
+// CHECK: [[META37]] = distinct !{[[META37]], [[META38:![0-9]+]], !"vtrnq_s32: %agg.result"}
+// CHECK: [[META38]] = distinct !{[[META38]], !"vtrnq_s32"}
+// CHECK: [[META39]] = !{[[META40:![0-9]+]]}
+// CHECK: [[META40]] = distinct !{[[META40]], [[META41:![0-9]+]], !"vtrnq_u8: %agg.result"}
+// CHECK: [[META41]] = distinct !{[[META41]], !"vtrnq_u8"}
+// CHECK: [[META42]] = !{[[META43:![0-9]+]]}
+// CHECK: [[META43]] = distinct !{[[META43]], [[META44:![0-9]+]], !"vtrnq_u16: %agg.result"}
+// CHECK: [[META44]] = distinct !{[[META44]], !"vtrnq_u16"}
+// CHECK: [[META45]] = !{[[META46:![0-9]+]]}
+// CHECK: [[META46]] = distinct !{[[META46]], [[META47:![0-9]+]], !"vtrnq_u32: %agg.result"}
+// CHECK: [[META47]] = distinct !{[[META47]], !"vtrnq_u32"}
+// CHECK: [[META48]] = !{[[META49:![0-9]+]]}
+// CHECK: [[META49]] = distinct !{[[META49]], [[META50:![0-9]+]], !"vtrnq_f32: %agg.result"}
+// CHECK: [[META50]] = distinct !{[[META50]], !"vtrnq_f32"}
+// CHECK: [[META51]] = !{[[META52:![0-9]+]]}
+// CHECK: [[META52]] = distinct !{[[META52]], [[META53:![0-9]+]], !"vtrnq_p8: %agg.result"}
+// CHECK: [[META53]] = distinct !{[[META53]], !"vtrnq_p8"}
+// CHECK: [[META54]] = !{[[META55:![0-9]+]]}
+// CHECK: [[META55]] = distinct !{[[META55]], [[META56:![0-9]+]], !"vtrnq_p16: %agg.result"}
+// CHECK: [[META56]] = distinct !{[[META56]], !"vtrnq_p16"}
+// CHECK: [[META57]] = !{[[META58:![0-9]+]]}
+// CHECK: [[META58]] = distinct !{[[META58]], [[META59:![0-9]+]], !"vuzp_s8: %agg.result"}
+// CHECK: [[META59]] = distinct !{[[META59]], !"vuzp_s8"}
+// CHECK: [[META60]] = !{[[META61:![0-9]+]]}
+// CHECK: [[META61]] = distinct !{[[META61]], [[META62:![0-9]+]], !"vuzp_s16: %agg.result"}
+// CHECK: [[META62]] = distinct !{[[META62]], !"vuzp_s16"}
+// CHECK: [[META63]] = !{[[META64:![0-9]+]]}
+// CHECK: [[META64]] = distinct !{[[META64]], [[META65:![0-9]+]], !"vuzp_s32: %agg.result"}
+// CHECK: [[META65]] = distinct !{[[META65]], !"vuzp_s32"}
+// CHECK: [[META66]] = !{[[META67:![0-9]+]]}
+// CHECK: [[META67]] = distinct !{[[META67]], [[META68:![0-9]+]], !"vuzp_u8: %agg.result"}
+// CHECK: [[META68]] = distinct !{[[META68]], !"vuzp_u8"}
+// CHECK: [[META69]] = !{[[META70:![0-9]+]]}
+// CHECK: [[META70]] = distinct !{[[META70]], [[META71:![0-9]+]], !"vuzp_u16: %agg.result"}
+// CHECK: [[META71]] = distinct !{[[META71]], !"vuzp_u16"}
+// CHECK: [[META72]] = !{[[META73:![0-9]+]]}
+// CHECK: [[META73]] = distinct !{[[META73]], [[META74:![0-9]+]], !"vuzp_u32: %agg.result"}
+// CHECK: [[META74]] = distinct !{[[META74]], !"vuzp_u32"}
+// CHECK: [[META75]] = !{[[META76:![0-9]+]]}
+// CHECK: [[META76]] = distinct !{[[META76]], [[META77:![0-9]+]], !"vuzp_f32: %agg.result"}
+// CHECK: [[META77]] = distinct !{[[META77]], !"vuzp_f32"}
+// CHECK: [[META78]] = !{[[META79:![0-9]+]]}
+// CHECK: [[META79]] = distinct !{[[META79]], [[META80:![0-9]+]], !"vuzp_p8: %agg.result"}
+// CHECK: [[META80]] = distinct !{[[META80]], !"vuzp_p8"}
+// CHECK: [[META81]] = !{[[META82:![0-9]+]]}
+// CHECK: [[META82]] = distinct !{[[META82]], [[META83:![0-9]+]], !"vuzp_p16: %agg.result"}
+// CHECK: [[META83]] = distinct !{[[META83]], !"vuzp_p16"}
+// CHECK: [[META84]] = !{[[META85:![0-9]+]]}
+// CHECK: [[META85]] = distinct !{[[META85]], [[META86:![0-9]+]], !"vuzpq_s8: %agg.result"}
+// CHECK: [[META86]] = distinct !{[[META86]], !"vuzpq_s8"}
+// CHECK: [[META87]] = !{[[META88:![0-9]+]]}
+// CHECK: [[META88]] = distinct !{[[META88]], [[META89:![0-9]+]], !"vuzpq_s16: %agg.result"}
+// CHECK: [[META89]] = distinct !{[[META89]], !"vuzpq_s16"}
+// CHECK: [[META90]] = !{[[META91:![0-9]+]]}
+// CHECK: [[META91]] = distinct !{[[META91]], [[META92:![0-9]+]], !"vuzpq_s32: %agg.result"}
+// CHECK: [[META92]] = distinct !{[[META92]], !"vuzpq_s32"}
+// CHECK: [[META93]] = !{[[META94:![0-9]+]]}
+// CHECK: [[META94]] = distinct !{[[META94]], [[META95:![0-9]+]], !"vuzpq_u8: %agg.result"}
+// CHECK: [[META95]] = distinct !{[[META95]], !"vuzpq_u8"}
+// CHECK: [[META96]] = !{[[META97:![0-9]+]]}
+// CHECK: [[META97]] = distinct !{[[META97]], [[META98:![0-9]+]], !"vuzpq_u16: %agg.result"}
+// CHECK: [[META98]] = distinct !{[[META98]], !"vuzpq_u16"}
+// CHECK: [[META99]] = !{[[META100:![0-9]+]]}
+// CHECK: [[META100]] = distinct !{[[META100]], [[META101:![0-9]+]], !"vuzpq_u32: %agg.result"}
+// CHECK: [[META101]] = distinct !{[[META101]], !"vuzpq_u32"}
+// CHECK: [[META102]] = !{[[META103:![0-9]+]]}
+// CHECK: [[META103]] = distinct !{[[META103]], [[META104:![0-9]+]], !"vuzpq_f32: %agg.result"}
+// CHECK: [[META104]] = distinct !{[[META104]], !"vuzpq_f32"}
+// CHECK: [[META105]] = !{[[META106:![0-9]+]]}
+// CHECK: [[META106]] = distinct !{[[META106]], [[META107:![0-9]+]], !"vuzpq_p8: %agg.result"}
+// CHECK: [[META107]] = distinct !{[[META107]], !"vuzpq_p8"}
+// CHECK: [[META108]] = !{[[META109:![0-9]+]]}
+// CHECK: [[META109]] = distinct !{[[META109]], [[META110:![0-9]+]], !"vuzpq_p16: %agg.result"}
+// CHECK: [[META110]] = distinct !{[[META110]], !"vuzpq_p16"}
+// CHECK: [[META111]] = !{[[META112:![0-9]+]]}
+// CHECK: [[META112]] = distinct !{[[META112]], [[META113:![0-9]+]], !"vzip_s8: %agg.result"}
+// CHECK: [[META113]] = distinct !{[[META113]], !"vzip_s8"}
+// CHECK: [[META114]] = !{[[META115:![0-9]+]]}
+// CHECK: [[META115]] = distinct !{[[META115]], [[META116:![0-9]+]], !"vzip_s16: %agg.result"}
+// CHECK: [[META116]] = distinct !{[[META116]], !"vzip_s16"}
+// CHECK: [[META117]] = !{[[META118:![0-9]+]]}
+// CHECK: [[META118]] = distinct !{[[META118]], [[META119:![0-9]+]], !"vzip_s32: %agg.result"}
+// CHECK: [[META119]] = distinct !{[[META119]], !"vzip_s32"}
+// CHECK: [[META120]] = !{[[META121:![0-9]+]]}
+// CHECK: [[META121]] = distinct !{[[META121]], [[META122:![0-9]+]], !"vzip_u8: %agg.result"}
+// CHECK: [[META122]] = distinct !{[[META122]], !"vzip_u8"}
+// CHECK: [[META123]] = !{[[META124:![0-9]+]]}
+// CHECK: [[META124]] = distinct !{[[META124]], [[META125:![0-9]+]], !"vzip_u16: %agg.result"}
+// CHECK: [[META125]] = distinct !{[[META125]], !"vzip_u16"}
+// CHECK: [[META126]] = !{[[META127:![0-9]+]]}
+// CHECK: [[META127]] = distinct !{[[META127]], [[META128:![0-9]+]], !"vzip_u32: %agg.result"}
+// CHECK: [[META128]] = distinct !{[[META128]], !"vzip_u32"}
+// CHECK: [[META129]] = !{[[META130:![0-9]+]]}
+// CHECK: [[META130]] = distinct !{[[META130]], [[META131:![0-9]+]], !"vzip_f32: %agg.result"}
+// CHECK: [[META131]] = distinct !{[[META131]], !"vzip_f32"}
+// CHECK: [[META132]] = !{[[META133:![0-9]+]]}
+// CHECK: [[META133]] = distinct !{[[META133]], [[META134:![0-9]+]], !"vzip_p8: %agg.result"}
+// CHECK: [[META134]] = distinct !{[[META134]], !"vzip_p8"}
+// CHECK: [[META135]] = !{[[META136:![0-9]+]]}
+// CHECK: [[META136]] = distinct !{[[META136]], [[META137:![0-9]+]], !"vzip_p16: %agg.result"}
+// CHECK: [[META137]] = distinct !{[[META137]], !"vzip_p16"}
+// CHECK: [[META138]] = !{[[META139:![0-9]+]]}
+// CHECK: [[META139]] = distinct !{[[META139]], [[META140:![0-9]+]], !"vzipq_s8: %agg.result"}
+// CHECK: [[META140]] = distinct !{[[META140]], !"vzipq_s8"}
+// CHECK: [[META141]] = !{[[META142:![0-9]+]]}
+// CHECK: [[META142]] = distinct !{[[META142]], [[META143:![0-9]+]], !"vzipq_s16: %agg.result"}
+// CHECK: [[META143]] = distinct !{[[META143]], !"vzipq_s16"}
+// CHECK: [[META144]] = !{[[META145:![0-9]+]]}
+// CHECK: [[META145]] = distinct !{[[META145]], [[META146:![0-9]+]], !"vzipq_s32: %agg.result"}
+// CHECK: [[META146]] = distinct !{[[META146]], !"vzipq_s32"}
+// CHECK: [[META147]] = !{[[META148:![0-9]+]]}
+// CHECK: [[META148]] = distinct !{[[META148]], [[META149:![0-9]+]], !"vzipq_u8: %agg.result"}
+// CHECK: [[META149]] = distinct !{[[META149]], !"vzipq_u8"}
+// CHECK: [[META150]] = !{[[META151:![0-9]+]]}
+// CHECK: [[META151]] = distinct !{[[META151]], [[META152:![0-9]+]], !"vzipq_u16: %agg.result"}
+// CHECK: [[META152]] = distinct !{[[META152]], !"vzipq_u16"}
+// CHECK: [[META153]] = !{[[META154:![0-9]+]]}
+// CHECK: [[META154]] = distinct !{[[META154]], [[META155:![0-9]+]], !"vzipq_u32: %agg.result"}
+// CHECK: [[META155]] = distinct !{[[META155]], !"vzipq_u32"}
+// CHECK: [[META156]] = !{[[META157:![0-9]+]]}
+// CHECK: [[META157]] = distinct !{[[META157]], [[META158:![0-9]+]], !"vzipq_f32: %agg.result"}
+// CHECK: [[META158]] = distinct !{[[META158]], !"vzipq_f32"}
+// CHECK: [[META159]] = !{[[META160:![0-9]+]]}
+// CHECK: [[META160]] = distinct !{[[META160]], [[META161:![0-9]+]], !"vzipq_p8: %agg.result"}
+// CHECK: [[META161]] = distinct !{[[META161]], !"vzipq_p8"}
+// CHECK: [[META162]] = !{[[META163:![0-9]+]]}
+// CHECK: [[META163]] = distinct !{[[META163]], [[META164:![0-9]+]], !"vzipq_p16: %agg.result"}
+// CHECK: [[META164]] = distinct !{[[META164]], !"vzipq_p16"}
+//.
diff --git a/llvm/utils/UpdateTestChecks/common.py b/llvm/utils/UpdateTestChecks/common.py
index b108a21dbc52b8..329e8ecfbab0cc 100644
--- a/llvm/utils/UpdateTestChecks/common.py
+++ b/llvm/utils/UpdateTestChecks/common.py
@@ -557,6 +557,11 @@ def invoke_tool(exe, cmd_args, ir, preprocess_cmd=None, verbose=False):
UTC_AVOID = "NOTE: Do not autogenerate"
UNUSED_NOTE = "NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:"
+DATA_LAYOUT_RE = re.compile(
+ r"target\sdatalayout\s=\s\"(?P<layout>.+)\"$",
+ flags=(re.M | re.S)
+)
+
OPT_FUNCTION_RE = re.compile(
r"^(\s*;\s*Function\sAttrs:\s(?P<attrs>[\w\s():,]+?))?\s*define\s+(?P<funcdef_attrs_and_ret>[^@]*)@(?P<func>[\w.$-]+?)\s*"
r"(?P<args_and_sig>\((\)|(.*?[\w.-]+?)\))[^{]*\{)\n(?P<body>.*?)^\}$",
@@ -650,6 +655,16 @@ def get_triple_from_march(march):
print("Cannot find a triple. Assume 'x86'", file=sys.stderr)
return "x86"
+def get_global_underscores(raw_tool_output):
+ m = DATA_LAYOUT_RE.search(raw_tool_output)
+ if not m:
+ return False
+ data_layout = m.group("layout")
+ idx = data_layout.find("m:")
+ if idx < 0:
+ return False
+ ch = data_layout[idx + 2]
+ return ch == 'o' or ch == 'x'
def apply_filters(line, filters):
has_filter = False
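
For context, the `m:` component of an LLVM data layout string selects the
symbol-mangling scheme, and the `o` (Mach-O) and `x` (Windows x86 COFF)
schemes are the ones that prefix global C symbols with an underscore. A
minimal standalone sketch of the same probe (hypothetical helper name,
independent of the UpdateTestChecks module; the data layout string is
illustrative):

    import re

    # "m:<char>" picks the mangling scheme; 'o' (Mach-O) and 'x'
    # (Windows x86 COFF) prepend '_' to global C symbol names.
    DATA_LAYOUT = re.compile(r'target datalayout = "(?P<layout>[^"]+)"')

    def mangles_with_underscore(ir_text):
        m = DATA_LAYOUT.search(ir_text)
        if not m:
            return False
        layout = m.group("layout")
        idx = layout.find("m:")
        return idx >= 0 and idx + 2 < len(layout) and layout[idx + 2] in "ox"

    ir = 'target datalayout = "e-m:o-p:32:32-i64:64-n32-S64"\n'
    print(mangles_with_underscore(ir))  # True for a Mach-O target
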
diff --git a/llvm/utils/update_cc_test_checks.py b/llvm/utils/update_cc_test_checks.py
index 3ffb07ddf6ad8f..062971687ab235 100755
--- a/llvm/utils/update_cc_test_checks.py
+++ b/llvm/utils/update_cc_test_checks.py
@@ -34,7 +34,7 @@
}
-def get_line2func_list(args, clang_args):
+def get_line2func_list(args, clang_args, global_underscores):
ret = collections.defaultdict(list)
# Use clang's JSON AST dump to get the mangled name
json_dump_args = [args.clang] + clang_args + ["-fsyntax-only", "-o", "-"]
@@ -122,6 +122,11 @@ def parse_clang_ast_json(node, loc, search):
if search is None:
search = spell
mangled = node.get("mangledName", spell)
+ # Strip leading underscore from globals, so the name matches the LLVM one
+ if global_underscores:
+ storage = node.get("storageClass", None)
+ if storage != "static" and mangled[0] == '_':
+ mangled = mangled[1:]
ret[int(line) - 1].append((spell, mangled, search))
ast = json.loads(stdout)
@@ -249,10 +254,8 @@ def config():
return args, parser
-def get_function_body(builder, args, filename, clang_args, extra_commands, prefixes):
+def get_function_body(builder, args, filename, clang_args, extra_commands, prefixes, raw_tool_output):
# TODO Clean up duplication of asm/common build_function_body_dictionary
- # Invoke external tool and extract function bodies.
- raw_tool_output = common.invoke_tool(args.clang, clang_args, filename)
for extra_command in extra_commands:
extra_args = shlex.split(extra_command)
with tempfile.NamedTemporaryFile() as f:
@@ -383,13 +386,15 @@ def main():
common.debug("Extracted clang cmd: clang {}".format(clang_args))
common.debug("Extracted FileCheck prefixes: {}".format(prefixes))
+ # Invoke external tool and extract function bodies.
+ raw_tool_output = common.invoke_tool(ti.args.clang, clang_args, ti.path)
get_function_body(
- builder, ti.args, ti.path, clang_args, extra_commands, prefixes
+ builder, ti.args, ti.path, clang_args, extra_commands, prefixes, raw_tool_output
)
# Invoke clang -Xclang -ast-dump=json to get mapping from start lines to
# mangled names. Forward all clang args for now.
- for k, v in get_line2func_list(ti.args, clang_args).items():
+ for k, v in get_line2func_list(ti.args, clang_args, common.get_global_underscores(raw_tool_output)).items():
line2func_list[k].extend(v)
func_dict = builder.finish_and_get_func_dict()
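
To make the stripping rule concrete, here is a small sketch of the check
applied in `parse_clang_ast_json` (names and values are illustrative, not
taken from a real AST dump):

    # On targets whose data layout mangles globals with '_', clang's AST
    # dump reports e.g. "_foo" while the IR defines @foo, so the
    # underscore is dropped for non-static symbols only.
    def strip_global_underscore(mangled, storage, global_underscores):
        if global_underscores and storage != "static" and mangled.startswith("_"):
            return mangled[1:]
        return mangled

    assert strip_global_underscore("_foo", None, True) == "foo"
    assert strip_global_underscore("_bar", "static", True) == "_bar"
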
>From 67ba481f809abdc85a90fed5ff11ca48c6578e3e Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov at arm.com>
Date: Fri, 27 Dec 2024 19:29:36 +0000
Subject: [PATCH 02/11] [Arm] Regenerate tests (NFC)
This patch adds `instcombine` to some tests that were passing
Clang's output through `opt -S --passes=mem2reg` only, and converts
some other tests to use `update_cc_test_checks.py`.
The assembly part of some tests was also split out and moved to LLVM.
This makes it easier to compare test changes in upcoming patches.
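
As an illustration, the typical RUN-line change (taken from the
bf16-dotprod-intrinsics.c hunk below) is

    ... | opt -S -passes=mem2reg | FileCheck %s

becoming

    ... | opt -S -passes=mem2reg,instcombine | FileCheck %s

which lets `instcombine` fold away the no-op bitcasts that otherwise show
up as dead [[TMP...]] values in the generated CHECK lines.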
---
.../CodeGen/AArch64/bf16-dotprod-intrinsics.c | 190 +-
.../CodeGen/AArch64/bf16-getset-intrinsics.c | 50 +-
.../AArch64/bf16-reinterpret-intrinsics.c | 381 +-
clang/test/CodeGen/AArch64/neon-2velem.c | 2953 +-
clang/test/CodeGen/AArch64/neon-extract.c | 288 +-
clang/test/CodeGen/AArch64/neon-fma.c | 108 +-
clang/test/CodeGen/AArch64/neon-fp16fml.c | 906 +-
.../AArch64/neon-intrinsics-constrained.c | 1626 +-
clang/test/CodeGen/AArch64/neon-intrinsics.c | 25159 ++++++++++------
.../CodeGen/AArch64/neon-ldst-one-rcpc3.c | 98 +-
clang/test/CodeGen/AArch64/neon-ldst-one.c | 11123 ++++---
.../CodeGen/AArch64/neon-misc-constrained.c | 84 +-
clang/test/CodeGen/AArch64/neon-misc.c | 3490 ++-
clang/test/CodeGen/AArch64/neon-perm.c | 2505 +-
.../neon-scalar-x-indexed-elem-constrained.c | 223 +-
.../AArch64/neon-scalar-x-indexed-elem.c | 590 +-
clang/test/CodeGen/AArch64/poly-add.c | 37 +-
clang/test/CodeGen/AArch64/poly128.c | 66 +-
clang/test/CodeGen/AArch64/poly64.c | 856 +-
.../CodeGen/AArch64/v8.1a-neon-intrinsics.c | 86 +-
.../v8.2a-neon-intrinsics-constrained.c | 566 +-
.../AArch64/v8.2a-neon-intrinsics-generic.c | 208 +-
.../CodeGen/AArch64/v8.2a-neon-intrinsics.c | 537 +-
.../AArch64/v8.5a-neon-frint3264-intrinsic.c | 147 +-
.../CodeGen/AArch64/v8.6a-neon-intrinsics.c | 192 +-
.../CodeGen/arm-bf16-convert-intrinsics.c | 390 +-
.../CodeGen/arm-bf16-dotprod-intrinsics.c | 192 +-
.../test/CodeGen/arm-bf16-getset-intrinsics.c | 52 +-
.../arm-neon-directed-rounding-constrained.c | 92 +-
.../test/CodeGen/arm-neon-directed-rounding.c | 233 +-
clang/test/CodeGen/arm-neon-fma.c | 40 +-
clang/test/CodeGen/arm-neon-numeric-maxmin.c | 18 +-
clang/test/CodeGen/arm-neon-vcvtX.c | 34 +-
clang/test/CodeGen/arm-poly-add.c | 65 +-
.../test/CodeGen/arm-v8.1a-neon-intrinsics.c | 196 +-
.../arm-v8.2a-neon-intrinsics-generic.c | 396 +-
.../test/CodeGen/arm-v8.2a-neon-intrinsics.c | 1061 +-
.../test/CodeGen/arm-v8.6a-neon-intrinsics.c | 110 +-
clang/test/CodeGen/arm64_vdupq_n_f64.c | 82 +-
clang/test/CodeGen/arm_neon_intrinsics.c | 10919 +++----
.../CodeGen/AArch64/neon-misc-constrained.ll | 46 +
.../AArch64/neon-misc-unconstrained.ll | 45 +
.../neon-scalar-x-indexed-elem-constrained.ll | 103 +
...eon-scalar-x-indexed-elem-unconstrained.ll | 103 +
.../v8.2a-neon-intrinsics-constrained.ll | 276 +
.../v8.2a-neon-intrinsics-unconstrained.ll | 265 +
46 files changed, 35376 insertions(+), 31811 deletions(-)
create mode 100644 llvm/test/CodeGen/AArch64/neon-misc-constrained.ll
create mode 100644 llvm/test/CodeGen/AArch64/neon-misc-unconstrained.ll
create mode 100644 llvm/test/CodeGen/AArch64/neon-scalar-x-indexed-elem-constrained.ll
create mode 100644 llvm/test/CodeGen/AArch64/neon-scalar-x-indexed-elem-unconstrained.ll
create mode 100644 llvm/test/CodeGen/AArch64/v8.2a-neon-intrinsics-constrained.ll
create mode 100644 llvm/test/CodeGen/AArch64/v8.2a-neon-intrinsics-unconstrained.ll
diff --git a/clang/test/CodeGen/AArch64/bf16-dotprod-intrinsics.c b/clang/test/CodeGen/AArch64/bf16-dotprod-intrinsics.c
index 877d83c0fa3954..caa803ee794603 100644
--- a/clang/test/CodeGen/AArch64/bf16-dotprod-intrinsics.c
+++ b/clang/test/CodeGen/AArch64/bf16-dotprod-intrinsics.c
@@ -1,6 +1,6 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
// RUN: %clang_cc1 -triple aarch64 -target-feature +neon -target-feature +bf16 \
-// RUN: -disable-O0-optnone -emit-llvm %s -o - | opt -S -passes=mem2reg | FileCheck %s
+// RUN: -disable-O0-optnone -emit-llvm %s -o - | opt -S -passes=mem2reg,instcombine | FileCheck %s
// REQUIRES: aarch64-registered-target || arm-registered-target
@@ -8,10 +8,7 @@
// CHECK-LABEL: @test_vbfdot_f32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[R:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x bfloat> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> [[R]], <4 x bfloat> [[A]], <4 x bfloat> [[B]])
+// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> [[R:%.*]], <4 x bfloat> [[A:%.*]], <4 x bfloat> [[B:%.*]])
// CHECK-NEXT: ret <2 x float> [[VBFDOT3_I]]
//
float32x2_t test_vbfdot_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b) {
@@ -20,10 +17,7 @@ float32x2_t test_vbfdot_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b) {
// CHECK-LABEL: @test_vbfdotq_f32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <16 x i8>
-// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[B]])
+// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[B:%.*]])
// CHECK-NEXT: ret <4 x float> [[VBFDOT3_I]]
//
float32x4_t test_vbfdotq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b){
@@ -32,19 +26,10 @@ float32x4_t test_vbfdotq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b){
// CHECK-LABEL: @test_vbfdot_lane_f32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[__REINT_128:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-NEXT: [[__REINT1_128:%.*]] = alloca <2 x float>, align 8
-// CHECK-NEXT: store <4 x bfloat> [[B:%.*]], ptr [[__REINT_128]], align 8
-// CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[__REINT_128]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[TMP0]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP2]], <2 x i32> zeroinitializer
-// CHECK-NEXT: store <2 x float> [[LANE]], ptr [[__REINT1_128]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = load <4 x bfloat>, ptr [[__REINT1_128]], align 8
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x float> [[R:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x bfloat> [[TMP3]] to <8 x i8>
-// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> [[R]], <4 x bfloat> [[A]], <4 x bfloat> [[TMP3]])
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x bfloat> [[B:%.*]] to <2 x float>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[DOTCAST]], <2 x float> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <2 x float> [[LANE]] to <4 x bfloat>
+// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> [[R:%.*]], <4 x bfloat> [[A:%.*]], <4 x bfloat> [[DOTCAST1]])
// CHECK-NEXT: ret <2 x float> [[VBFDOT3_I]]
//
float32x2_t test_vbfdot_lane_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b){
@@ -53,19 +38,10 @@ float32x2_t test_vbfdot_lane_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b){
// CHECK-LABEL: @test_vbfdotq_laneq_f32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[__REINT_130:%.*]] = alloca <8 x bfloat>, align 16
-// CHECK-NEXT: [[__REINT1_130:%.*]] = alloca <4 x float>, align 16
-// CHECK-NEXT: store <8 x bfloat> [[B:%.*]], ptr [[__REINT_130]], align 16
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[__REINT_130]], align 16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[TMP0]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP2]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT: store <4 x float> [[LANE]], ptr [[__REINT1_130]], align 16
-// CHECK-NEXT: [[TMP3:%.*]] = load <8 x bfloat>, ptr [[__REINT1_130]], align 16
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x bfloat> [[TMP3]] to <16 x i8>
-// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[TMP3]])
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <4 x float>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[DOTCAST]], <4 x float> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <4 x float> [[LANE]] to <8 x bfloat>
+// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[DOTCAST1]])
// CHECK-NEXT: ret <4 x float> [[VBFDOT3_I]]
//
float32x4_t test_vbfdotq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
@@ -74,19 +50,10 @@ float32x4_t test_vbfdotq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b
// CHECK-LABEL: @test_vbfdot_laneq_f32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[__REINT_132:%.*]] = alloca <8 x bfloat>, align 16
-// CHECK-NEXT: [[__REINT1_132:%.*]] = alloca <2 x float>, align 8
-// CHECK-NEXT: store <8 x bfloat> [[B:%.*]], ptr [[__REINT_132]], align 16
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[__REINT_132]], align 16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[TMP0]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP2]], <2 x i32> <i32 3, i32 3>
-// CHECK-NEXT: store <2 x float> [[LANE]], ptr [[__REINT1_132]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = load <4 x bfloat>, ptr [[__REINT1_132]], align 8
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x float> [[R:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x bfloat> [[TMP3]] to <8 x i8>
-// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> [[R]], <4 x bfloat> [[A]], <4 x bfloat> [[TMP3]])
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <4 x float>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[DOTCAST]], <4 x float> poison, <2 x i32> <i32 3, i32 3>
+// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <2 x float> [[LANE]] to <4 x bfloat>
+// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> [[R:%.*]], <4 x bfloat> [[A:%.*]], <4 x bfloat> [[DOTCAST1]])
// CHECK-NEXT: ret <2 x float> [[VBFDOT3_I]]
//
float32x2_t test_vbfdot_laneq_f32(float32x2_t r, bfloat16x4_t a, bfloat16x8_t b) {
@@ -95,19 +62,10 @@ float32x2_t test_vbfdot_laneq_f32(float32x2_t r, bfloat16x4_t a, bfloat16x8_t b)
// CHECK-LABEL: @test_vbfdotq_lane_f32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[__REINT_126:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-NEXT: [[__REINT1_126:%.*]] = alloca <4 x float>, align 16
-// CHECK-NEXT: store <4 x bfloat> [[B:%.*]], ptr [[__REINT_126]], align 8
-// CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[__REINT_126]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[TMP0]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP2]], <4 x i32> zeroinitializer
-// CHECK-NEXT: store <4 x float> [[LANE]], ptr [[__REINT1_126]], align 16
-// CHECK-NEXT: [[TMP3:%.*]] = load <8 x bfloat>, ptr [[__REINT1_126]], align 16
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x bfloat> [[TMP3]] to <16 x i8>
-// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[TMP3]])
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x bfloat> [[B:%.*]] to <2 x float>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[DOTCAST]], <2 x float> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <4 x float> [[LANE]] to <8 x bfloat>
+// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[DOTCAST1]])
// CHECK-NEXT: ret <4 x float> [[VBFDOT3_I]]
//
float32x4_t test_vbfdotq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) {
@@ -116,11 +74,7 @@ float32x4_t test_vbfdotq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
// CHECK-LABEL: @test_vbfmmlaq_f32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <16 x i8>
-// CHECK-NEXT: [[VBFMMLAQ_F323_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmmla(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[B]])
-// CHECK-NEXT: [[VBFMMLAQ_F324_I:%.*]] = bitcast <4 x float> [[VBFMMLAQ_F323_I]] to <16 x i8>
+// CHECK-NEXT: [[VBFMMLAQ_F323_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmmla(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[B:%.*]])
// CHECK-NEXT: ret <4 x float> [[VBFMMLAQ_F323_I]]
//
float32x4_t test_vbfmmlaq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
@@ -129,11 +83,7 @@ float32x4_t test_vbfmmlaq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
// CHECK-LABEL: @test_vbfmlalbq_f32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <16 x i8>
-// CHECK-NEXT: [[VBFMLALBQ_F323_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[B]])
-// CHECK-NEXT: [[VBFMLALBQ_F324_I:%.*]] = bitcast <4 x float> [[VBFMLALBQ_F323_I]] to <16 x i8>
+// CHECK-NEXT: [[VBFMLALBQ_F323_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[B:%.*]])
// CHECK-NEXT: ret <4 x float> [[VBFMLALBQ_F323_I]]
//
float32x4_t test_vbfmlalbq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
@@ -142,11 +92,7 @@ float32x4_t test_vbfmlalbq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
// CHECK-LABEL: @test_vbfmlaltq_f32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <16 x i8>
-// CHECK-NEXT: [[VBFMLALTQ_F323_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[B]])
-// CHECK-NEXT: [[VBFMLALTQ_F324_I:%.*]] = bitcast <4 x float> [[VBFMLALTQ_F323_I]] to <16 x i8>
+// CHECK-NEXT: [[VBFMLALTQ_F323_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[B:%.*]])
// CHECK-NEXT: ret <4 x float> [[VBFMLALTQ_F323_I]]
//
float32x4_t test_vbfmlaltq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
@@ -155,27 +101,8 @@ float32x4_t test_vbfmlaltq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
// CHECK-LABEL: @test_vbfmlalbq_lane_f32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x bfloat> [[B:%.*]], i32 0
-// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x bfloat> poison, bfloat [[VGET_LANE]], i32 0
-// CHECK-NEXT: [[VGET_LANE3:%.*]] = extractelement <4 x bfloat> [[B]], i32 0
-// CHECK-NEXT: [[VECINIT5:%.*]] = insertelement <8 x bfloat> [[VECINIT]], bfloat [[VGET_LANE3]], i32 1
-// CHECK-NEXT: [[VGET_LANE8:%.*]] = extractelement <4 x bfloat> [[B]], i32 0
-// CHECK-NEXT: [[VECINIT10:%.*]] = insertelement <8 x bfloat> [[VECINIT5]], bfloat [[VGET_LANE8]], i32 2
-// CHECK-NEXT: [[VGET_LANE13:%.*]] = extractelement <4 x bfloat> [[B]], i32 0
-// CHECK-NEXT: [[VECINIT15:%.*]] = insertelement <8 x bfloat> [[VECINIT10]], bfloat [[VGET_LANE13]], i32 3
-// CHECK-NEXT: [[VGET_LANE18:%.*]] = extractelement <4 x bfloat> [[B]], i32 0
-// CHECK-NEXT: [[VECINIT20:%.*]] = insertelement <8 x bfloat> [[VECINIT15]], bfloat [[VGET_LANE18]], i32 4
-// CHECK-NEXT: [[VGET_LANE23:%.*]] = extractelement <4 x bfloat> [[B]], i32 0
-// CHECK-NEXT: [[VECINIT25:%.*]] = insertelement <8 x bfloat> [[VECINIT20]], bfloat [[VGET_LANE23]], i32 5
-// CHECK-NEXT: [[VGET_LANE28:%.*]] = extractelement <4 x bfloat> [[B]], i32 0
-// CHECK-NEXT: [[VECINIT30:%.*]] = insertelement <8 x bfloat> [[VECINIT25]], bfloat [[VGET_LANE28]], i32 6
-// CHECK-NEXT: [[VGET_LANE33:%.*]] = extractelement <4 x bfloat> [[B]], i32 0
-// CHECK-NEXT: [[VECINIT35:%.*]] = insertelement <8 x bfloat> [[VECINIT30]], bfloat [[VGET_LANE33]], i32 7
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[VECINIT35]] to <16 x i8>
-// CHECK-NEXT: [[VBFMLALBQ_F323_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[VECINIT35]])
-// CHECK-NEXT: [[VBFMLALBQ_F324_I:%.*]] = bitcast <4 x float> [[VBFMLALBQ_F323_I]] to <16 x i8>
+// CHECK-NEXT: [[VECINIT35:%.*]] = shufflevector <4 x bfloat> [[B:%.*]], <4 x bfloat> poison, <8 x i32> zeroinitializer
+// CHECK-NEXT: [[VBFMLALBQ_F323_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[VECINIT35]])
// CHECK-NEXT: ret <4 x float> [[VBFMLALBQ_F323_I]]
//
float32x4_t test_vbfmlalbq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) {
@@ -184,27 +111,8 @@ float32x4_t test_vbfmlalbq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t
// CHECK-LABEL: @test_vbfmlalbq_laneq_f32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x bfloat> [[B:%.*]], i32 3
-// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x bfloat> poison, bfloat [[VGETQ_LANE]], i32 0
-// CHECK-NEXT: [[VGETQ_LANE3:%.*]] = extractelement <8 x bfloat> [[B]], i32 3
-// CHECK-NEXT: [[VECINIT5:%.*]] = insertelement <8 x bfloat> [[VECINIT]], bfloat [[VGETQ_LANE3]], i32 1
-// CHECK-NEXT: [[VGETQ_LANE8:%.*]] = extractelement <8 x bfloat> [[B]], i32 3
-// CHECK-NEXT: [[VECINIT10:%.*]] = insertelement <8 x bfloat> [[VECINIT5]], bfloat [[VGETQ_LANE8]], i32 2
-// CHECK-NEXT: [[VGETQ_LANE13:%.*]] = extractelement <8 x bfloat> [[B]], i32 3
-// CHECK-NEXT: [[VECINIT15:%.*]] = insertelement <8 x bfloat> [[VECINIT10]], bfloat [[VGETQ_LANE13]], i32 3
-// CHECK-NEXT: [[VGETQ_LANE18:%.*]] = extractelement <8 x bfloat> [[B]], i32 3
-// CHECK-NEXT: [[VECINIT20:%.*]] = insertelement <8 x bfloat> [[VECINIT15]], bfloat [[VGETQ_LANE18]], i32 4
-// CHECK-NEXT: [[VGETQ_LANE23:%.*]] = extractelement <8 x bfloat> [[B]], i32 3
-// CHECK-NEXT: [[VECINIT25:%.*]] = insertelement <8 x bfloat> [[VECINIT20]], bfloat [[VGETQ_LANE23]], i32 5
-// CHECK-NEXT: [[VGETQ_LANE28:%.*]] = extractelement <8 x bfloat> [[B]], i32 3
-// CHECK-NEXT: [[VECINIT30:%.*]] = insertelement <8 x bfloat> [[VECINIT25]], bfloat [[VGETQ_LANE28]], i32 6
-// CHECK-NEXT: [[VGETQ_LANE33:%.*]] = extractelement <8 x bfloat> [[B]], i32 3
-// CHECK-NEXT: [[VECINIT35:%.*]] = insertelement <8 x bfloat> [[VECINIT30]], bfloat [[VGETQ_LANE33]], i32 7
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[VECINIT35]] to <16 x i8>
-// CHECK-NEXT: [[VBFMLALBQ_F323_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[VECINIT35]])
-// CHECK-NEXT: [[VBFMLALBQ_F324_I:%.*]] = bitcast <4 x float> [[VBFMLALBQ_F323_I]] to <16 x i8>
+// CHECK-NEXT: [[VECINIT35:%.*]] = shufflevector <8 x bfloat> [[B:%.*]], <8 x bfloat> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[VBFMLALBQ_F323_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[VECINIT35]])
// CHECK-NEXT: ret <4 x float> [[VBFMLALBQ_F323_I]]
//
float32x4_t test_vbfmlalbq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
@@ -213,27 +121,8 @@ float32x4_t test_vbfmlalbq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t
// CHECK-LABEL: @test_vbfmlaltq_lane_f32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x bfloat> [[B:%.*]], i32 0
-// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x bfloat> poison, bfloat [[VGET_LANE]], i32 0
-// CHECK-NEXT: [[VGET_LANE3:%.*]] = extractelement <4 x bfloat> [[B]], i32 0
-// CHECK-NEXT: [[VECINIT5:%.*]] = insertelement <8 x bfloat> [[VECINIT]], bfloat [[VGET_LANE3]], i32 1
-// CHECK-NEXT: [[VGET_LANE8:%.*]] = extractelement <4 x bfloat> [[B]], i32 0
-// CHECK-NEXT: [[VECINIT10:%.*]] = insertelement <8 x bfloat> [[VECINIT5]], bfloat [[VGET_LANE8]], i32 2
-// CHECK-NEXT: [[VGET_LANE13:%.*]] = extractelement <4 x bfloat> [[B]], i32 0
-// CHECK-NEXT: [[VECINIT15:%.*]] = insertelement <8 x bfloat> [[VECINIT10]], bfloat [[VGET_LANE13]], i32 3
-// CHECK-NEXT: [[VGET_LANE18:%.*]] = extractelement <4 x bfloat> [[B]], i32 0
-// CHECK-NEXT: [[VECINIT20:%.*]] = insertelement <8 x bfloat> [[VECINIT15]], bfloat [[VGET_LANE18]], i32 4
-// CHECK-NEXT: [[VGET_LANE23:%.*]] = extractelement <4 x bfloat> [[B]], i32 0
-// CHECK-NEXT: [[VECINIT25:%.*]] = insertelement <8 x bfloat> [[VECINIT20]], bfloat [[VGET_LANE23]], i32 5
-// CHECK-NEXT: [[VGET_LANE28:%.*]] = extractelement <4 x bfloat> [[B]], i32 0
-// CHECK-NEXT: [[VECINIT30:%.*]] = insertelement <8 x bfloat> [[VECINIT25]], bfloat [[VGET_LANE28]], i32 6
-// CHECK-NEXT: [[VGET_LANE33:%.*]] = extractelement <4 x bfloat> [[B]], i32 0
-// CHECK-NEXT: [[VECINIT35:%.*]] = insertelement <8 x bfloat> [[VECINIT30]], bfloat [[VGET_LANE33]], i32 7
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[VECINIT35]] to <16 x i8>
-// CHECK-NEXT: [[VBFMLALTQ_F323_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[VECINIT35]])
-// CHECK-NEXT: [[VBFMLALTQ_F324_I:%.*]] = bitcast <4 x float> [[VBFMLALTQ_F323_I]] to <16 x i8>
+// CHECK-NEXT: [[VECINIT35:%.*]] = shufflevector <4 x bfloat> [[B:%.*]], <4 x bfloat> poison, <8 x i32> zeroinitializer
+// CHECK-NEXT: [[VBFMLALTQ_F323_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[VECINIT35]])
// CHECK-NEXT: ret <4 x float> [[VBFMLALTQ_F323_I]]
//
float32x4_t test_vbfmlaltq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) {
@@ -242,27 +131,8 @@ float32x4_t test_vbfmlaltq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t
// CHECK-LABEL: @test_vbfmlaltq_laneq_f32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x bfloat> [[B:%.*]], i32 3
-// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x bfloat> poison, bfloat [[VGETQ_LANE]], i32 0
-// CHECK-NEXT: [[VGETQ_LANE3:%.*]] = extractelement <8 x bfloat> [[B]], i32 3
-// CHECK-NEXT: [[VECINIT5:%.*]] = insertelement <8 x bfloat> [[VECINIT]], bfloat [[VGETQ_LANE3]], i32 1
-// CHECK-NEXT: [[VGETQ_LANE8:%.*]] = extractelement <8 x bfloat> [[B]], i32 3
-// CHECK-NEXT: [[VECINIT10:%.*]] = insertelement <8 x bfloat> [[VECINIT5]], bfloat [[VGETQ_LANE8]], i32 2
-// CHECK-NEXT: [[VGETQ_LANE13:%.*]] = extractelement <8 x bfloat> [[B]], i32 3
-// CHECK-NEXT: [[VECINIT15:%.*]] = insertelement <8 x bfloat> [[VECINIT10]], bfloat [[VGETQ_LANE13]], i32 3
-// CHECK-NEXT: [[VGETQ_LANE18:%.*]] = extractelement <8 x bfloat> [[B]], i32 3
-// CHECK-NEXT: [[VECINIT20:%.*]] = insertelement <8 x bfloat> [[VECINIT15]], bfloat [[VGETQ_LANE18]], i32 4
-// CHECK-NEXT: [[VGETQ_LANE23:%.*]] = extractelement <8 x bfloat> [[B]], i32 3
-// CHECK-NEXT: [[VECINIT25:%.*]] = insertelement <8 x bfloat> [[VECINIT20]], bfloat [[VGETQ_LANE23]], i32 5
-// CHECK-NEXT: [[VGETQ_LANE28:%.*]] = extractelement <8 x bfloat> [[B]], i32 3
-// CHECK-NEXT: [[VECINIT30:%.*]] = insertelement <8 x bfloat> [[VECINIT25]], bfloat [[VGETQ_LANE28]], i32 6
-// CHECK-NEXT: [[VGETQ_LANE33:%.*]] = extractelement <8 x bfloat> [[B]], i32 3
-// CHECK-NEXT: [[VECINIT35:%.*]] = insertelement <8 x bfloat> [[VECINIT30]], bfloat [[VGETQ_LANE33]], i32 7
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[VECINIT35]] to <16 x i8>
-// CHECK-NEXT: [[VBFMLALTQ_F323_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[VECINIT35]])
-// CHECK-NEXT: [[VBFMLALTQ_F324_I:%.*]] = bitcast <4 x float> [[VBFMLALTQ_F323_I]] to <16 x i8>
+// CHECK-NEXT: [[VECINIT35:%.*]] = shufflevector <8 x bfloat> [[B:%.*]], <8 x bfloat> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[VBFMLALTQ_F323_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[VECINIT35]])
// CHECK-NEXT: ret <4 x float> [[VBFMLALTQ_F323_I]]
//
float32x4_t test_vbfmlaltq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
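Every bfmlal hunk above follows the same two instcombine canonicalizations: the eight-step extractelement/insertelement chain that splats a lane is folded into a single shufflevector with a splat mask, and the <16 x i8> scaffolding bitcasts around the intrinsic call are dropped as dead. A minimal standalone sketch of the folded lane-splat shape (the function name @splat_lane0 is illustrative, not part of the patch):

; Splat lane 0 of a 64-bit vector across all eight lanes of a
; 128-bit vector: one shufflevector with a zeroinitializer mask
; replaces eight extractelement/insertelement pairs.
define <8 x bfloat> @splat_lane0(<4 x bfloat> %b) {
entry:
  %splat = shufflevector <4 x bfloat> %b, <4 x bfloat> poison, <8 x i32> zeroinitializer
  ret <8 x bfloat> %splat
}

This matches the [[VECINIT35]] lines the regenerated checks expect; the laneq variants differ only in using an explicit <i32 3, ...> mask instead of zeroinitializer.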
diff --git a/clang/test/CodeGen/AArch64/bf16-getset-intrinsics.c b/clang/test/CodeGen/AArch64/bf16-getset-intrinsics.c
index 9da2cd5af32212..5455850e6d0f0c 100644
--- a/clang/test/CodeGen/AArch64/bf16-getset-intrinsics.c
+++ b/clang/test/CodeGen/AArch64/bf16-getset-intrinsics.c
@@ -1,6 +1,6 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
// RUN: %clang_cc1 -triple aarch64 -target-feature +neon -target-feature +bf16 \
-// RUN: -disable-O0-optnone -emit-llvm %s -o - | opt -S -passes=mem2reg | FileCheck %s
+// RUN: -disable-O0-optnone -emit-llvm %s -o - | opt -S -passes=mem2reg,instcombine | FileCheck %s
// REQUIRES: aarch64-registered-target || arm-registered-target
@@ -17,10 +17,8 @@ bfloat16x4_t test_vcreate_bf16(uint64_t a) {
// CHECK-LABEL: @test_vdup_n_bf16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x bfloat> poison, bfloat [[V:%.*]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x bfloat> [[VECINIT_I]], bfloat [[V]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x bfloat> [[VECINIT1_I]], bfloat [[V]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x bfloat> [[VECINIT2_I]], bfloat [[V]], i32 3
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x bfloat> poison, bfloat [[V:%.*]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x bfloat> [[VECINIT_I]], <4 x bfloat> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: ret <4 x bfloat> [[VECINIT3_I]]
//
bfloat16x4_t test_vdup_n_bf16(bfloat16_t v) {
@@ -29,14 +27,8 @@ bfloat16x4_t test_vdup_n_bf16(bfloat16_t v) {
// CHECK-LABEL: @test_vdupq_n_bf16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x bfloat> poison, bfloat [[V:%.*]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x bfloat> [[VECINIT_I]], bfloat [[V]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x bfloat> [[VECINIT1_I]], bfloat [[V]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x bfloat> [[VECINIT2_I]], bfloat [[V]], i32 3
-// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x bfloat> [[VECINIT3_I]], bfloat [[V]], i32 4
-// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x bfloat> [[VECINIT4_I]], bfloat [[V]], i32 5
-// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x bfloat> [[VECINIT5_I]], bfloat [[V]], i32 6
-// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x bfloat> [[VECINIT6_I]], bfloat [[V]], i32 7
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x bfloat> poison, bfloat [[V:%.*]], i64 0
+// CHECK-NEXT: [[VECINIT7_I:%.*]] = shufflevector <8 x bfloat> [[VECINIT_I]], <8 x bfloat> poison, <8 x i32> zeroinitializer
// CHECK-NEXT: ret <8 x bfloat> [[VECINIT7_I]]
//
bfloat16x8_t test_vdupq_n_bf16(bfloat16_t v) {
@@ -45,9 +37,7 @@ bfloat16x8_t test_vdupq_n_bf16(bfloat16_t v) {
// CHECK-LABEL: @test_vdup_lane_bf16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x bfloat>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x bfloat> [[TMP1]], <4 x bfloat> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x bfloat> [[V:%.*]], <4 x bfloat> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK-NEXT: ret <4 x bfloat> [[LANE]]
//
bfloat16x4_t test_vdup_lane_bf16(bfloat16x4_t v) {
@@ -56,9 +46,7 @@ bfloat16x4_t test_vdup_lane_bf16(bfloat16x4_t v) {
// CHECK-LABEL: @test_vdupq_lane_bf16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x bfloat>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x bfloat> [[TMP1]], <4 x bfloat> [[TMP1]], <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x bfloat> [[V:%.*]], <4 x bfloat> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
// CHECK-NEXT: ret <8 x bfloat> [[LANE]]
//
bfloat16x8_t test_vdupq_lane_bf16(bfloat16x4_t v) {
@@ -67,9 +55,7 @@ bfloat16x8_t test_vdupq_lane_bf16(bfloat16x4_t v) {
// CHECK-LABEL: @test_vdup_laneq_bf16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x bfloat>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x bfloat> [[TMP1]], <8 x bfloat> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x bfloat> [[V:%.*]], <8 x bfloat> poison, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: ret <4 x bfloat> [[LANE]]
//
bfloat16x4_t test_vdup_laneq_bf16(bfloat16x8_t v) {
@@ -78,9 +64,7 @@ bfloat16x4_t test_vdup_laneq_bf16(bfloat16x8_t v) {
// CHECK-LABEL: @test_vdupq_laneq_bf16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x bfloat>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x bfloat> [[TMP1]], <8 x bfloat> [[TMP1]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x bfloat> [[V:%.*]], <8 x bfloat> poison, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: ret <8 x bfloat> [[LANE]]
//
bfloat16x8_t test_vdupq_laneq_bf16(bfloat16x8_t v) {
@@ -98,7 +82,7 @@ bfloat16x8_t test_vcombine_bf16(bfloat16x4_t low, bfloat16x4_t high) {
// CHECK-LABEL: @test_vget_high_bf16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[A:%.*]], <8 x bfloat> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[A:%.*]], <8 x bfloat> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT: ret <4 x bfloat> [[SHUFFLE_I]]
//
bfloat16x4_t test_vget_high_bf16(bfloat16x8_t a) {
@@ -107,7 +91,7 @@ bfloat16x4_t test_vget_high_bf16(bfloat16x8_t a) {
// CHECK-LABEL: @test_vget_low_bf16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[A:%.*]], <8 x bfloat> [[A]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[A:%.*]], <8 x bfloat> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK-NEXT: ret <4 x bfloat> [[SHUFFLE_I]]
//
bfloat16x4_t test_vget_low_bf16(bfloat16x8_t a) {
@@ -116,7 +100,7 @@ bfloat16x4_t test_vget_low_bf16(bfloat16x8_t a) {
// CHECK-LABEL: @test_vget_lane_bf16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x bfloat> [[V:%.*]], i32 1
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x bfloat> [[V:%.*]], i64 1
// CHECK-NEXT: ret bfloat [[VGET_LANE]]
//
bfloat16_t test_vget_lane_bf16(bfloat16x4_t v) {
@@ -125,7 +109,7 @@ bfloat16_t test_vget_lane_bf16(bfloat16x4_t v) {
// CHECK-LABEL: @test_vgetq_lane_bf16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x bfloat> [[V:%.*]], i32 7
+// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x bfloat> [[V:%.*]], i64 7
// CHECK-NEXT: ret bfloat [[VGETQ_LANE]]
//
bfloat16_t test_vgetq_lane_bf16(bfloat16x8_t v) {
@@ -134,7 +118,7 @@ bfloat16_t test_vgetq_lane_bf16(bfloat16x8_t v) {
// CHECK-LABEL: @test_vset_lane_bf16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <4 x bfloat> [[V:%.*]], bfloat [[A:%.*]], i32 1
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <4 x bfloat> [[V:%.*]], bfloat [[A:%.*]], i64 1
// CHECK-NEXT: ret <4 x bfloat> [[VSET_LANE]]
//
bfloat16x4_t test_vset_lane_bf16(bfloat16_t a, bfloat16x4_t v) {
@@ -143,7 +127,7 @@ bfloat16x4_t test_vset_lane_bf16(bfloat16_t a, bfloat16x4_t v) {
// CHECK-LABEL: @test_vsetq_lane_bf16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <8 x bfloat> [[V:%.*]], bfloat [[A:%.*]], i32 7
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <8 x bfloat> [[V:%.*]], bfloat [[A:%.*]], i64 7
// CHECK-NEXT: ret <8 x bfloat> [[VSET_LANE]]
//
bfloat16x8_t test_vsetq_lane_bf16(bfloat16_t a, bfloat16x8_t v) {
@@ -152,7 +136,7 @@ bfloat16x8_t test_vsetq_lane_bf16(bfloat16_t a, bfloat16x8_t v) {
// CHECK-LABEL: @test_vduph_lane_bf16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x bfloat> [[V:%.*]], i32 1
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x bfloat> [[V:%.*]], i64 1
// CHECK-NEXT: ret bfloat [[VGET_LANE]]
//
bfloat16_t test_vduph_lane_bf16(bfloat16x4_t v) {
@@ -161,7 +145,7 @@ bfloat16_t test_vduph_lane_bf16(bfloat16x4_t v) {
// CHECK-LABEL: @test_vduph_laneq_bf16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x bfloat> [[V:%.*]], i32 7
+// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x bfloat> [[V:%.*]], i64 7
// CHECK-NEXT: ret bfloat [[VGETQ_LANE]]
//
bfloat16_t test_vduph_laneq_bf16(bfloat16x8_t v) {
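The getset hunks combine three simplifications: dup-from-scalar keeps only the first insertelement plus a splat shufflevector, dup-from-lane loses its round-trip bitcasts through <8 x i8>/<16 x i8>, and scalar lane indices are canonicalized from i32 to i64. A short sketch of the dup-from-scalar shape the updated checks look for (the name @dup_n_sketch is illustrative, not from the patch):

; vdup_n lowering after instcombine: insert the scalar into lane 0
; of a poison vector, then splat it with a zero mask. Lane indices
; on insertelement/extractelement are canonicalized to i64.
define <4 x bfloat> @dup_n_sketch(bfloat %v) {
entry:
  %vecinit = insertelement <4 x bfloat> poison, bfloat %v, i64 0
  %splat = shufflevector <4 x bfloat> %vecinit, <4 x bfloat> poison, <4 x i32> zeroinitializer
  ret <4 x bfloat> %splat
}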
diff --git a/clang/test/CodeGen/AArch64/bf16-reinterpret-intrinsics.c b/clang/test/CodeGen/AArch64/bf16-reinterpret-intrinsics.c
index 2b271ac88462bc..62d2ca48f3ffea 100644
--- a/clang/test/CodeGen/AArch64/bf16-reinterpret-intrinsics.c
+++ b/clang/test/CodeGen/AArch64/bf16-reinterpret-intrinsics.c
@@ -1,333 +1,388 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
// RUN: %clang_cc1 -triple aarch64 -target-feature +neon -target-feature +bf16 \
// RUN: -disable-O0-optnone -emit-llvm -o - %s \
-// RUN: | opt -S -passes=mem2reg \
+// RUN: | opt -S -passes=mem2reg,instcombine \
// RUN: | FileCheck %s
// REQUIRES: aarch64-registered-target
#include <arm_neon.h>
-// CHECK-LABEL: @test_vreinterpret_bf16_s8(
-// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A:%.*]] to <4 x bfloat>
+// CHECK-LABEL: define dso_local <4 x bfloat> @test_vreinterpret_bf16_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x bfloat>
// CHECK-NEXT: ret <4 x bfloat> [[TMP0]]
//
bfloat16x4_t test_vreinterpret_bf16_s8(int8x8_t a) { return vreinterpret_bf16_s8(a); }
-// CHECK-LABEL: @test_vreinterpret_bf16_s16(
-// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <4 x bfloat>
+// CHECK-LABEL: define dso_local <4 x bfloat> @test_vreinterpret_bf16_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <4 x bfloat>
// CHECK-NEXT: ret <4 x bfloat> [[TMP0]]
//
bfloat16x4_t test_vreinterpret_bf16_s16(int16x4_t a) { return vreinterpret_bf16_s16(a); }
-// CHECK-LABEL: @test_vreinterpret_bf16_s32(
-// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <4 x bfloat>
+// CHECK-LABEL: define dso_local <4 x bfloat> @test_vreinterpret_bf16_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <4 x bfloat>
// CHECK-NEXT: ret <4 x bfloat> [[TMP0]]
//
bfloat16x4_t test_vreinterpret_bf16_s32(int32x2_t a) { return vreinterpret_bf16_s32(a); }
-// CHECK-LABEL: @test_vreinterpret_bf16_f32(
-// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <4 x bfloat>
+// CHECK-LABEL: define dso_local <4 x bfloat> @test_vreinterpret_bf16_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <4 x bfloat>
// CHECK-NEXT: ret <4 x bfloat> [[TMP0]]
//
bfloat16x4_t test_vreinterpret_bf16_f32(float32x2_t a) { return vreinterpret_bf16_f32(a); }
-// CHECK-LABEL: @test_vreinterpret_bf16_u8(
-// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A:%.*]] to <4 x bfloat>
+// CHECK-LABEL: define dso_local <4 x bfloat> @test_vreinterpret_bf16_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x bfloat>
// CHECK-NEXT: ret <4 x bfloat> [[TMP0]]
//
bfloat16x4_t test_vreinterpret_bf16_u8(uint8x8_t a) { return vreinterpret_bf16_u8(a); }
-// CHECK-LABEL: @test_vreinterpret_bf16_u16(
-// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <4 x bfloat>
+// CHECK-LABEL: define dso_local <4 x bfloat> @test_vreinterpret_bf16_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <4 x bfloat>
// CHECK-NEXT: ret <4 x bfloat> [[TMP0]]
//
bfloat16x4_t test_vreinterpret_bf16_u16(uint16x4_t a) { return vreinterpret_bf16_u16(a); }
-// CHECK-LABEL: @test_vreinterpret_bf16_u32(
-// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <4 x bfloat>
+// CHECK-LABEL: define dso_local <4 x bfloat> @test_vreinterpret_bf16_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <4 x bfloat>
// CHECK-NEXT: ret <4 x bfloat> [[TMP0]]
//
bfloat16x4_t test_vreinterpret_bf16_u32(uint32x2_t a) { return vreinterpret_bf16_u32(a); }
-// CHECK-LABEL: @test_vreinterpret_bf16_p8(
-// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A:%.*]] to <4 x bfloat>
+// CHECK-LABEL: define dso_local <4 x bfloat> @test_vreinterpret_bf16_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x bfloat>
// CHECK-NEXT: ret <4 x bfloat> [[TMP0]]
//
bfloat16x4_t test_vreinterpret_bf16_p8(poly8x8_t a) { return vreinterpret_bf16_p8(a); }
-// CHECK-LABEL: @test_vreinterpret_bf16_p16(
-// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <4 x bfloat>
+// CHECK-LABEL: define dso_local <4 x bfloat> @test_vreinterpret_bf16_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <4 x bfloat>
// CHECK-NEXT: ret <4 x bfloat> [[TMP0]]
//
bfloat16x4_t test_vreinterpret_bf16_p16(poly16x4_t a) { return vreinterpret_bf16_p16(a); }
-// CHECK-LABEL: @test_vreinterpret_bf16_u64(
-// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A:%.*]] to <4 x bfloat>
+// CHECK-LABEL: define dso_local <4 x bfloat> @test_vreinterpret_bf16_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x bfloat>
// CHECK-NEXT: ret <4 x bfloat> [[TMP0]]
//
bfloat16x4_t test_vreinterpret_bf16_u64(uint64x1_t a) { return vreinterpret_bf16_u64(a); }
-// CHECK-LABEL: @test_vreinterpret_bf16_s64(
-// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A:%.*]] to <4 x bfloat>
+// CHECK-LABEL: define dso_local <4 x bfloat> @test_vreinterpret_bf16_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x bfloat>
// CHECK-NEXT: ret <4 x bfloat> [[TMP0]]
//
bfloat16x4_t test_vreinterpret_bf16_s64(int64x1_t a) { return vreinterpret_bf16_s64(a); }
-// CHECK-LABEL: @test_vreinterpretq_bf16_s8(
-// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A:%.*]] to <8 x bfloat>
+// CHECK-LABEL: define dso_local <8 x bfloat> @test_vreinterpretq_bf16_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x bfloat>
// CHECK-NEXT: ret <8 x bfloat> [[TMP0]]
//
bfloat16x8_t test_vreinterpretq_bf16_s8(int8x16_t a) { return vreinterpretq_bf16_s8(a); }
-// CHECK-LABEL: @test_vreinterpretq_bf16_s16(
-// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <8 x bfloat>
+// CHECK-LABEL: define dso_local <8 x bfloat> @test_vreinterpretq_bf16_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <8 x bfloat>
// CHECK-NEXT: ret <8 x bfloat> [[TMP0]]
//
bfloat16x8_t test_vreinterpretq_bf16_s16(int16x8_t a) { return vreinterpretq_bf16_s16(a); }
-// CHECK-LABEL: @test_vreinterpretq_bf16_s32(
-// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x bfloat>
+// CHECK-LABEL: define dso_local <8 x bfloat> @test_vreinterpretq_bf16_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x bfloat>
// CHECK-NEXT: ret <8 x bfloat> [[TMP0]]
//
bfloat16x8_t test_vreinterpretq_bf16_s32(int32x4_t a) { return vreinterpretq_bf16_s32(a); }
-// CHECK-LABEL: @test_vreinterpretq_bf16_f32(
-// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <8 x bfloat>
+// CHECK-LABEL: define dso_local <8 x bfloat> @test_vreinterpretq_bf16_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <8 x bfloat>
// CHECK-NEXT: ret <8 x bfloat> [[TMP0]]
//
bfloat16x8_t test_vreinterpretq_bf16_f32(float32x4_t a) { return vreinterpretq_bf16_f32(a); }
-// CHECK-LABEL: @test_vreinterpretq_bf16_u8(
-// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A:%.*]] to <8 x bfloat>
+// CHECK-LABEL: define dso_local <8 x bfloat> @test_vreinterpretq_bf16_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x bfloat>
// CHECK-NEXT: ret <8 x bfloat> [[TMP0]]
//
bfloat16x8_t test_vreinterpretq_bf16_u8(uint8x16_t a) { return vreinterpretq_bf16_u8(a); }
-// CHECK-LABEL: @test_vreinterpretq_bf16_u16(
-// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <8 x bfloat>
+// CHECK-LABEL: define dso_local <8 x bfloat> @test_vreinterpretq_bf16_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <8 x bfloat>
// CHECK-NEXT: ret <8 x bfloat> [[TMP0]]
//
bfloat16x8_t test_vreinterpretq_bf16_u16(uint16x8_t a) { return vreinterpretq_bf16_u16(a); }
-// CHECK-LABEL: @test_vreinterpretq_bf16_u32(
-// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x bfloat>
+// CHECK-LABEL: define dso_local <8 x bfloat> @test_vreinterpretq_bf16_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x bfloat>
// CHECK-NEXT: ret <8 x bfloat> [[TMP0]]
//
bfloat16x8_t test_vreinterpretq_bf16_u32(uint32x4_t a) { return vreinterpretq_bf16_u32(a); }
-// CHECK-LABEL: @test_vreinterpretq_bf16_p8(
-// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A:%.*]] to <8 x bfloat>
+// CHECK-LABEL: define dso_local <8 x bfloat> @test_vreinterpretq_bf16_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x bfloat>
// CHECK-NEXT: ret <8 x bfloat> [[TMP0]]
//
bfloat16x8_t test_vreinterpretq_bf16_p8(poly8x16_t a) { return vreinterpretq_bf16_p8(a); }
-// CHECK-LABEL: @test_vreinterpretq_bf16_p16(
-// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <8 x bfloat>
+// CHECK-LABEL: define dso_local <8 x bfloat> @test_vreinterpretq_bf16_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <8 x bfloat>
// CHECK-NEXT: ret <8 x bfloat> [[TMP0]]
//
bfloat16x8_t test_vreinterpretq_bf16_p16(poly16x8_t a) { return vreinterpretq_bf16_p16(a); }
-// CHECK-LABEL: @test_vreinterpretq_bf16_u64(
-// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <8 x bfloat>
+// CHECK-LABEL: define dso_local <8 x bfloat> @test_vreinterpretq_bf16_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <8 x bfloat>
// CHECK-NEXT: ret <8 x bfloat> [[TMP0]]
//
bfloat16x8_t test_vreinterpretq_bf16_u64(uint64x2_t a) { return vreinterpretq_bf16_u64(a); }
-// CHECK-LABEL: @test_vreinterpretq_bf16_s64(
-// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <8 x bfloat>
+// CHECK-LABEL: define dso_local <8 x bfloat> @test_vreinterpretq_bf16_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <8 x bfloat>
// CHECK-NEXT: ret <8 x bfloat> [[TMP0]]
//
bfloat16x8_t test_vreinterpretq_bf16_s64(int64x2_t a) { return vreinterpretq_bf16_s64(a); }
-// CHECK-LABEL: @test_vreinterpret_bf16_p64(
-// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A:%.*]] to <4 x bfloat>
+// CHECK-LABEL: define dso_local <4 x bfloat> @test_vreinterpret_bf16_p64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x bfloat>
// CHECK-NEXT: ret <4 x bfloat> [[TMP0]]
//
bfloat16x4_t test_vreinterpret_bf16_p64(poly64x1_t a) { return vreinterpret_bf16_p64(a); }
-// CHECK-LABEL: @test_vreinterpretq_bf16_p64(
-// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <8 x bfloat>
+// CHECK-LABEL: define dso_local <8 x bfloat> @test_vreinterpretq_bf16_p64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <8 x bfloat>
// CHECK-NEXT: ret <8 x bfloat> [[TMP0]]
//
bfloat16x8_t test_vreinterpretq_bf16_p64(poly64x2_t a) { return vreinterpretq_bf16_p64(a); }
-// CHECK-LABEL: @test_vreinterpretq_bf16_p128(
-// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[A:%.*]] to <8 x bfloat>
+// CHECK-LABEL: define dso_local <8 x bfloat> @test_vreinterpretq_bf16_p128(
+// CHECK-SAME: i128 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[A]] to <8 x bfloat>
// CHECK-NEXT: ret <8 x bfloat> [[TMP0]]
//
bfloat16x8_t test_vreinterpretq_bf16_p128(poly128_t a) { return vreinterpretq_bf16_p128(a); }
-// CHECK-LABEL: @test_vreinterpret_bf16_f64(
-// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A:%.*]] to <4 x bfloat>
+// CHECK-LABEL: define dso_local <4 x bfloat> @test_vreinterpret_bf16_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to <4 x bfloat>
// CHECK-NEXT: ret <4 x bfloat> [[TMP0]]
//
bfloat16x4_t test_vreinterpret_bf16_f64(float64x1_t a) { return vreinterpret_bf16_f64(a); }
-// CHECK-LABEL: @test_vreinterpretq_bf16_f64(
-// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <8 x bfloat>
+// CHECK-LABEL: define dso_local <8 x bfloat> @test_vreinterpretq_bf16_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <8 x bfloat>
// CHECK-NEXT: ret <8 x bfloat> [[TMP0]]
//
bfloat16x8_t test_vreinterpretq_bf16_f64(float64x2_t a) { return vreinterpretq_bf16_f64(a); }
-// CHECK-LABEL: @test_vreinterpret_s8_bf16(
-// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <8 x i8>
+// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_s8_bf16(
+// CHECK-SAME: <4 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A]] to <8 x i8>
// CHECK-NEXT: ret <8 x i8> [[TMP0]]
//
int8x8_t test_vreinterpret_s8_bf16(bfloat16x4_t a) { return vreinterpret_s8_bf16(a); }
-// CHECK-LABEL: @test_vreinterpret_s16_bf16(
-// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <4 x i16>
+// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_s16_bf16(
+// CHECK-SAME: <4 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A]] to <4 x i16>
// CHECK-NEXT: ret <4 x i16> [[TMP0]]
//
int16x4_t test_vreinterpret_s16_bf16(bfloat16x4_t a) { return vreinterpret_s16_bf16(a); }
-// CHECK-LABEL: @test_vreinterpret_s32_bf16(
-// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <2 x i32>
+// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_s32_bf16(
+// CHECK-SAME: <4 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A]] to <2 x i32>
// CHECK-NEXT: ret <2 x i32> [[TMP0]]
//
int32x2_t test_vreinterpret_s32_bf16(bfloat16x4_t a) { return vreinterpret_s32_bf16(a); }
-// CHECK-LABEL: @test_vreinterpret_f32_bf16(
-// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <2 x float>
+// CHECK-LABEL: define dso_local <2 x float> @test_vreinterpret_f32_bf16(
+// CHECK-SAME: <4 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A]] to <2 x float>
// CHECK-NEXT: ret <2 x float> [[TMP0]]
//
float32x2_t test_vreinterpret_f32_bf16(bfloat16x4_t a) { return vreinterpret_f32_bf16(a); }
-// CHECK-LABEL: @test_vreinterpret_u8_bf16(
-// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <8 x i8>
+// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_u8_bf16(
+// CHECK-SAME: <4 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A]] to <8 x i8>
// CHECK-NEXT: ret <8 x i8> [[TMP0]]
//
uint8x8_t test_vreinterpret_u8_bf16(bfloat16x4_t a) { return vreinterpret_u8_bf16(a); }
-// CHECK-LABEL: @test_vreinterpret_u16_bf16(
-// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <4 x i16>
+// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_u16_bf16(
+// CHECK-SAME: <4 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A]] to <4 x i16>
// CHECK-NEXT: ret <4 x i16> [[TMP0]]
//
uint16x4_t test_vreinterpret_u16_bf16(bfloat16x4_t a) { return vreinterpret_u16_bf16(a); }
-// CHECK-LABEL: @test_vreinterpret_u32_bf16(
-// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <2 x i32>
+// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_u32_bf16(
+// CHECK-SAME: <4 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A]] to <2 x i32>
// CHECK-NEXT: ret <2 x i32> [[TMP0]]
//
uint32x2_t test_vreinterpret_u32_bf16(bfloat16x4_t a) { return vreinterpret_u32_bf16(a); }
-// CHECK-LABEL: @test_vreinterpret_p8_bf16(
-// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <8 x i8>
+// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_p8_bf16(
+// CHECK-SAME: <4 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A]] to <8 x i8>
// CHECK-NEXT: ret <8 x i8> [[TMP0]]
//
poly8x8_t test_vreinterpret_p8_bf16(bfloat16x4_t a) { return vreinterpret_p8_bf16(a); }
-// CHECK-LABEL: @test_vreinterpret_p16_bf16(
-// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <4 x i16>
+// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_p16_bf16(
+// CHECK-SAME: <4 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A]] to <4 x i16>
// CHECK-NEXT: ret <4 x i16> [[TMP0]]
//
poly16x4_t test_vreinterpret_p16_bf16(bfloat16x4_t a) { return vreinterpret_p16_bf16(a); }
-// CHECK-LABEL: @test_vreinterpret_u64_bf16(
-// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <1 x i64>
+// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_u64_bf16(
+// CHECK-SAME: <4 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A]] to <1 x i64>
// CHECK-NEXT: ret <1 x i64> [[TMP0]]
//
uint64x1_t test_vreinterpret_u64_bf16(bfloat16x4_t a) { return vreinterpret_u64_bf16(a); }
-// CHECK-LABEL: @test_vreinterpret_s64_bf16(
-// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <1 x i64>
+// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_s64_bf16(
+// CHECK-SAME: <4 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A]] to <1 x i64>
// CHECK-NEXT: ret <1 x i64> [[TMP0]]
//
int64x1_t test_vreinterpret_s64_bf16(bfloat16x4_t a) { return vreinterpret_s64_bf16(a); }
-// CHECK-LABEL: @test_vreinterpret_p64_bf16(
-// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <1 x i64>
+// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_p64_bf16(
+// CHECK-SAME: <4 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A]] to <1 x i64>
// CHECK-NEXT: ret <1 x i64> [[TMP0]]
//
poly64x1_t test_vreinterpret_p64_bf16(bfloat16x4_t a) { return vreinterpret_p64_bf16(a); }
-// CHECK-LABEL: @test_vreinterpretq_s8_bf16(
-// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_s8_bf16(
+// CHECK-SAME: <8 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A]] to <16 x i8>
// CHECK-NEXT: ret <16 x i8> [[TMP0]]
//
int8x16_t test_vreinterpretq_s8_bf16(bfloat16x8_t a) { return vreinterpretq_s8_bf16(a); }
-// CHECK-LABEL: @test_vreinterpretq_s16_bf16(
-// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <8 x i16>
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_s16_bf16(
+// CHECK-SAME: <8 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A]] to <8 x i16>
// CHECK-NEXT: ret <8 x i16> [[TMP0]]
//
int16x8_t test_vreinterpretq_s16_bf16(bfloat16x8_t a) { return vreinterpretq_s16_bf16(a); }
-// CHECK-LABEL: @test_vreinterpretq_s32_bf16(
-// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <4 x i32>
+// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_s32_bf16(
+// CHECK-SAME: <8 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP0]]
//
int32x4_t test_vreinterpretq_s32_bf16(bfloat16x8_t a) { return vreinterpretq_s32_bf16(a); }
-// CHECK-LABEL: @test_vreinterpretq_f32_bf16(
-// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <4 x float>
+// CHECK-LABEL: define dso_local <4 x float> @test_vreinterpretq_f32_bf16(
+// CHECK-SAME: <8 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A]] to <4 x float>
// CHECK-NEXT: ret <4 x float> [[TMP0]]
//
float32x4_t test_vreinterpretq_f32_bf16(bfloat16x8_t a) { return vreinterpretq_f32_bf16(a); }
-// CHECK-LABEL: @test_vreinterpretq_u8_bf16(
-// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_u8_bf16(
+// CHECK-SAME: <8 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A]] to <16 x i8>
// CHECK-NEXT: ret <16 x i8> [[TMP0]]
//
uint8x16_t test_vreinterpretq_u8_bf16(bfloat16x8_t a) { return vreinterpretq_u8_bf16(a); }
-// CHECK-LABEL: @test_vreinterpretq_u16_bf16(
-// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <8 x i16>
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_u16_bf16(
+// CHECK-SAME: <8 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A]] to <8 x i16>
// CHECK-NEXT: ret <8 x i16> [[TMP0]]
//
uint16x8_t test_vreinterpretq_u16_bf16(bfloat16x8_t a) { return vreinterpretq_u16_bf16(a); }
-// CHECK-LABEL: @test_vreinterpretq_u32_bf16(
-// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <4 x i32>
+// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_u32_bf16(
+// CHECK-SAME: <8 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP0]]
//
uint32x4_t test_vreinterpretq_u32_bf16(bfloat16x8_t a) { return vreinterpretq_u32_bf16(a); }
-// CHECK-LABEL: @test_vreinterpretq_p8_bf16(
-// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_p8_bf16(
+// CHECK-SAME: <8 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A]] to <16 x i8>
// CHECK-NEXT: ret <16 x i8> [[TMP0]]
//
poly8x16_t test_vreinterpretq_p8_bf16(bfloat16x8_t a) { return vreinterpretq_p8_bf16(a); }
-// CHECK-LABEL: @test_vreinterpretq_p16_bf16(
-// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <8 x i16>
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_p16_bf16(
+// CHECK-SAME: <8 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A]] to <8 x i16>
// CHECK-NEXT: ret <8 x i16> [[TMP0]]
//
poly16x8_t test_vreinterpretq_p16_bf16(bfloat16x8_t a) { return vreinterpretq_p16_bf16(a); }
-// CHECK-LABEL: @test_vreinterpretq_u64_bf16(
-// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <2 x i64>
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_u64_bf16(
+// CHECK-SAME: <8 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A]] to <2 x i64>
// CHECK-NEXT: ret <2 x i64> [[TMP0]]
//
uint64x2_t test_vreinterpretq_u64_bf16(bfloat16x8_t a) { return vreinterpretq_u64_bf16(a); }
-// CHECK-LABEL: @test_vreinterpretq_s64_bf16(
-// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <2 x i64>
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_s64_bf16(
+// CHECK-SAME: <8 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A]] to <2 x i64>
// CHECK-NEXT: ret <2 x i64> [[TMP0]]
//
int64x2_t test_vreinterpretq_s64_bf16(bfloat16x8_t a) { return vreinterpretq_s64_bf16(a); }
-// CHECK-LABEL: @test_vreinterpretq_p64_bf16(
-// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <2 x i64>
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_p64_bf16(
+// CHECK-SAME: <8 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A]] to <2 x i64>
// CHECK-NEXT: ret <2 x i64> [[TMP0]]
//
poly64x2_t test_vreinterpretq_p64_bf16(bfloat16x8_t a) { return vreinterpretq_p64_bf16(a); }
-// CHECK-LABEL: @test_vreinterpretq_p128_bf16(
-// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to i128
+// CHECK-LABEL: define dso_local i128 @test_vreinterpretq_p128_bf16(
+// CHECK-SAME: <8 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A]] to i128
// CHECK-NEXT: ret i128 [[TMP0]]
//
poly128_t test_vreinterpretq_p128_bf16(bfloat16x8_t a) { return vreinterpretq_p128_bf16(a); }
-// CHECK-LABEL: @test_vreinterpret_f64_bf16(
-// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <1 x double>
+// CHECK-LABEL: define dso_local <1 x double> @test_vreinterpret_f64_bf16(
+// CHECK-SAME: <4 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A]] to <1 x double>
// CHECK-NEXT: ret <1 x double> [[TMP0]]
//
float64x1_t test_vreinterpret_f64_bf16(bfloat16x4_t a) { return vreinterpret_f64_bf16(a); }
-// CHECK-LABEL: @test_vreinterpretq_f64_bf16(
-// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <2 x double>
+// CHECK-LABEL: define dso_local <2 x double> @test_vreinterpretq_f64_bf16(
+// CHECK-SAME: <8 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A]] to <2 x double>
// CHECK-NEXT: ret <2 x double> [[TMP0]]
//
float64x2_t test_vreinterpretq_f64_bf16(bfloat16x8_t a) { return vreinterpretq_f64_bf16(a); }
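Unlike the previous two files, the reinterpret hunks change no IR at all: each body is still a single bitcast between same-width types (<4 x i16> and <4 x bfloat> are both 64 bits; <8 x bfloat> and i128 are both 128). The churn comes from regenerating with UTC --version 5, which checks the full `define dso_local` line, the noundef arguments via CHECK-SAME, and the attribute group. A sketch of the unchanged body shape (the name @reinterpret_sketch is illustrative, not from the patch):

; A vector reinterpret is a no-op at the value level: one bitcast
; between two 64-bit vector types, with no lane movement.
define <4 x bfloat> @reinterpret_sketch(<4 x i16> %a) {
entry:
  %cast = bitcast <4 x i16> %a to <4 x bfloat>
  ret <4 x bfloat> %cast
}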
diff --git a/clang/test/CodeGen/AArch64/neon-2velem.c b/clang/test/CodeGen/AArch64/neon-2velem.c
index 75bdeb92fd9ca6..1d28b48f29bf7a 100644
--- a/clang/test/CodeGen/AArch64/neon-2velem.c
+++ b/clang/test/CodeGen/AArch64/neon-2velem.c
@@ -1,5 +1,5 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine | FileCheck %s
// REQUIRES: aarch64-registered-target || arm-registered-target
@@ -7,9 +7,7 @@
// CHECK-LABEL: @test_vmla_lane_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i16> [[ADD]]
@@ -20,9 +18,7 @@ int16x4_t test_vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v) {
// CHECK-LABEL: @test_vmlaq_lane_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <8 x i16> [[ADD]]
@@ -33,9 +29,7 @@ int16x8_t test_vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v) {
// CHECK-LABEL: @test_vmla_lane_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <2 x i32> [[ADD]]
@@ -46,9 +40,7 @@ int32x2_t test_vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v) {
// CHECK-LABEL: @test_vmlaq_lane_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
@@ -59,9 +51,7 @@ int32x4_t test_vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v) {
// CHECK-LABEL: @test_vmla_laneq_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i16> [[ADD]]
@@ -72,9 +62,7 @@ int16x4_t test_vmla_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v) {
// CHECK-LABEL: @test_vmlaq_laneq_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <8 x i16> [[ADD]]
@@ -85,9 +73,7 @@ int16x8_t test_vmlaq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v) {
// CHECK-LABEL: @test_vmla_laneq_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32> <i32 3, i32 3>
// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <2 x i32> [[ADD]]
@@ -98,9 +84,7 @@ int32x2_t test_vmla_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v) {
// CHECK-LABEL: @test_vmlaq_laneq_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
@@ -111,9 +95,7 @@ int32x4_t test_vmlaq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t v) {
// CHECK-LABEL: @test_vmls_lane_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i16> [[SUB]]
@@ -124,9 +106,7 @@ int16x4_t test_vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v) {
// CHECK-LABEL: @test_vmlsq_lane_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <8 x i16> [[SUB]]
@@ -137,9 +117,7 @@ int16x8_t test_vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v) {
// CHECK-LABEL: @test_vmls_lane_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <2 x i32> [[SUB]]
@@ -150,9 +128,7 @@ int32x2_t test_vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v) {
// CHECK-LABEL: @test_vmlsq_lane_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
@@ -163,9 +139,7 @@ int32x4_t test_vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v) {
// CHECK-LABEL: @test_vmls_laneq_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i16> [[SUB]]
@@ -176,9 +150,7 @@ int16x4_t test_vmls_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v) {
// CHECK-LABEL: @test_vmlsq_laneq_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <8 x i16> [[SUB]]
@@ -189,9 +161,7 @@ int16x8_t test_vmlsq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v) {
// CHECK-LABEL: @test_vmls_laneq_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32> <i32 3, i32 3>
// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <2 x i32> [[SUB]]
@@ -202,9 +172,7 @@ int32x2_t test_vmls_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v) {
// CHECK-LABEL: @test_vmlsq_laneq_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
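For readers scanning the checks rather than the IR, the whole vmls_lane/vmlsq_laneq family above reduces to one pattern: splat a single lane of v, multiply, subtract. A minimal scalar model in plain C (the helper name and loop are illustrative only, not from the patch):

#include <stdint.h>

/* res[i] = a[i] - b[i] * v[lane]: the lane splat is the single
   shufflevector-with-poison in the checks above; mul and sub follow. */
static void vmls_lane_model(const int32_t a[2], const int32_t b[2],
                            const int32_t v[2], int lane, int32_t res[2]) {
    int32_t splat = v[lane];
    for (int i = 0; i < 2; ++i)
        res[i] = a[i] - b[i] * splat;
}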
@@ -215,9 +183,7 @@ int32x4_t test_vmlsq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t v) {
// CHECK-LABEL: @test_vmul_lane_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <4 x i16> [[MUL]]
//
@@ -227,9 +193,7 @@ int16x4_t test_vmul_lane_s16(int16x4_t a, int16x4_t v) {
// CHECK-LABEL: @test_vmulq_lane_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <8 x i16> [[MUL]]
//
@@ -239,9 +203,7 @@ int16x8_t test_vmulq_lane_s16(int16x8_t a, int16x4_t v) {
// CHECK-LABEL: @test_vmul_lane_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <2 x i32> [[MUL]]
//
@@ -251,9 +213,7 @@ int32x2_t test_vmul_lane_s32(int32x2_t a, int32x2_t v) {
// CHECK-LABEL: @test_vmulq_lane_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <4 x i32> [[MUL]]
//
@@ -263,9 +223,7 @@ int32x4_t test_vmulq_lane_s32(int32x4_t a, int32x2_t v) {
// CHECK-LABEL: @test_vmul_lane_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <4 x i16> [[MUL]]
//
@@ -275,9 +233,7 @@ uint16x4_t test_vmul_lane_u16(uint16x4_t a, uint16x4_t v) {
// CHECK-LABEL: @test_vmulq_lane_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <8 x i16> [[MUL]]
//
@@ -287,9 +243,7 @@ uint16x8_t test_vmulq_lane_u16(uint16x8_t a, uint16x4_t v) {
// CHECK-LABEL: @test_vmul_lane_u32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <2 x i32> [[MUL]]
//
@@ -299,9 +253,7 @@ uint32x2_t test_vmul_lane_u32(uint32x2_t a, uint32x2_t v) {
// CHECK-LABEL: @test_vmulq_lane_u32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <4 x i32> [[MUL]]
//
@@ -311,9 +263,7 @@ uint32x4_t test_vmulq_lane_u32(uint32x4_t a, uint32x2_t v) {
// CHECK-LABEL: @test_vmul_laneq_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <4 x i16> [[MUL]]
//
@@ -323,9 +273,7 @@ int16x4_t test_vmul_laneq_s16(int16x4_t a, int16x8_t v) {
// CHECK-LABEL: @test_vmulq_laneq_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <8 x i16> [[MUL]]
//
@@ -335,9 +283,7 @@ int16x8_t test_vmulq_laneq_s16(int16x8_t a, int16x8_t v) {
// CHECK-LABEL: @test_vmul_laneq_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32> <i32 3, i32 3>
// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <2 x i32> [[MUL]]
//
@@ -347,9 +293,7 @@ int32x2_t test_vmul_laneq_s32(int32x2_t a, int32x4_t v) {
// CHECK-LABEL: @test_vmulq_laneq_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <4 x i32> [[MUL]]
//
@@ -359,9 +303,7 @@ int32x4_t test_vmulq_laneq_s32(int32x4_t a, int32x4_t v) {
// CHECK-LABEL: @test_vmul_laneq_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <4 x i16> [[MUL]]
//
@@ -371,9 +313,7 @@ uint16x4_t test_vmul_laneq_u16(uint16x4_t a, uint16x8_t v) {
// CHECK-LABEL: @test_vmulq_laneq_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <8 x i16> [[MUL]]
//
@@ -383,9 +323,7 @@ uint16x8_t test_vmulq_laneq_u16(uint16x8_t a, uint16x8_t v) {
// CHECK-LABEL: @test_vmul_laneq_u32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32> <i32 3, i32 3>
// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <2 x i32> [[MUL]]
//
@@ -395,9 +333,7 @@ uint32x2_t test_vmul_laneq_u32(uint32x2_t a, uint32x4_t v) {
// CHECK-LABEL: @test_vmulq_laneq_u32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <4 x i32> [[MUL]]
//
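The plain multiplies have the same shape: _lane picks the scalar out of a 64-bit vector, _laneq out of a 128-bit one, and after this change both lower to a single lane-splat shuffle feeding the mul, with no round-trip through <8 x i8>/<16 x i8>. A hedged two-line example using the public intrinsics (the wrapper names are mine):

#include <arm_neon.h>

int32x2_t mul_by_lane(int32x2_t a, int32x2_t v)  { return vmul_lane_s32(a, v, 1);  }
int32x2_t mul_by_laneq(int32x2_t a, int32x4_t v) { return vmul_laneq_s32(a, v, 3); }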
@@ -407,14 +343,8 @@ uint32x4_t test_vmulq_laneq_u32(uint32x4_t a, uint32x4_t v) {
// CHECK-LABEL: @test_vfma_lane_f32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> <i32 1, i32 1>
-// CHECK-NEXT: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
-// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK-NEXT: [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA1]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[V:%.*]], <2 x float> poison, <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[B:%.*]], <2 x float> [[LANE]], <2 x float> [[A:%.*]])
// CHECK-NEXT: ret <2 x float> [[FMLA2]]
//
float32x2_t test_vfma_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) {
@@ -423,14 +353,8 @@ float32x2_t test_vfma_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) {
// CHECK-LABEL: @test_vfmaq_lane_f32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-// CHECK-NEXT: [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
-// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK-NEXT: [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA1]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[V:%.*]], <2 x float> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK-NEXT: [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[B:%.*]], <4 x float> [[LANE]], <4 x float> [[A:%.*]])
// CHECK-NEXT: ret <4 x float> [[FMLA2]]
//
float32x4_t test_vfmaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) {
@@ -439,15 +363,9 @@ float32x4_t test_vfmaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) {
// CHECK-LABEL: @test_vfma_laneq_f32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <2 x i32> <i32 3, i32 3>
-// CHECK-NEXT: [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP4]], <2 x float> [[TMP3]])
-// CHECK-NEXT: ret <2 x float> [[TMP6]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[V:%.*]], <4 x float> poison, <2 x i32> <i32 3, i32 3>
+// CHECK-NEXT: [[TMP0:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[B:%.*]], <2 x float> [[A:%.*]])
+// CHECK-NEXT: ret <2 x float> [[TMP0]]
//
float32x2_t test_vfma_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) {
return vfma_laneq_f32(a, b, v, 3);
@@ -455,15 +373,9 @@ float32x2_t test_vfma_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) {
// CHECK-LABEL: @test_vfmaq_laneq_f32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP4]], <4 x float> [[TMP3]])
-// CHECK-NEXT: ret <4 x float> [[TMP6]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[V:%.*]], <4 x float> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[TMP0:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[B:%.*]], <4 x float> [[A:%.*]])
+// CHECK-NEXT: ret <4 x float> [[TMP0]]
//
float32x4_t test_vfmaq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) {
return vfmaq_laneq_f32(a, b, v, 3);
@@ -472,14 +384,8 @@ float32x4_t test_vfmaq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) {
// CHECK-LABEL: @test_vfms_lane_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[FNEG:%.*]] = fneg <2 x float> [[B:%.*]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[FNEG]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> <i32 1, i32 1>
-// CHECK-NEXT: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
-// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK-NEXT: [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA1]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[V:%.*]], <2 x float> poison, <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FNEG]], <2 x float> [[LANE]], <2 x float> [[A:%.*]])
// CHECK-NEXT: ret <2 x float> [[FMLA2]]
//
float32x2_t test_vfms_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) {
@@ -489,14 +395,8 @@ float32x2_t test_vfms_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) {
// CHECK-LABEL: @test_vfmsq_lane_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[FNEG:%.*]] = fneg <4 x float> [[B:%.*]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[FNEG]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-// CHECK-NEXT: [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
-// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK-NEXT: [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA1]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[V:%.*]], <2 x float> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK-NEXT: [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FNEG]], <4 x float> [[LANE]], <4 x float> [[A:%.*]])
// CHECK-NEXT: ret <4 x float> [[FMLA2]]
//
float32x4_t test_vfmsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) {
@@ -506,15 +406,9 @@ float32x4_t test_vfmsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) {
// CHECK-LABEL: @test_vfms_laneq_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[FNEG:%.*]] = fneg <2 x float> [[B:%.*]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[FNEG]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <2 x i32> <i32 3, i32 3>
-// CHECK-NEXT: [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP4]], <2 x float> [[TMP3]])
-// CHECK-NEXT: ret <2 x float> [[TMP6]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[V:%.*]], <4 x float> poison, <2 x i32> <i32 3, i32 3>
+// CHECK-NEXT: [[TMP0:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[FNEG]], <2 x float> [[A:%.*]])
+// CHECK-NEXT: ret <2 x float> [[TMP0]]
//
float32x2_t test_vfms_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) {
return vfms_laneq_f32(a, b, v, 3);
@@ -523,15 +417,9 @@ float32x2_t test_vfms_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) {
// CHECK-LABEL: @test_vfmsq_laneq_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[FNEG:%.*]] = fneg <4 x float> [[B:%.*]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[FNEG]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP4]], <4 x float> [[TMP3]])
-// CHECK-NEXT: ret <4 x float> [[TMP6]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[V:%.*]], <4 x float> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[TMP0:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[FNEG]], <4 x float> [[A:%.*]])
+// CHECK-NEXT: ret <4 x float> [[TMP0]]
//
float32x4_t test_vfmsq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) {
return vfmsq_laneq_f32(a, b, v, 3);
@@ -539,14 +427,8 @@ float32x4_t test_vfmsq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) {
// CHECK-LABEL: @test_vfmaq_lane_f64(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[B:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x double> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
-// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
-// CHECK-NEXT: [[FMLA2:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[FMLA]], <2 x double> [[LANE]], <2 x double> [[FMLA1]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x double> [[V:%.*]], <1 x double> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[FMLA2:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[B:%.*]], <2 x double> [[LANE]], <2 x double> [[A:%.*]])
// CHECK-NEXT: ret <2 x double> [[FMLA2]]
//
float64x2_t test_vfmaq_lane_f64(float64x2_t a, float64x2_t b, float64x1_t v) {
@@ -555,15 +437,9 @@ float64x2_t test_vfmaq_lane_f64(float64x2_t a, float64x2_t b, float64x1_t v) {
// CHECK-LABEL: @test_vfmaq_laneq_f64(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[B:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP5]], <2 x i32> <i32 1, i32 1>
-// CHECK-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP4]], <2 x double> [[TMP3]])
-// CHECK-NEXT: ret <2 x double> [[TMP6]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[V:%.*]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[TMP0:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[B:%.*]], <2 x double> [[A:%.*]])
+// CHECK-NEXT: ret <2 x double> [[TMP0]]
//
float64x2_t test_vfmaq_laneq_f64(float64x2_t a, float64x2_t b, float64x2_t v) {
return vfmaq_laneq_f64(a, b, v, 1);
@@ -572,14 +448,8 @@ float64x2_t test_vfmaq_laneq_f64(float64x2_t a, float64x2_t b, float64x2_t v) {
// CHECK-LABEL: @test_vfmsq_lane_f64(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[FNEG:%.*]] = fneg <2 x double> [[B:%.*]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[FNEG]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x double> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
-// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
-// CHECK-NEXT: [[FMLA2:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[FMLA]], <2 x double> [[LANE]], <2 x double> [[FMLA1]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x double> [[V:%.*]], <1 x double> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[FMLA2:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[FNEG]], <2 x double> [[LANE]], <2 x double> [[A:%.*]])
// CHECK-NEXT: ret <2 x double> [[FMLA2]]
//
float64x2_t test_vfmsq_lane_f64(float64x2_t a, float64x2_t b, float64x1_t v) {
@@ -589,15 +459,9 @@ float64x2_t test_vfmsq_lane_f64(float64x2_t a, float64x2_t b, float64x1_t v) {
// CHECK-LABEL: @test_vfmsq_laneq_f64(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[FNEG:%.*]] = fneg <2 x double> [[B:%.*]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[FNEG]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP5]], <2 x i32> <i32 1, i32 1>
-// CHECK-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP4]], <2 x double> [[TMP3]])
-// CHECK-NEXT: ret <2 x double> [[TMP6]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[V:%.*]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[TMP0:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[FNEG]], <2 x double> [[A:%.*]])
+// CHECK-NEXT: ret <2 x double> [[TMP0]]
//
float64x2_t test_vfmsq_laneq_f64(float64x2_t a, float64x2_t b, float64x2_t v) {
return vfmsq_laneq_f64(a, b, v, 1);
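All the vfma/vfms lane tests collapse to one llvm.fma per element: vfma computes a + b * v[lane] elementwise, and vfms is the same with b negated first (the FNEG the checks keep). A scalar sketch, with the helper name and loop as illustration only:

#include <math.h>

/* res[i] = fma(b[i], v[lane], a[i]); negate b first for the vfms forms. */
static void vfma_lane_model(const float a[2], const float b[2],
                            const float v[4], int lane, float res[2]) {
    for (int i = 0; i < 2; ++i)
        res[i] = fmaf(b[i], v[lane], a[i]);
}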
@@ -605,7 +469,7 @@ float64x2_t test_vfmsq_laneq_f64(float64x2_t a, float64x2_t b, float64x2_t v) {
// CHECK-LABEL: @test_vfmas_laneq_f32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <4 x float> [[V:%.*]], i32 3
+// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <4 x float> [[V:%.*]], i64 3
// CHECK-NEXT: [[TMP0:%.*]] = call float @llvm.fma.f32(float [[B:%.*]], float [[EXTRACT]], float [[A:%.*]])
// CHECK-NEXT: ret float [[TMP0]]
//
@@ -616,7 +480,7 @@ float32_t test_vfmas_laneq_f32(float32_t a, float32_t b, float32x4_t v) {
// CHECK-LABEL: @test_vfmsd_lane_f64(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[FNEG:%.*]] = fneg double [[B:%.*]]
-// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <1 x double> [[V:%.*]], i32 0
+// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <1 x double> [[V:%.*]], i64 0
// CHECK-NEXT: [[TMP0:%.*]] = call double @llvm.fma.f64(double [[FNEG]], double [[EXTRACT]], double [[A:%.*]])
// CHECK-NEXT: ret double [[TMP0]]
//
@@ -627,7 +491,7 @@ float64_t test_vfmsd_lane_f64(float64_t a, float64_t b, float64x1_t v) {
// CHECK-LABEL: @test_vfmss_laneq_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[FNEG:%.*]] = fneg float [[B:%.*]]
-// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <4 x float> [[V:%.*]], i32 3
+// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <4 x float> [[V:%.*]], i64 3
// CHECK-NEXT: [[TMP0:%.*]] = call float @llvm.fma.f32(float [[FNEG]], float [[EXTRACT]], float [[A:%.*]])
// CHECK-NEXT: ret float [[TMP0]]
//
@@ -638,7 +502,7 @@ float32_t test_vfmss_laneq_f32(float32_t a, float32_t b, float32x4_t v) {
// CHECK-LABEL: @test_vfmsd_laneq_f64(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[FNEG:%.*]] = fneg double [[B:%.*]]
-// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <2 x double> [[V:%.*]], i32 1
+// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <2 x double> [[V:%.*]], i64 1
// CHECK-NEXT: [[TMP0:%.*]] = call double @llvm.fma.f64(double [[FNEG]], double [[EXTRACT]], double [[A:%.*]])
// CHECK-NEXT: ret double [[TMP0]]
//
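The scalar forms are simpler still: extract one lane, then a single scalar fma; the only textual change in these hunks is that the extractelement index is now typed i64 instead of i32. Equivalent C for vfmas_laneq_f32(a, b, v, 3), as a sketch rather than the header's implementation:

#include <arm_neon.h>
#include <math.h>

float fmas_laneq_model(float a, float b, float32x4_t v) {
    return fmaf(b, vgetq_lane_f32(v, 3), a);   /* extract lane 3, one fma */
}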
@@ -648,12 +512,8 @@ float64_t test_vfmsd_laneq_f64(float64_t a, float64_t b, float64x2_t v) {
// CHECK-LABEL: @test_vmlal_lane_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B:%.*]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
//
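The widening accumulates follow suit: one lane splat, one smull/umull, one add or sub. A scalar model of vmlal_lane_s16(a, b, v, 3), with the helper name and loop illustrative only:

#include <stdint.h>

/* res[i] = a[i] + (int32_t)b[i] * (int32_t)v[lane]: the single
   @llvm.aarch64.neon.smull call plus the add in the checks. */
static void vmlal_lane_model(const int32_t a[4], const int16_t b[4],
                             const int16_t v[4], int lane, int32_t res[4]) {
    for (int i = 0; i < 4; ++i)
        res[i] = a[i] + (int32_t)b[i] * (int32_t)v[lane];
}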
@@ -663,12 +523,8 @@ int32x4_t test_vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
// CHECK-LABEL: @test_vmlal_lane_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B:%.*]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[ADD]]
//
@@ -678,12 +534,8 @@ int64x2_t test_vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
// CHECK-LABEL: @test_vmlal_laneq_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B:%.*]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
//
@@ -693,12 +545,8 @@ int32x4_t test_vmlal_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
// CHECK-LABEL: @test_vmlal_laneq_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32> <i32 3, i32 3>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B:%.*]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[ADD]]
//
@@ -708,12 +556,8 @@ int64x2_t test_vmlal_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
// CHECK-LABEL: @test_vmlal_high_lane_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
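For the _high variants the extra SHUFFLE_I is just vget_high_*: the operation runs on the upper half of b and is otherwise the plain lane form. Equivalently, as a sketch using the public intrinsics (the wrapper name is mine):

#include <arm_neon.h>

int32x4_t vmlal_high_lane_model(int32x4_t a, int16x8_t b, int16x4_t v) {
    int16x4_t hi = vget_high_s16(b);      /* SHUFFLE_I: lanes <4,5,6,7> */
    return vmlal_lane_s16(a, hi, v, 3);   /* smull + add, as above */
}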
@@ -724,12 +568,8 @@ int32x4_t test_vmlal_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
// CHECK-LABEL: @test_vmlal_high_lane_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[ADD]]
@@ -740,12 +580,8 @@ int64x2_t test_vmlal_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
// CHECK-LABEL: @test_vmlal_high_laneq_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
@@ -756,12 +592,8 @@ int32x4_t test_vmlal_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
// CHECK-LABEL: @test_vmlal_high_laneq_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32> <i32 3, i32 3>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[ADD]]
@@ -772,12 +604,8 @@ int64x2_t test_vmlal_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {
// CHECK-LABEL: @test_vmlsl_lane_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B:%.*]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
//
@@ -787,12 +615,8 @@ int32x4_t test_vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
// CHECK-LABEL: @test_vmlsl_lane_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B:%.*]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[SUB]]
//
@@ -802,12 +626,8 @@ int64x2_t test_vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
// CHECK-LABEL: @test_vmlsl_laneq_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B:%.*]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
//
@@ -817,12 +637,8 @@ int32x4_t test_vmlsl_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
// CHECK-LABEL: @test_vmlsl_laneq_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32> <i32 3, i32 3>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B:%.*]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[SUB]]
//
@@ -832,12 +648,8 @@ int64x2_t test_vmlsl_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
// CHECK-LABEL: @test_vmlsl_high_lane_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
@@ -848,12 +660,8 @@ int32x4_t test_vmlsl_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
// CHECK-LABEL: @test_vmlsl_high_lane_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[SUB]]
@@ -864,12 +672,8 @@ int64x2_t test_vmlsl_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
// CHECK-LABEL: @test_vmlsl_high_laneq_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
@@ -880,12 +684,8 @@ int32x4_t test_vmlsl_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
// CHECK-LABEL: @test_vmlsl_high_laneq_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32> <i32 3, i32 3>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[SUB]]
@@ -896,12 +696,8 @@ int64x2_t test_vmlsl_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {
// CHECK-LABEL: @test_vmlal_lane_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B:%.*]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
//
@@ -911,12 +707,8 @@ int32x4_t test_vmlal_lane_u16(int32x4_t a, int16x4_t b, int16x4_t v) {
// CHECK-LABEL: @test_vmlal_lane_u32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B:%.*]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[ADD]]
//
@@ -926,12 +718,8 @@ int64x2_t test_vmlal_lane_u32(int64x2_t a, int32x2_t b, int32x2_t v) {
// CHECK-LABEL: @test_vmlal_laneq_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B:%.*]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
//
@@ -941,12 +729,8 @@ int32x4_t test_vmlal_laneq_u16(int32x4_t a, int16x4_t b, int16x8_t v) {
// CHECK-LABEL: @test_vmlal_laneq_u32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32> <i32 3, i32 3>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B:%.*]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[ADD]]
//
@@ -956,12 +740,8 @@ int64x2_t test_vmlal_laneq_u32(int64x2_t a, int32x2_t b, int32x4_t v) {
// CHECK-LABEL: @test_vmlal_high_lane_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
@@ -972,12 +752,8 @@ int32x4_t test_vmlal_high_lane_u16(int32x4_t a, int16x8_t b, int16x4_t v) {
// CHECK-LABEL: @test_vmlal_high_lane_u32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[ADD]]
@@ -988,12 +764,8 @@ int64x2_t test_vmlal_high_lane_u32(int64x2_t a, int32x4_t b, int32x2_t v) {
// CHECK-LABEL: @test_vmlal_high_laneq_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
@@ -1004,12 +776,8 @@ int32x4_t test_vmlal_high_laneq_u16(int32x4_t a, int16x8_t b, int16x8_t v) {
// CHECK-LABEL: @test_vmlal_high_laneq_u32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32> <i32 3, i32 3>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[ADD]]
@@ -1020,12 +788,8 @@ int64x2_t test_vmlal_high_laneq_u32(int64x2_t a, int32x4_t b, int32x4_t v) {
// CHECK-LABEL: @test_vmlsl_lane_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B:%.*]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
//
@@ -1035,12 +799,8 @@ int32x4_t test_vmlsl_lane_u16(int32x4_t a, int16x4_t b, int16x4_t v) {
// CHECK-LABEL: @test_vmlsl_lane_u32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B:%.*]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[SUB]]
//
@@ -1050,12 +810,8 @@ int64x2_t test_vmlsl_lane_u32(int64x2_t a, int32x2_t b, int32x2_t v) {
// CHECK-LABEL: @test_vmlsl_laneq_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B:%.*]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
//
@@ -1065,12 +821,8 @@ int32x4_t test_vmlsl_laneq_u16(int32x4_t a, int16x4_t b, int16x8_t v) {
// CHECK-LABEL: @test_vmlsl_laneq_u32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32> <i32 3, i32 3>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B:%.*]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[SUB]]
//
@@ -1080,12 +832,8 @@ int64x2_t test_vmlsl_laneq_u32(int64x2_t a, int32x2_t b, int32x4_t v) {
// CHECK-LABEL: @test_vmlsl_high_lane_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
@@ -1096,12 +844,8 @@ int32x4_t test_vmlsl_high_lane_u16(int32x4_t a, int16x8_t b, int16x4_t v) {
// CHECK-LABEL: @test_vmlsl_high_lane_u32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[SUB]]
@@ -1112,12 +856,8 @@ int64x2_t test_vmlsl_high_lane_u32(int64x2_t a, int32x4_t b, int32x2_t v) {
// CHECK-LABEL: @test_vmlsl_high_laneq_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
@@ -1128,12 +868,8 @@ int32x4_t test_vmlsl_high_laneq_u16(int32x4_t a, int16x8_t b, int16x8_t v) {
// CHECK-LABEL: @test_vmlsl_high_laneq_u32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32> <i32 3, i32 3>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[SUB]]
@@ -1144,12 +880,8 @@ int64x2_t test_vmlsl_high_laneq_u32(int64x2_t a, int32x4_t b, int32x4_t v) {
// CHECK-LABEL: @test_vmull_lane_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A:%.*]], <4 x i16> [[LANE]])
// CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
//
int32x4_t test_vmull_lane_s16(int16x4_t a, int16x4_t v) {
@@ -1158,12 +890,8 @@ int32x4_t test_vmull_lane_s16(int16x4_t a, int16x4_t v) {
// CHECK-LABEL: @test_vmull_lane_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A:%.*]], <2 x i32> [[LANE]])
// CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
//
int64x2_t test_vmull_lane_s32(int32x2_t a, int32x2_t v) {
@@ -1172,12 +900,8 @@ int64x2_t test_vmull_lane_s32(int32x2_t a, int32x2_t v) {
// CHECK-LABEL: @test_vmull_lane_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A:%.*]], <4 x i16> [[LANE]])
// CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
//
uint32x4_t test_vmull_lane_u16(uint16x4_t a, uint16x4_t v) {
@@ -1186,12 +910,8 @@ uint32x4_t test_vmull_lane_u16(uint16x4_t a, uint16x4_t v) {
// CHECK-LABEL: @test_vmull_lane_u32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A:%.*]], <2 x i32> [[LANE]])
// CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
//
uint64x2_t test_vmull_lane_u32(uint32x2_t a, uint32x2_t v) {
@@ -1200,12 +920,8 @@ uint64x2_t test_vmull_lane_u32(uint32x2_t a, uint32x2_t v) {
// CHECK-LABEL: @test_vmull_high_lane_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
// CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
//
@@ -1215,12 +931,8 @@ int32x4_t test_vmull_high_lane_s16(int16x8_t a, int16x4_t v) {
// CHECK-LABEL: @test_vmull_high_lane_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
// CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
//
@@ -1230,12 +942,8 @@ int64x2_t test_vmull_high_lane_s32(int32x4_t a, int32x2_t v) {
// CHECK-LABEL: @test_vmull_high_lane_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
// CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
//
@@ -1245,12 +953,8 @@ uint32x4_t test_vmull_high_lane_u16(uint16x8_t a, uint16x4_t v) {
// CHECK-LABEL: @test_vmull_high_lane_u32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
// CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
//
@@ -1260,12 +964,8 @@ uint64x2_t test_vmull_high_lane_u32(uint32x4_t a, uint32x2_t v) {
// CHECK-LABEL: @test_vmull_laneq_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A:%.*]], <4 x i16> [[LANE]])
// CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
//
int32x4_t test_vmull_laneq_s16(int16x4_t a, int16x8_t v) {
@@ -1274,12 +974,8 @@ int32x4_t test_vmull_laneq_s16(int16x4_t a, int16x8_t v) {
// CHECK-LABEL: @test_vmull_laneq_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32> <i32 3, i32 3>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A:%.*]], <2 x i32> [[LANE]])
// CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
//
int64x2_t test_vmull_laneq_s32(int32x2_t a, int32x4_t v) {
@@ -1288,12 +984,8 @@ int64x2_t test_vmull_laneq_s32(int32x2_t a, int32x4_t v) {
// CHECK-LABEL: @test_vmull_laneq_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A:%.*]], <4 x i16> [[LANE]])
// CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
//
uint32x4_t test_vmull_laneq_u16(uint16x4_t a, uint16x8_t v) {
@@ -1302,12 +994,8 @@ uint32x4_t test_vmull_laneq_u16(uint16x4_t a, uint16x8_t v) {
// CHECK-LABEL: @test_vmull_laneq_u32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32> <i32 3, i32 3>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A:%.*]], <2 x i32> [[LANE]])
// CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
//
uint64x2_t test_vmull_laneq_u32(uint32x2_t a, uint32x4_t v) {
@@ -1316,12 +1004,8 @@ uint64x2_t test_vmull_laneq_u32(uint32x2_t a, uint32x4_t v) {
// CHECK-LABEL: @test_vmull_high_laneq_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
// CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
//
@@ -1331,12 +1015,8 @@ int32x4_t test_vmull_high_laneq_s16(int16x8_t a, int16x8_t v) {
// CHECK-LABEL: @test_vmull_high_laneq_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32> <i32 3, i32 3>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
// CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
//
@@ -1346,12 +1026,8 @@ int64x2_t test_vmull_high_laneq_s32(int32x4_t a, int32x4_t v) {
// CHECK-LABEL: @test_vmull_high_laneq_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
// CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
//
@@ -1361,12 +1037,8 @@ uint32x4_t test_vmull_high_laneq_u16(uint16x8_t a, uint16x8_t v) {
// CHECK-LABEL: @test_vmull_high_laneq_u32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32> <i32 3, i32 3>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
// CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
//
@@ -1376,14 +1048,9 @@ uint64x2_t test_vmull_high_laneq_u32(uint32x4_t a, uint32x4_t v) {
// CHECK-LABEL: @test_vqdmlal_lane_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
-// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B:%.*]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]]
//
int32x4_t test_vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
@@ -1392,14 +1059,9 @@ int32x4_t test_vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
// CHECK-LABEL: @test_vqdmlal_lane_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
-// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B:%.*]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A:%.*]], <2 x i64> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]]
//
int64x2_t test_vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
@@ -1408,15 +1070,10 @@ int64x2_t test_vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
// CHECK-LABEL: @test_vqdmlal_high_lane_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
-// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
+// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]]
//
int32x4_t test_vqdmlal_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
@@ -1425,15 +1082,10 @@ int32x4_t test_vqdmlal_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
// CHECK-LABEL: @test_vqdmlal_high_lane_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
-// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
+// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A:%.*]], <2 x i64> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]]
//
int64x2_t test_vqdmlal_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
@@ -1442,14 +1094,9 @@ int64x2_t test_vqdmlal_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
// CHECK-LABEL: @test_vqdmlsl_lane_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
-// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B:%.*]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]]
//
int32x4_t test_vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
@@ -1458,14 +1105,9 @@ int32x4_t test_vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
// CHECK-LABEL: @test_vqdmlsl_lane_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
-// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B:%.*]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A:%.*]], <2 x i64> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]]
//
int64x2_t test_vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
@@ -1474,15 +1116,10 @@ int64x2_t test_vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
// CHECK-LABEL: @test_vqdmlsl_high_lane_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
-// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
+// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]]
//
int32x4_t test_vqdmlsl_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
@@ -1491,15 +1128,10 @@ int32x4_t test_vqdmlsl_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
// CHECK-LABEL: @test_vqdmlsl_high_lane_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
-// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
+// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A:%.*]], <2 x i64> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]]
//
int64x2_t test_vqdmlsl_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
@@ -1508,13 +1140,8 @@ int64x2_t test_vqdmlsl_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
// CHECK-LABEL: @test_vqdmull_lane_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
-// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A:%.*]], <4 x i16> [[LANE]])
// CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]]
//
int32x4_t test_vqdmull_lane_s16(int16x4_t a, int16x4_t v) {
@@ -1523,13 +1150,8 @@ int32x4_t test_vqdmull_lane_s16(int16x4_t a, int16x4_t v) {
// CHECK-LABEL: @test_vqdmull_lane_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
-// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A:%.*]], <2 x i32> [[LANE]])
// CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]]
//
int64x2_t test_vqdmull_lane_s32(int32x2_t a, int32x2_t v) {
@@ -1538,13 +1160,8 @@ int64x2_t test_vqdmull_lane_s32(int32x2_t a, int32x2_t v) {
// CHECK-LABEL: @test_vqdmull_laneq_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
-// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A:%.*]], <4 x i16> [[LANE]])
// CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]]
//
int32x4_t test_vqdmull_laneq_s16(int16x4_t a, int16x8_t v) {
@@ -1553,13 +1170,8 @@ int32x4_t test_vqdmull_laneq_s16(int16x4_t a, int16x8_t v) {
// CHECK-LABEL: @test_vqdmull_laneq_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
-// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32> <i32 3, i32 3>
+// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A:%.*]], <2 x i32> [[LANE]])
// CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]]
//
int64x2_t test_vqdmull_laneq_s32(int32x2_t a, int32x4_t v) {
@@ -1568,14 +1180,9 @@ int64x2_t test_vqdmull_laneq_s32(int32x2_t a, int32x4_t v) {
// CHECK-LABEL: @test_vqdmull_high_lane_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
-// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]]
//
int32x4_t test_vqdmull_high_lane_s16(int16x8_t a, int16x4_t v) {
@@ -1584,14 +1191,9 @@ int32x4_t test_vqdmull_high_lane_s16(int16x8_t a, int16x4_t v) {
// CHECK-LABEL: @test_vqdmull_high_lane_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
-// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]]
//
int64x2_t test_vqdmull_high_lane_s32(int32x4_t a, int32x2_t v) {
@@ -1600,14 +1202,9 @@ int64x2_t test_vqdmull_high_lane_s32(int32x4_t a, int32x2_t v) {
// CHECK-LABEL: @test_vqdmull_high_laneq_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
-// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]]
//
int32x4_t test_vqdmull_high_laneq_s16(int16x8_t a, int16x8_t v) {
@@ -1616,14 +1213,9 @@ int32x4_t test_vqdmull_high_laneq_s16(int16x8_t a, int16x8_t v) {
// CHECK-LABEL: @test_vqdmull_high_laneq_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32> <i32 3, i32 3>
// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
-// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]]
//
int64x2_t test_vqdmull_high_laneq_s32(int32x4_t a, int32x4_t v) {
@@ -1632,11 +1224,7 @@ int64x2_t test_vqdmull_high_laneq_s32(int32x4_t a, int32x4_t v) {
// CHECK-LABEL: @test_vqdmulh_lane_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[VQDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[VQDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK-NEXT: [[VQDMULH_LANE_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.lane.v4i16.v4i16(<4 x i16> [[VQDMULH_LANE_V]], <4 x i16> [[VQDMULH_LANE_V1]], i32 3)
+// CHECK-NEXT: [[VQDMULH_LANE_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.lane.v4i16.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[V:%.*]], i32 3)
// CHECK-NEXT: ret <4 x i16> [[VQDMULH_LANE_V2]]
//
int16x4_t test_vqdmulh_lane_s16(int16x4_t a, int16x4_t v) {
@@ -1645,11 +1233,7 @@ int16x4_t test_vqdmulh_lane_s16(int16x4_t a, int16x4_t v) {
// CHECK-LABEL: @test_vqdmulhq_lane_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[VQDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[VQDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK-NEXT: [[VQDMULHQ_LANE_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.lane.v8i16.v4i16(<8 x i16> [[VQDMULHQ_LANE_V]], <4 x i16> [[VQDMULHQ_LANE_V1]], i32 3)
+// CHECK-NEXT: [[VQDMULHQ_LANE_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.lane.v8i16.v4i16(<8 x i16> [[A:%.*]], <4 x i16> [[V:%.*]], i32 3)
// CHECK-NEXT: ret <8 x i16> [[VQDMULHQ_LANE_V2]]
//
int16x8_t test_vqdmulhq_lane_s16(int16x8_t a, int16x4_t v) {
@@ -1658,11 +1242,7 @@ int16x8_t test_vqdmulhq_lane_s16(int16x8_t a, int16x4_t v) {
// CHECK-LABEL: @test_vqdmulh_lane_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[VQDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[VQDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK-NEXT: [[VQDMULH_LANE_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.lane.v2i32.v2i32(<2 x i32> [[VQDMULH_LANE_V]], <2 x i32> [[VQDMULH_LANE_V1]], i32 1)
+// CHECK-NEXT: [[VQDMULH_LANE_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.lane.v2i32.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[V:%.*]], i32 1)
// CHECK-NEXT: ret <2 x i32> [[VQDMULH_LANE_V2]]
//
int32x2_t test_vqdmulh_lane_s32(int32x2_t a, int32x2_t v) {
@@ -1671,11 +1251,7 @@ int32x2_t test_vqdmulh_lane_s32(int32x2_t a, int32x2_t v) {
// CHECK-LABEL: @test_vqdmulhq_lane_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[VQDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[VQDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK-NEXT: [[VQDMULHQ_LANE_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.lane.v4i32.v2i32(<4 x i32> [[VQDMULHQ_LANE_V]], <2 x i32> [[VQDMULHQ_LANE_V1]], i32 1)
+// CHECK-NEXT: [[VQDMULHQ_LANE_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.lane.v4i32.v2i32(<4 x i32> [[A:%.*]], <2 x i32> [[V:%.*]], i32 1)
// CHECK-NEXT: ret <4 x i32> [[VQDMULHQ_LANE_V2]]
//
int32x4_t test_vqdmulhq_lane_s32(int32x4_t a, int32x2_t v) {
@@ -1684,11 +1260,7 @@ int32x4_t test_vqdmulhq_lane_s32(int32x4_t a, int32x2_t v) {
// CHECK-LABEL: @test_vqrdmulh_lane_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[VQRDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[VQRDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK-NEXT: [[VQRDMULH_LANE_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.lane.v4i16.v4i16(<4 x i16> [[VQRDMULH_LANE_V]], <4 x i16> [[VQRDMULH_LANE_V1]], i32 3)
+// CHECK-NEXT: [[VQRDMULH_LANE_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.lane.v4i16.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[V:%.*]], i32 3)
// CHECK-NEXT: ret <4 x i16> [[VQRDMULH_LANE_V2]]
//
int16x4_t test_vqrdmulh_lane_s16(int16x4_t a, int16x4_t v) {
@@ -1697,11 +1269,7 @@ int16x4_t test_vqrdmulh_lane_s16(int16x4_t a, int16x4_t v) {
// CHECK-LABEL: @test_vqrdmulhq_lane_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[VQRDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[VQRDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK-NEXT: [[VQRDMULHQ_LANE_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.lane.v8i16.v4i16(<8 x i16> [[VQRDMULHQ_LANE_V]], <4 x i16> [[VQRDMULHQ_LANE_V1]], i32 3)
+// CHECK-NEXT: [[VQRDMULHQ_LANE_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.lane.v8i16.v4i16(<8 x i16> [[A:%.*]], <4 x i16> [[V:%.*]], i32 3)
// CHECK-NEXT: ret <8 x i16> [[VQRDMULHQ_LANE_V2]]
//
int16x8_t test_vqrdmulhq_lane_s16(int16x8_t a, int16x4_t v) {
@@ -1710,11 +1278,7 @@ int16x8_t test_vqrdmulhq_lane_s16(int16x8_t a, int16x4_t v) {
// CHECK-LABEL: @test_vqrdmulh_lane_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[VQRDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[VQRDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK-NEXT: [[VQRDMULH_LANE_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v2i32.v2i32(<2 x i32> [[VQRDMULH_LANE_V]], <2 x i32> [[VQRDMULH_LANE_V1]], i32 1)
+// CHECK-NEXT: [[VQRDMULH_LANE_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v2i32.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[V:%.*]], i32 1)
// CHECK-NEXT: ret <2 x i32> [[VQRDMULH_LANE_V2]]
//
int32x2_t test_vqrdmulh_lane_s32(int32x2_t a, int32x2_t v) {
@@ -1723,11 +1287,7 @@ int32x2_t test_vqrdmulh_lane_s32(int32x2_t a, int32x2_t v) {
// CHECK-LABEL: @test_vqrdmulhq_lane_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[VQRDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[VQRDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK-NEXT: [[VQRDMULHQ_LANE_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v4i32.v2i32(<4 x i32> [[VQRDMULHQ_LANE_V]], <2 x i32> [[VQRDMULHQ_LANE_V1]], i32 1)
+// CHECK-NEXT: [[VQRDMULHQ_LANE_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v4i32.v2i32(<4 x i32> [[A:%.*]], <2 x i32> [[V:%.*]], i32 1)
// CHECK-NEXT: ret <4 x i32> [[VQRDMULHQ_LANE_V2]]
//
int32x4_t test_vqrdmulhq_lane_s32(int32x4_t a, int32x2_t v) {
@@ -1736,9 +1296,7 @@ int32x4_t test_vqrdmulhq_lane_s32(int32x4_t a, int32x2_t v) {
// CHECK-LABEL: @test_vmul_lane_f32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[V:%.*]], <2 x float> poison, <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <2 x float> [[MUL]]
//
@@ -1749,14 +1307,11 @@ float32x2_t test_vmul_lane_f32(float32x2_t a, float32x2_t v) {
// CHECK-LABEL: @test_vmul_lane_f64(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to double
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
-// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <1 x double> [[TMP3]], i32 0
-// CHECK-NEXT: [[TMP4:%.*]] = fmul double [[TMP2]], [[EXTRACT]]
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast double [[TMP4]] to <1 x double>
-// CHECK-NEXT: ret <1 x double> [[TMP5]]
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x double> [[A:%.*]], i64 0
+// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <1 x double> [[V:%.*]], i64 0
+// CHECK-NEXT: [[TMP1:%.*]] = fmul double [[TMP0]], [[EXTRACT]]
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast double [[TMP1]] to <1 x double>
+// CHECK-NEXT: ret <1 x double> [[TMP2]]
//
float64x1_t test_vmul_lane_f64(float64x1_t a, float64x1_t v) {
return vmul_lane_f64(a, v, 0);
@@ -1764,9 +1319,7 @@ float64x1_t test_vmul_lane_f64(float64x1_t a, float64x1_t v) {
// CHECK-LABEL: @test_vmulq_lane_f32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[V:%.*]], <2 x float> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK-NEXT: [[MUL:%.*]] = fmul <4 x float> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <4 x float> [[MUL]]
//
@@ -1776,9 +1329,7 @@ float32x4_t test_vmulq_lane_f32(float32x4_t a, float32x2_t v) {
// CHECK-LABEL: @test_vmulq_lane_f64(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x double> [[TMP1]], <1 x double> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x double> [[V:%.*]], <1 x double> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = fmul <2 x double> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <2 x double> [[MUL]]
//
@@ -1788,9 +1339,7 @@ float64x2_t test_vmulq_lane_f64(float64x2_t a, float64x1_t v) {
// CHECK-LABEL: @test_vmul_laneq_f32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> <i32 3, i32 3>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[V:%.*]], <4 x float> poison, <2 x i32> <i32 3, i32 3>
// CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <2 x float> [[MUL]]
//
@@ -1800,14 +1349,11 @@ float32x2_t test_vmul_laneq_f32(float32x2_t a, float32x4_t v) {
// CHECK-LABEL: @test_vmul_laneq_f64(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to double
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
-// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
-// CHECK-NEXT: [[TMP4:%.*]] = fmul double [[TMP2]], [[EXTRACT]]
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast double [[TMP4]] to <1 x double>
-// CHECK-NEXT: ret <1 x double> [[TMP5]]
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x double> [[A:%.*]], i64 0
+// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <2 x double> [[V:%.*]], i64 1
+// CHECK-NEXT: [[TMP1:%.*]] = fmul double [[TMP0]], [[EXTRACT]]
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast double [[TMP1]] to <1 x double>
+// CHECK-NEXT: ret <1 x double> [[TMP2]]
//
float64x1_t test_vmul_laneq_f64(float64x1_t a, float64x2_t v) {
return vmul_laneq_f64(a, v, 1);
@@ -1815,9 +1361,7 @@ float64x1_t test_vmul_laneq_f64(float64x1_t a, float64x2_t v) {
// CHECK-LABEL: @test_vmulq_laneq_f32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[V:%.*]], <4 x float> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[MUL:%.*]] = fmul <4 x float> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <4 x float> [[MUL]]
//
@@ -1827,9 +1371,7 @@ float32x4_t test_vmulq_laneq_f32(float32x4_t a, float32x4_t v) {
// CHECK-LABEL: @test_vmulq_laneq_f64(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[V:%.*]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[MUL:%.*]] = fmul <2 x double> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <2 x double> [[MUL]]
//
@@ -1839,12 +1381,8 @@ float64x2_t test_vmulq_laneq_f64(float64x2_t a, float64x2_t v) {
// CHECK-LABEL: @test_vmulx_lane_f32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[V:%.*]], <2 x float> poison, <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A:%.*]], <2 x float> [[LANE]])
// CHECK-NEXT: ret <2 x float> [[VMULX2_I]]
//
float32x2_t test_vmulx_lane_f32(float32x2_t a, float32x2_t v) {
@@ -1853,12 +1391,8 @@ float32x2_t test_vmulx_lane_f32(float32x2_t a, float32x2_t v) {
// CHECK-LABEL: @test_vmulxq_lane_f32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[LANE]] to <16 x i8>
-// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[V:%.*]], <2 x float> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A:%.*]], <4 x float> [[LANE]])
// CHECK-NEXT: ret <4 x float> [[VMULX2_I]]
//
float32x4_t test_vmulxq_lane_f32(float32x4_t a, float32x2_t v) {
@@ -1867,12 +1401,8 @@ float32x4_t test_vmulxq_lane_f32(float32x4_t a, float32x2_t v) {
// CHECK-LABEL: @test_vmulxq_lane_f64(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x double> [[TMP1]], <1 x double> [[TMP1]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[LANE]] to <16 x i8>
-// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x double> [[V:%.*]], <1 x double> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A:%.*]], <2 x double> [[LANE]])
// CHECK-NEXT: ret <2 x double> [[VMULX2_I]]
//
float64x2_t test_vmulxq_lane_f64(float64x2_t a, float64x1_t v) {
@@ -1881,12 +1411,8 @@ float64x2_t test_vmulxq_lane_f64(float64x2_t a, float64x1_t v) {
// CHECK-LABEL: @test_vmulx_laneq_f32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> <i32 3, i32 3>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[V:%.*]], <4 x float> poison, <2 x i32> <i32 3, i32 3>
+// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A:%.*]], <2 x float> [[LANE]])
// CHECK-NEXT: ret <2 x float> [[VMULX2_I]]
//
float32x2_t test_vmulx_laneq_f32(float32x2_t a, float32x4_t v) {
@@ -1895,12 +1421,8 @@ float32x2_t test_vmulx_laneq_f32(float32x2_t a, float32x4_t v) {
// CHECK-LABEL: @test_vmulxq_laneq_f32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[LANE]] to <16 x i8>
-// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[V:%.*]], <4 x float> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A:%.*]], <4 x float> [[LANE]])
// CHECK-NEXT: ret <4 x float> [[VMULX2_I]]
//
float32x4_t test_vmulxq_laneq_f32(float32x4_t a, float32x4_t v) {
@@ -1909,12 +1431,8 @@ float32x4_t test_vmulxq_laneq_f32(float32x4_t a, float32x4_t v) {
// CHECK-LABEL: @test_vmulxq_laneq_f64(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[LANE]] to <16 x i8>
-// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[V:%.*]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A:%.*]], <2 x double> [[LANE]])
// CHECK-NEXT: ret <2 x double> [[VMULX2_I]]
//
float64x2_t test_vmulxq_laneq_f64(float64x2_t a, float64x2_t v) {
@@ -1923,9 +1441,7 @@ float64x2_t test_vmulxq_laneq_f64(float64x2_t a, float64x2_t v) {
// CHECK-LABEL: @test_vmla_lane_s16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i16> [[ADD]]
@@ -1936,9 +1452,7 @@ int16x4_t test_vmla_lane_s16_0(int16x4_t a, int16x4_t b, int16x4_t v) {
// CHECK-LABEL: @test_vmlaq_lane_s16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <8 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <8 x i16> [[ADD]]
@@ -1949,9 +1463,7 @@ int16x8_t test_vmlaq_lane_s16_0(int16x8_t a, int16x8_t b, int16x4_t v) {
// CHECK-LABEL: @test_vmla_lane_s32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <2 x i32> [[ADD]]
@@ -1962,9 +1474,7 @@ int32x2_t test_vmla_lane_s32_0(int32x2_t a, int32x2_t b, int32x2_t v) {
// CHECK-LABEL: @test_vmlaq_lane_s32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
@@ -1975,9 +1485,7 @@ int32x4_t test_vmlaq_lane_s32_0(int32x4_t a, int32x4_t b, int32x2_t v) {
// CHECK-LABEL: @test_vmla_laneq_s16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i16> [[ADD]]
@@ -1988,9 +1496,7 @@ int16x4_t test_vmla_laneq_s16_0(int16x4_t a, int16x4_t b, int16x8_t v) {
// CHECK-LABEL: @test_vmlaq_laneq_s16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <8 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <8 x i16> [[ADD]]
@@ -2001,9 +1507,7 @@ int16x8_t test_vmlaq_laneq_s16_0(int16x8_t a, int16x8_t b, int16x8_t v) {
// CHECK-LABEL: @test_vmla_laneq_s32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <2 x i32> [[ADD]]
@@ -2014,9 +1518,7 @@ int32x2_t test_vmla_laneq_s32_0(int32x2_t a, int32x2_t b, int32x4_t v) {
// CHECK-LABEL: @test_vmlaq_laneq_s32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
@@ -2027,9 +1529,7 @@ int32x4_t test_vmlaq_laneq_s32_0(int32x4_t a, int32x4_t b, int32x4_t v) {
// CHECK-LABEL: @test_vmls_lane_s16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i16> [[SUB]]
@@ -2040,9 +1540,7 @@ int16x4_t test_vmls_lane_s16_0(int16x4_t a, int16x4_t b, int16x4_t v) {
// CHECK-LABEL: @test_vmlsq_lane_s16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <8 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <8 x i16> [[SUB]]
@@ -2053,9 +1551,7 @@ int16x8_t test_vmlsq_lane_s16_0(int16x8_t a, int16x8_t b, int16x4_t v) {
// CHECK-LABEL: @test_vmls_lane_s32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <2 x i32> [[SUB]]
@@ -2066,9 +1562,7 @@ int32x2_t test_vmls_lane_s32_0(int32x2_t a, int32x2_t b, int32x2_t v) {
// CHECK-LABEL: @test_vmlsq_lane_s32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
@@ -2079,9 +1573,7 @@ int32x4_t test_vmlsq_lane_s32_0(int32x4_t a, int32x4_t b, int32x2_t v) {
// CHECK-LABEL: @test_vmls_laneq_s16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i16> [[SUB]]
@@ -2092,9 +1584,7 @@ int16x4_t test_vmls_laneq_s16_0(int16x4_t a, int16x4_t b, int16x8_t v) {
// CHECK-LABEL: @test_vmlsq_laneq_s16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <8 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <8 x i16> [[SUB]]
@@ -2105,9 +1595,7 @@ int16x8_t test_vmlsq_laneq_s16_0(int16x8_t a, int16x8_t b, int16x8_t v) {
// CHECK-LABEL: @test_vmls_laneq_s32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <2 x i32> [[SUB]]
@@ -2118,9 +1606,7 @@ int32x2_t test_vmls_laneq_s32_0(int32x2_t a, int32x2_t b, int32x4_t v) {
// CHECK-LABEL: @test_vmlsq_laneq_s32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
@@ -2131,9 +1617,7 @@ int32x4_t test_vmlsq_laneq_s32_0(int32x4_t a, int32x4_t b, int32x4_t v) {
// CHECK-LABEL: @test_vmul_lane_s16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <4 x i16> [[MUL]]
//
@@ -2143,9 +1627,7 @@ int16x4_t test_vmul_lane_s16_0(int16x4_t a, int16x4_t v) {
// CHECK-LABEL: @test_vmulq_lane_s16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <8 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <8 x i16> [[MUL]]
//
@@ -2155,9 +1637,7 @@ int16x8_t test_vmulq_lane_s16_0(int16x8_t a, int16x4_t v) {
// CHECK-LABEL: @test_vmul_lane_s32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <2 x i32> [[MUL]]
//
@@ -2167,9 +1647,7 @@ int32x2_t test_vmul_lane_s32_0(int32x2_t a, int32x2_t v) {
// CHECK-LABEL: @test_vmulq_lane_s32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <4 x i32> [[MUL]]
//
@@ -2179,9 +1657,7 @@ int32x4_t test_vmulq_lane_s32_0(int32x4_t a, int32x2_t v) {
// CHECK-LABEL: @test_vmul_lane_u16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <4 x i16> [[MUL]]
//
@@ -2191,9 +1667,7 @@ uint16x4_t test_vmul_lane_u16_0(uint16x4_t a, uint16x4_t v) {
// CHECK-LABEL: @test_vmulq_lane_u16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <8 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <8 x i16> [[MUL]]
//
@@ -2203,9 +1677,7 @@ uint16x8_t test_vmulq_lane_u16_0(uint16x8_t a, uint16x4_t v) {
// CHECK-LABEL: @test_vmul_lane_u32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <2 x i32> [[MUL]]
//
@@ -2215,9 +1687,7 @@ uint32x2_t test_vmul_lane_u32_0(uint32x2_t a, uint32x2_t v) {
// CHECK-LABEL: @test_vmulq_lane_u32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <4 x i32> [[MUL]]
//
@@ -2227,9 +1697,7 @@ uint32x4_t test_vmulq_lane_u32_0(uint32x4_t a, uint32x2_t v) {
// CHECK-LABEL: @test_vmul_laneq_s16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <4 x i16> [[MUL]]
//
@@ -2239,9 +1707,7 @@ int16x4_t test_vmul_laneq_s16_0(int16x4_t a, int16x8_t v) {
// CHECK-LABEL: @test_vmulq_laneq_s16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <8 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <8 x i16> [[MUL]]
//
@@ -2251,9 +1717,7 @@ int16x8_t test_vmulq_laneq_s16_0(int16x8_t a, int16x8_t v) {
// CHECK-LABEL: @test_vmul_laneq_s32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <2 x i32> [[MUL]]
//
@@ -2263,9 +1727,7 @@ int32x2_t test_vmul_laneq_s32_0(int32x2_t a, int32x4_t v) {
// CHECK-LABEL: @test_vmulq_laneq_s32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <4 x i32> [[MUL]]
//
@@ -2275,9 +1737,7 @@ int32x4_t test_vmulq_laneq_s32_0(int32x4_t a, int32x4_t v) {
// CHECK-LABEL: @test_vmul_laneq_u16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <4 x i16> [[MUL]]
//
@@ -2287,9 +1747,7 @@ uint16x4_t test_vmul_laneq_u16_0(uint16x4_t a, uint16x8_t v) {
// CHECK-LABEL: @test_vmulq_laneq_u16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <8 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <8 x i16> [[MUL]]
//
@@ -2299,9 +1757,7 @@ uint16x8_t test_vmulq_laneq_u16_0(uint16x8_t a, uint16x8_t v) {
// CHECK-LABEL: @test_vmul_laneq_u32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <2 x i32> [[MUL]]
//
@@ -2311,9 +1767,7 @@ uint32x2_t test_vmul_laneq_u32_0(uint32x2_t a, uint32x4_t v) {
// CHECK-LABEL: @test_vmulq_laneq_u32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <4 x i32> [[MUL]]
//
@@ -2323,14 +1777,8 @@ uint32x4_t test_vmulq_laneq_u32_0(uint32x4_t a, uint32x4_t v) {
// CHECK-LABEL: @test_vfma_lane_f32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
-// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK-NEXT: [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA1]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[V:%.*]], <2 x float> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[B:%.*]], <2 x float> [[LANE]], <2 x float> [[A:%.*]])
// CHECK-NEXT: ret <2 x float> [[FMLA2]]
//
float32x2_t test_vfma_lane_f32_0(float32x2_t a, float32x2_t b, float32x2_t v) {
@@ -2339,14 +1787,8 @@ float32x2_t test_vfma_lane_f32_0(float32x2_t a, float32x2_t b, float32x2_t v) {
// CHECK-LABEL: @test_vfmaq_lane_f32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
-// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK-NEXT: [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA1]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[V:%.*]], <2 x float> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[B:%.*]], <4 x float> [[LANE]], <4 x float> [[A:%.*]])
// CHECK-NEXT: ret <4 x float> [[FMLA2]]
//
float32x4_t test_vfmaq_lane_f32_0(float32x4_t a, float32x4_t b, float32x2_t v) {
@@ -2355,15 +1797,9 @@ float32x4_t test_vfmaq_lane_f32_0(float32x4_t a, float32x4_t b, float32x2_t v) {
// CHECK-LABEL: @test_vfma_laneq_f32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP4]], <2 x float> [[TMP3]])
-// CHECK-NEXT: ret <2 x float> [[TMP6]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[V:%.*]], <4 x float> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[B:%.*]], <2 x float> [[A:%.*]])
+// CHECK-NEXT: ret <2 x float> [[TMP0]]
//
float32x2_t test_vfma_laneq_f32_0(float32x2_t a, float32x2_t b, float32x4_t v) {
return vfma_laneq_f32(a, b, v, 0);
@@ -2371,15 +1807,9 @@ float32x2_t test_vfma_laneq_f32_0(float32x2_t a, float32x2_t b, float32x4_t v) {
// CHECK-LABEL: @test_vfmaq_laneq_f32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP4]], <4 x float> [[TMP3]])
-// CHECK-NEXT: ret <4 x float> [[TMP6]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[V:%.*]], <4 x float> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[B:%.*]], <4 x float> [[A:%.*]])
+// CHECK-NEXT: ret <4 x float> [[TMP0]]
//
float32x4_t test_vfmaq_laneq_f32_0(float32x4_t a, float32x4_t b, float32x4_t v) {
return vfmaq_laneq_f32(a, b, v, 0);
@@ -2388,14 +1818,8 @@ float32x4_t test_vfmaq_laneq_f32_0(float32x4_t a, float32x4_t b, float32x4_t v)
// CHECK-LABEL: @test_vfms_lane_f32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[FNEG:%.*]] = fneg <2 x float> [[B:%.*]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[FNEG]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
-// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK-NEXT: [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA1]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[V:%.*]], <2 x float> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FNEG]], <2 x float> [[LANE]], <2 x float> [[A:%.*]])
// CHECK-NEXT: ret <2 x float> [[FMLA2]]
//
float32x2_t test_vfms_lane_f32_0(float32x2_t a, float32x2_t b, float32x2_t v) {
@@ -2405,14 +1829,8 @@ float32x2_t test_vfms_lane_f32_0(float32x2_t a, float32x2_t b, float32x2_t v) {
// CHECK-LABEL: @test_vfmsq_lane_f32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[FNEG:%.*]] = fneg <4 x float> [[B:%.*]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[FNEG]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
-// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK-NEXT: [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA1]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[V:%.*]], <2 x float> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FNEG]], <4 x float> [[LANE]], <4 x float> [[A:%.*]])
// CHECK-NEXT: ret <4 x float> [[FMLA2]]
//
float32x4_t test_vfmsq_lane_f32_0(float32x4_t a, float32x4_t b, float32x2_t v) {
@@ -2422,15 +1840,9 @@ float32x4_t test_vfmsq_lane_f32_0(float32x4_t a, float32x4_t b, float32x2_t v) {
// CHECK-LABEL: @test_vfms_laneq_f32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[FNEG:%.*]] = fneg <2 x float> [[B:%.*]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[FNEG]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP4]], <2 x float> [[TMP3]])
-// CHECK-NEXT: ret <2 x float> [[TMP6]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[V:%.*]], <4 x float> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[FNEG]], <2 x float> [[A:%.*]])
+// CHECK-NEXT: ret <2 x float> [[TMP0]]
//
float32x2_t test_vfms_laneq_f32_0(float32x2_t a, float32x2_t b, float32x4_t v) {
return vfms_laneq_f32(a, b, v, 0);
@@ -2439,15 +1851,9 @@ float32x2_t test_vfms_laneq_f32_0(float32x2_t a, float32x2_t b, float32x4_t v) {
// CHECK-LABEL: @test_vfmsq_laneq_f32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[FNEG:%.*]] = fneg <4 x float> [[B:%.*]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[FNEG]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP4]], <4 x float> [[TMP3]])
-// CHECK-NEXT: ret <4 x float> [[TMP6]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[V:%.*]], <4 x float> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[FNEG]], <4 x float> [[A:%.*]])
+// CHECK-NEXT: ret <4 x float> [[TMP0]]
//
float32x4_t test_vfmsq_laneq_f32_0(float32x4_t a, float32x4_t b, float32x4_t v) {
return vfmsq_laneq_f32(a, b, v, 0);
@@ -2455,15 +1861,9 @@ float32x4_t test_vfmsq_laneq_f32_0(float32x4_t a, float32x4_t b, float32x4_t v)
// CHECK-LABEL: @test_vfmaq_laneq_f64_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[B:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP5]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP4]], <2 x double> [[TMP3]])
-// CHECK-NEXT: ret <2 x double> [[TMP6]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[V:%.*]], <2 x double> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[B:%.*]], <2 x double> [[A:%.*]])
+// CHECK-NEXT: ret <2 x double> [[TMP0]]
//
float64x2_t test_vfmaq_laneq_f64_0(float64x2_t a, float64x2_t b, float64x2_t v) {
return vfmaq_laneq_f64(a, b, v, 0);
@@ -2472,15 +1872,9 @@ float64x2_t test_vfmaq_laneq_f64_0(float64x2_t a, float64x2_t b, float64x2_t v)
// CHECK-LABEL: @test_vfmsq_laneq_f64_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[FNEG:%.*]] = fneg <2 x double> [[B:%.*]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[FNEG]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP5]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP4]], <2 x double> [[TMP3]])
-// CHECK-NEXT: ret <2 x double> [[TMP6]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[V:%.*]], <2 x double> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[FNEG]], <2 x double> [[A:%.*]])
+// CHECK-NEXT: ret <2 x double> [[TMP0]]
//
float64x2_t test_vfmsq_laneq_f64_0(float64x2_t a, float64x2_t b, float64x2_t v) {
return vfmsq_laneq_f64(a, b, v, 0);
@@ -2488,12 +1882,8 @@ float64x2_t test_vfmsq_laneq_f64_0(float64x2_t a, float64x2_t b, float64x2_t v)
// CHECK-LABEL: @test_vmlal_lane_s16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B:%.*]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
//
@@ -2503,12 +1893,8 @@ int32x4_t test_vmlal_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
// CHECK-LABEL: @test_vmlal_lane_s32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B:%.*]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[ADD]]
//
@@ -2518,12 +1904,8 @@ int64x2_t test_vmlal_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
// CHECK-LABEL: @test_vmlal_laneq_s16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B:%.*]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
//
@@ -2533,12 +1915,8 @@ int32x4_t test_vmlal_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
// CHECK-LABEL: @test_vmlal_laneq_s32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B:%.*]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[ADD]]
//
@@ -2548,12 +1926,8 @@ int64x2_t test_vmlal_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
// CHECK-LABEL: @test_vmlal_high_lane_s16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
@@ -2564,12 +1938,8 @@ int32x4_t test_vmlal_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
// CHECK-LABEL: @test_vmlal_high_lane_s32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[ADD]]
@@ -2580,12 +1950,8 @@ int64x2_t test_vmlal_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
// CHECK-LABEL: @test_vmlal_high_laneq_s16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
@@ -2596,12 +1962,8 @@ int32x4_t test_vmlal_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
// CHECK-LABEL: @test_vmlal_high_laneq_s32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[ADD]]
@@ -2612,12 +1974,8 @@ int64x2_t test_vmlal_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
// CHECK-LABEL: @test_vmlsl_lane_s16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B:%.*]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
//
@@ -2627,12 +1985,8 @@ int32x4_t test_vmlsl_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
// CHECK-LABEL: @test_vmlsl_lane_s32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B:%.*]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[SUB]]
//
@@ -2642,12 +1996,8 @@ int64x2_t test_vmlsl_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
// CHECK-LABEL: @test_vmlsl_laneq_s16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B:%.*]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
//
@@ -2657,12 +2007,8 @@ int32x4_t test_vmlsl_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
// CHECK-LABEL: @test_vmlsl_laneq_s32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B:%.*]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[SUB]]
//
@@ -2672,12 +2018,8 @@ int64x2_t test_vmlsl_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
// CHECK-LABEL: @test_vmlsl_high_lane_s16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
@@ -2688,12 +2030,8 @@ int32x4_t test_vmlsl_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
// CHECK-LABEL: @test_vmlsl_high_lane_s32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[SUB]]
@@ -2704,12 +2042,8 @@ int64x2_t test_vmlsl_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
// CHECK-LABEL: @test_vmlsl_high_laneq_s16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
@@ -2720,12 +2054,8 @@ int32x4_t test_vmlsl_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
// CHECK-LABEL: @test_vmlsl_high_laneq_s32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[SUB]]
@@ -2736,12 +2066,8 @@ int64x2_t test_vmlsl_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
// CHECK-LABEL: @test_vmlal_lane_u16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B:%.*]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
//
@@ -2751,12 +2077,8 @@ int32x4_t test_vmlal_lane_u16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
// CHECK-LABEL: @test_vmlal_lane_u32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B:%.*]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[ADD]]
//
@@ -2766,12 +2088,8 @@ int64x2_t test_vmlal_lane_u32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
// CHECK-LABEL: @test_vmlal_laneq_u16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B:%.*]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
//
@@ -2781,12 +2099,8 @@ int32x4_t test_vmlal_laneq_u16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
// CHECK-LABEL: @test_vmlal_laneq_u32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B:%.*]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[ADD]]
//
@@ -2796,12 +2110,8 @@ int64x2_t test_vmlal_laneq_u32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
// CHECK-LABEL: @test_vmlal_high_lane_u16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
@@ -2812,12 +2122,8 @@ int32x4_t test_vmlal_high_lane_u16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
// CHECK-LABEL: @test_vmlal_high_lane_u32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[ADD]]
@@ -2828,12 +2134,8 @@ int64x2_t test_vmlal_high_lane_u32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
// CHECK-LABEL: @test_vmlal_high_laneq_u16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
@@ -2844,12 +2146,8 @@ int32x4_t test_vmlal_high_laneq_u16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
// CHECK-LABEL: @test_vmlal_high_laneq_u32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[ADD]]
@@ -2860,12 +2158,8 @@ int64x2_t test_vmlal_high_laneq_u32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
// CHECK-LABEL: @test_vmlsl_lane_u16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B:%.*]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
//
@@ -2875,12 +2169,8 @@ int32x4_t test_vmlsl_lane_u16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
// CHECK-LABEL: @test_vmlsl_lane_u32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B:%.*]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[SUB]]
//
@@ -2890,12 +2180,8 @@ int64x2_t test_vmlsl_lane_u32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
// CHECK-LABEL: @test_vmlsl_laneq_u16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B:%.*]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
//
@@ -2905,12 +2191,8 @@ int32x4_t test_vmlsl_laneq_u16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
// CHECK-LABEL: @test_vmlsl_laneq_u32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B:%.*]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[SUB]]
//
@@ -2920,12 +2202,8 @@ int64x2_t test_vmlsl_laneq_u32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
// CHECK-LABEL: @test_vmlsl_high_lane_u16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
@@ -2936,12 +2214,8 @@ int32x4_t test_vmlsl_high_lane_u16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
// CHECK-LABEL: @test_vmlsl_high_lane_u32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[SUB]]
@@ -2952,12 +2226,8 @@ int64x2_t test_vmlsl_high_lane_u32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
// CHECK-LABEL: @test_vmlsl_high_laneq_u16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
@@ -2968,12 +2238,8 @@ int32x4_t test_vmlsl_high_laneq_u16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
// CHECK-LABEL: @test_vmlsl_high_laneq_u32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[SUB]]
@@ -2984,12 +2250,8 @@ int64x2_t test_vmlsl_high_laneq_u32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
// CHECK-LABEL: @test_vmull_lane_s16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A:%.*]], <4 x i16> [[LANE]])
// CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
//
int32x4_t test_vmull_lane_s16_0(int16x4_t a, int16x4_t v) {
@@ -2998,12 +2260,8 @@ int32x4_t test_vmull_lane_s16_0(int16x4_t a, int16x4_t v) {
// CHECK-LABEL: @test_vmull_lane_s32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A:%.*]], <2 x i32> [[LANE]])
// CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
//
int64x2_t test_vmull_lane_s32_0(int32x2_t a, int32x2_t v) {
@@ -3012,12 +2270,8 @@ int64x2_t test_vmull_lane_s32_0(int32x2_t a, int32x2_t v) {
// CHECK-LABEL: @test_vmull_lane_u16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A:%.*]], <4 x i16> [[LANE]])
// CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
//
uint32x4_t test_vmull_lane_u16_0(uint16x4_t a, uint16x4_t v) {
@@ -3026,12 +2280,8 @@ uint32x4_t test_vmull_lane_u16_0(uint16x4_t a, uint16x4_t v) {
// CHECK-LABEL: @test_vmull_lane_u32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A:%.*]], <2 x i32> [[LANE]])
// CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
//
uint64x2_t test_vmull_lane_u32_0(uint32x2_t a, uint32x2_t v) {
@@ -3040,12 +2290,8 @@ uint64x2_t test_vmull_lane_u32_0(uint32x2_t a, uint32x2_t v) {
// CHECK-LABEL: @test_vmull_high_lane_s16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
// CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
//
@@ -3055,12 +2301,8 @@ int32x4_t test_vmull_high_lane_s16_0(int16x8_t a, int16x4_t v) {
// CHECK-LABEL: @test_vmull_high_lane_s32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
// CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
//
@@ -3070,12 +2312,8 @@ int64x2_t test_vmull_high_lane_s32_0(int32x4_t a, int32x2_t v) {
// CHECK-LABEL: @test_vmull_high_lane_u16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
// CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
//
@@ -3085,12 +2323,8 @@ uint32x4_t test_vmull_high_lane_u16_0(uint16x8_t a, uint16x4_t v) {
// CHECK-LABEL: @test_vmull_high_lane_u32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
// CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
//
@@ -3100,12 +2334,8 @@ uint64x2_t test_vmull_high_lane_u32_0(uint32x4_t a, uint32x2_t v) {
// CHECK-LABEL: @test_vmull_laneq_s16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A:%.*]], <4 x i16> [[LANE]])
// CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
//
int32x4_t test_vmull_laneq_s16_0(int16x4_t a, int16x8_t v) {
@@ -3114,12 +2344,8 @@ int32x4_t test_vmull_laneq_s16_0(int16x4_t a, int16x8_t v) {
// CHECK-LABEL: @test_vmull_laneq_s32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A:%.*]], <2 x i32> [[LANE]])
// CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
//
int64x2_t test_vmull_laneq_s32_0(int32x2_t a, int32x4_t v) {
@@ -3128,12 +2354,8 @@ int64x2_t test_vmull_laneq_s32_0(int32x2_t a, int32x4_t v) {
// CHECK-LABEL: @test_vmull_laneq_u16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A:%.*]], <4 x i16> [[LANE]])
// CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
//
uint32x4_t test_vmull_laneq_u16_0(uint16x4_t a, uint16x8_t v) {
@@ -3142,12 +2364,8 @@ uint32x4_t test_vmull_laneq_u16_0(uint16x4_t a, uint16x8_t v) {
// CHECK-LABEL: @test_vmull_laneq_u32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A:%.*]], <2 x i32> [[LANE]])
// CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
//
uint64x2_t test_vmull_laneq_u32_0(uint32x2_t a, uint32x4_t v) {
@@ -3156,12 +2374,8 @@ uint64x2_t test_vmull_laneq_u32_0(uint32x2_t a, uint32x4_t v) {
// CHECK-LABEL: @test_vmull_high_laneq_s16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
// CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
//
@@ -3171,12 +2385,8 @@ int32x4_t test_vmull_high_laneq_s16_0(int16x8_t a, int16x8_t v) {
// CHECK-LABEL: @test_vmull_high_laneq_s32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
// CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
//
@@ -3186,12 +2396,8 @@ int64x2_t test_vmull_high_laneq_s32_0(int32x4_t a, int32x4_t v) {
// CHECK-LABEL: @test_vmull_high_laneq_u16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
// CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
//
@@ -3201,12 +2407,8 @@ uint32x4_t test_vmull_high_laneq_u16_0(uint16x8_t a, uint16x8_t v) {
// CHECK-LABEL: @test_vmull_high_laneq_u32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
// CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
//
@@ -3216,14 +2418,9 @@ uint64x2_t test_vmull_high_laneq_u32_0(uint32x4_t a, uint32x4_t v) {
// CHECK-LABEL: @test_vqdmlal_lane_s16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
-// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B:%.*]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]]
//
int32x4_t test_vqdmlal_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
@@ -3232,14 +2429,9 @@ int32x4_t test_vqdmlal_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
// CHECK-LABEL: @test_vqdmlal_lane_s32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
-// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B:%.*]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A:%.*]], <2 x i64> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]]
//
int64x2_t test_vqdmlal_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
@@ -3248,15 +2440,10 @@ int64x2_t test_vqdmlal_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
// CHECK-LABEL: @test_vqdmlal_high_lane_s16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
-// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
+// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]]
//
int32x4_t test_vqdmlal_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
@@ -3265,15 +2452,10 @@ int32x4_t test_vqdmlal_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
// CHECK-LABEL: @test_vqdmlal_high_lane_s32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
-// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
+// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A:%.*]], <2 x i64> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]]
//
int64x2_t test_vqdmlal_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
@@ -3282,14 +2464,9 @@ int64x2_t test_vqdmlal_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
// CHECK-LABEL: @test_vqdmlsl_lane_s16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
-// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B:%.*]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]]
//
int32x4_t test_vqdmlsl_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
@@ -3298,14 +2475,9 @@ int32x4_t test_vqdmlsl_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
// CHECK-LABEL: @test_vqdmlsl_lane_s32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
-// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B:%.*]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A:%.*]], <2 x i64> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]]
//
int64x2_t test_vqdmlsl_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
@@ -3314,15 +2486,10 @@ int64x2_t test_vqdmlsl_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
// CHECK-LABEL: @test_vqdmlsl_high_lane_s16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
-// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
+// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]]
//
int32x4_t test_vqdmlsl_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
@@ -3331,15 +2498,10 @@ int32x4_t test_vqdmlsl_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
// CHECK-LABEL: @test_vqdmlsl_high_lane_s32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
-// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
+// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A:%.*]], <2 x i64> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]]
//
int64x2_t test_vqdmlsl_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
@@ -3348,13 +2510,8 @@ int64x2_t test_vqdmlsl_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
// CHECK-LABEL: @test_vqdmull_lane_s16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
-// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A:%.*]], <4 x i16> [[LANE]])
// CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]]
//
int32x4_t test_vqdmull_lane_s16_0(int16x4_t a, int16x4_t v) {
@@ -3363,13 +2520,8 @@ int32x4_t test_vqdmull_lane_s16_0(int16x4_t a, int16x4_t v) {
// CHECK-LABEL: @test_vqdmull_lane_s32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
-// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A:%.*]], <2 x i32> [[LANE]])
// CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]]
//
int64x2_t test_vqdmull_lane_s32_0(int32x2_t a, int32x2_t v) {
@@ -3378,13 +2530,8 @@ int64x2_t test_vqdmull_lane_s32_0(int32x2_t a, int32x2_t v) {
// CHECK-LABEL: @test_vqdmull_laneq_s16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
-// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A:%.*]], <4 x i16> [[LANE]])
// CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]]
//
int32x4_t test_vqdmull_laneq_s16_0(int16x4_t a, int16x8_t v) {
@@ -3393,13 +2540,8 @@ int32x4_t test_vqdmull_laneq_s16_0(int16x4_t a, int16x8_t v) {
// CHECK-LABEL: @test_vqdmull_laneq_s32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
-// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A:%.*]], <2 x i32> [[LANE]])
// CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]]
//
int64x2_t test_vqdmull_laneq_s32_0(int32x2_t a, int32x4_t v) {
@@ -3408,14 +2550,9 @@ int64x2_t test_vqdmull_laneq_s32_0(int32x2_t a, int32x4_t v) {
// CHECK-LABEL: @test_vqdmull_high_lane_s16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
-// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]]
//
int32x4_t test_vqdmull_high_lane_s16_0(int16x8_t a, int16x4_t v) {
@@ -3424,14 +2561,9 @@ int32x4_t test_vqdmull_high_lane_s16_0(int16x8_t a, int16x4_t v) {
// CHECK-LABEL: @test_vqdmull_high_lane_s32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
-// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]]
//
int64x2_t test_vqdmull_high_lane_s32_0(int32x4_t a, int32x2_t v) {
@@ -3440,14 +2572,9 @@ int64x2_t test_vqdmull_high_lane_s32_0(int32x4_t a, int32x2_t v) {
// CHECK-LABEL: @test_vqdmull_high_laneq_s16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
-// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]]
//
int32x4_t test_vqdmull_high_laneq_s16_0(int16x8_t a, int16x8_t v) {
@@ -3456,14 +2583,9 @@ int32x4_t test_vqdmull_high_laneq_s16_0(int16x8_t a, int16x8_t v) {
// CHECK-LABEL: @test_vqdmull_high_laneq_s32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
-// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]]
//
int64x2_t test_vqdmull_high_laneq_s32_0(int32x4_t a, int32x4_t v) {
@@ -3472,11 +2594,7 @@ int64x2_t test_vqdmull_high_laneq_s32_0(int32x4_t a, int32x4_t v) {
// CHECK-LABEL: @test_vqdmulh_lane_s16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[VQDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[VQDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK-NEXT: [[VQDMULH_LANE_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.lane.v4i16.v4i16(<4 x i16> [[VQDMULH_LANE_V]], <4 x i16> [[VQDMULH_LANE_V1]], i32 0)
+// CHECK-NEXT: [[VQDMULH_LANE_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.lane.v4i16.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[V:%.*]], i32 0)
// CHECK-NEXT: ret <4 x i16> [[VQDMULH_LANE_V2]]
//
int16x4_t test_vqdmulh_lane_s16_0(int16x4_t a, int16x4_t v) {
@@ -3485,11 +2603,7 @@ int16x4_t test_vqdmulh_lane_s16_0(int16x4_t a, int16x4_t v) {
// CHECK-LABEL: @test_vqdmulhq_lane_s16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[VQDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[VQDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK-NEXT: [[VQDMULHQ_LANE_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.lane.v8i16.v4i16(<8 x i16> [[VQDMULHQ_LANE_V]], <4 x i16> [[VQDMULHQ_LANE_V1]], i32 0)
+// CHECK-NEXT: [[VQDMULHQ_LANE_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.lane.v8i16.v4i16(<8 x i16> [[A:%.*]], <4 x i16> [[V:%.*]], i32 0)
// CHECK-NEXT: ret <8 x i16> [[VQDMULHQ_LANE_V2]]
//
int16x8_t test_vqdmulhq_lane_s16_0(int16x8_t a, int16x4_t v) {
@@ -3498,11 +2612,7 @@ int16x8_t test_vqdmulhq_lane_s16_0(int16x8_t a, int16x4_t v) {
// CHECK-LABEL: @test_vqdmulh_lane_s32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[VQDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[VQDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK-NEXT: [[VQDMULH_LANE_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.lane.v2i32.v2i32(<2 x i32> [[VQDMULH_LANE_V]], <2 x i32> [[VQDMULH_LANE_V1]], i32 0)
+// CHECK-NEXT: [[VQDMULH_LANE_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.lane.v2i32.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[V:%.*]], i32 0)
// CHECK-NEXT: ret <2 x i32> [[VQDMULH_LANE_V2]]
//
int32x2_t test_vqdmulh_lane_s32_0(int32x2_t a, int32x2_t v) {
@@ -3511,11 +2621,7 @@ int32x2_t test_vqdmulh_lane_s32_0(int32x2_t a, int32x2_t v) {
// CHECK-LABEL: @test_vqdmulhq_lane_s32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[VQDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[VQDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK-NEXT: [[VQDMULHQ_LANE_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.lane.v4i32.v2i32(<4 x i32> [[VQDMULHQ_LANE_V]], <2 x i32> [[VQDMULHQ_LANE_V1]], i32 0)
+// CHECK-NEXT: [[VQDMULHQ_LANE_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.lane.v4i32.v2i32(<4 x i32> [[A:%.*]], <2 x i32> [[V:%.*]], i32 0)
// CHECK-NEXT: ret <4 x i32> [[VQDMULHQ_LANE_V2]]
//
int32x4_t test_vqdmulhq_lane_s32_0(int32x4_t a, int32x2_t v) {
@@ -3524,11 +2630,7 @@ int32x4_t test_vqdmulhq_lane_s32_0(int32x4_t a, int32x2_t v) {
// CHECK-LABEL: @test_vqrdmulh_lane_s16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[VQRDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[VQRDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK-NEXT: [[VQRDMULH_LANE_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.lane.v4i16.v4i16(<4 x i16> [[VQRDMULH_LANE_V]], <4 x i16> [[VQRDMULH_LANE_V1]], i32 0)
+// CHECK-NEXT: [[VQRDMULH_LANE_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.lane.v4i16.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[V:%.*]], i32 0)
// CHECK-NEXT: ret <4 x i16> [[VQRDMULH_LANE_V2]]
//
int16x4_t test_vqrdmulh_lane_s16_0(int16x4_t a, int16x4_t v) {
@@ -3537,11 +2639,7 @@ int16x4_t test_vqrdmulh_lane_s16_0(int16x4_t a, int16x4_t v) {
// CHECK-LABEL: @test_vqrdmulhq_lane_s16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[VQRDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[VQRDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK-NEXT: [[VQRDMULHQ_LANE_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.lane.v8i16.v4i16(<8 x i16> [[VQRDMULHQ_LANE_V]], <4 x i16> [[VQRDMULHQ_LANE_V1]], i32 0)
+// CHECK-NEXT: [[VQRDMULHQ_LANE_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.lane.v8i16.v4i16(<8 x i16> [[A:%.*]], <4 x i16> [[V:%.*]], i32 0)
// CHECK-NEXT: ret <8 x i16> [[VQRDMULHQ_LANE_V2]]
//
int16x8_t test_vqrdmulhq_lane_s16_0(int16x8_t a, int16x4_t v) {
@@ -3550,11 +2648,7 @@ int16x8_t test_vqrdmulhq_lane_s16_0(int16x8_t a, int16x4_t v) {
// CHECK-LABEL: @test_vqrdmulh_lane_s32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[VQRDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[VQRDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK-NEXT: [[VQRDMULH_LANE_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v2i32.v2i32(<2 x i32> [[VQRDMULH_LANE_V]], <2 x i32> [[VQRDMULH_LANE_V1]], i32 0)
+// CHECK-NEXT: [[VQRDMULH_LANE_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v2i32.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[V:%.*]], i32 0)
// CHECK-NEXT: ret <2 x i32> [[VQRDMULH_LANE_V2]]
//
int32x2_t test_vqrdmulh_lane_s32_0(int32x2_t a, int32x2_t v) {
@@ -3563,11 +2657,7 @@ int32x2_t test_vqrdmulh_lane_s32_0(int32x2_t a, int32x2_t v) {
// CHECK-LABEL: @test_vqrdmulhq_lane_s32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[VQRDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[VQRDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK-NEXT: [[VQRDMULHQ_LANE_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v4i32.v2i32(<4 x i32> [[VQRDMULHQ_LANE_V]], <2 x i32> [[VQRDMULHQ_LANE_V1]], i32 0)
+// CHECK-NEXT: [[VQRDMULHQ_LANE_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v4i32.v2i32(<4 x i32> [[A:%.*]], <2 x i32> [[V:%.*]], i32 0)
// CHECK-NEXT: ret <4 x i32> [[VQRDMULHQ_LANE_V2]]
//
int32x4_t test_vqrdmulhq_lane_s32_0(int32x4_t a, int32x2_t v) {
@@ -3576,9 +2666,7 @@ int32x4_t test_vqrdmulhq_lane_s32_0(int32x4_t a, int32x2_t v) {
// CHECK-LABEL: @test_vmul_lane_f32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[V:%.*]], <2 x float> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <2 x float> [[MUL]]
//
@@ -3588,9 +2676,7 @@ float32x2_t test_vmul_lane_f32_0(float32x2_t a, float32x2_t v) {
// CHECK-LABEL: @test_vmulq_lane_f32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[V:%.*]], <2 x float> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = fmul <4 x float> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <4 x float> [[MUL]]
//
@@ -3600,9 +2686,7 @@ float32x4_t test_vmulq_lane_f32_0(float32x4_t a, float32x2_t v) {
// CHECK-LABEL: @test_vmul_laneq_f32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[V:%.*]], <4 x float> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <2 x float> [[MUL]]
//
@@ -3612,14 +2696,11 @@ float32x2_t test_vmul_laneq_f32_0(float32x2_t a, float32x4_t v) {
// CHECK-LABEL: @test_vmul_laneq_f64_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to double
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
-// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
-// CHECK-NEXT: [[TMP4:%.*]] = fmul double [[TMP2]], [[EXTRACT]]
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast double [[TMP4]] to <1 x double>
-// CHECK-NEXT: ret <1 x double> [[TMP5]]
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x double> [[A:%.*]], i64 0
+// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <2 x double> [[V:%.*]], i64 0
+// CHECK-NEXT: [[TMP1:%.*]] = fmul double [[TMP0]], [[EXTRACT]]
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast double [[TMP1]] to <1 x double>
+// CHECK-NEXT: ret <1 x double> [[TMP2]]
//
float64x1_t test_vmul_laneq_f64_0(float64x1_t a, float64x2_t v) {
return vmul_laneq_f64(a, v, 0);
@@ -3627,9 +2708,7 @@ float64x1_t test_vmul_laneq_f64_0(float64x1_t a, float64x2_t v) {
// CHECK-LABEL: @test_vmulq_laneq_f32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[V:%.*]], <4 x float> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = fmul <4 x float> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <4 x float> [[MUL]]
//
@@ -3639,9 +2718,7 @@ float32x4_t test_vmulq_laneq_f32_0(float32x4_t a, float32x4_t v) {
// CHECK-LABEL: @test_vmulq_laneq_f64_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[V:%.*]], <2 x double> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = fmul <2 x double> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <2 x double> [[MUL]]
//
@@ -3651,12 +2728,8 @@ float64x2_t test_vmulq_laneq_f64_0(float64x2_t a, float64x2_t v) {
// CHECK-LABEL: @test_vmulx_lane_f32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[V:%.*]], <2 x float> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A:%.*]], <2 x float> [[LANE]])
// CHECK-NEXT: ret <2 x float> [[VMULX2_I]]
//
float32x2_t test_vmulx_lane_f32_0(float32x2_t a, float32x2_t v) {
@@ -3665,12 +2738,8 @@ float32x2_t test_vmulx_lane_f32_0(float32x2_t a, float32x2_t v) {
// CHECK-LABEL: @test_vmulxq_lane_f32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[LANE]] to <16 x i8>
-// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[V:%.*]], <2 x float> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A:%.*]], <4 x float> [[LANE]])
// CHECK-NEXT: ret <4 x float> [[VMULX2_I]]
//
float32x4_t test_vmulxq_lane_f32_0(float32x4_t a, float32x2_t v) {
@@ -3679,12 +2748,8 @@ float32x4_t test_vmulxq_lane_f32_0(float32x4_t a, float32x2_t v) {
// CHECK-LABEL: @test_vmulxq_lane_f64_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x double> [[TMP1]], <1 x double> [[TMP1]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[LANE]] to <16 x i8>
-// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x double> [[V:%.*]], <1 x double> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A:%.*]], <2 x double> [[LANE]])
// CHECK-NEXT: ret <2 x double> [[VMULX2_I]]
//
float64x2_t test_vmulxq_lane_f64_0(float64x2_t a, float64x1_t v) {
@@ -3693,12 +2758,8 @@ float64x2_t test_vmulxq_lane_f64_0(float64x2_t a, float64x1_t v) {
// CHECK-LABEL: @test_vmulx_laneq_f32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[V:%.*]], <4 x float> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A:%.*]], <2 x float> [[LANE]])
// CHECK-NEXT: ret <2 x float> [[VMULX2_I]]
//
float32x2_t test_vmulx_laneq_f32_0(float32x2_t a, float32x4_t v) {
@@ -3707,12 +2768,8 @@ float32x2_t test_vmulx_laneq_f32_0(float32x2_t a, float32x4_t v) {
// CHECK-LABEL: @test_vmulxq_laneq_f32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[LANE]] to <16 x i8>
-// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[V:%.*]], <4 x float> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A:%.*]], <4 x float> [[LANE]])
// CHECK-NEXT: ret <4 x float> [[VMULX2_I]]
//
float32x4_t test_vmulxq_laneq_f32_0(float32x4_t a, float32x4_t v) {
@@ -3721,12 +2778,8 @@ float32x4_t test_vmulxq_laneq_f32_0(float32x4_t a, float32x4_t v) {
// CHECK-LABEL: @test_vmulxq_laneq_f64_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP1]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[LANE]] to <16 x i8>
-// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[V:%.*]], <2 x double> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A:%.*]], <2 x double> [[LANE]])
// CHECK-NEXT: ret <2 x double> [[VMULX2_I]]
//
float64x2_t test_vmulxq_laneq_f64_0(float64x2_t a, float64x2_t v) {
@@ -3735,13 +2788,9 @@ float64x2_t test_vmulxq_laneq_f64_0(float64x2_t a, float64x2_t v) {
// CHECK-LABEL: @test_vmull_high_n_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B:%.*]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B:%.*]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]])
// CHECK-NEXT: ret <4 x i32> [[VMULL2_I_I]]
//
@@ -3751,11 +2800,9 @@ int32x4_t test_vmull_high_n_s16(int16x8_t a, int16_t b) {
// CHECK-LABEL: @test_vmull_high_n_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B:%.*]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B:%.*]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]])
// CHECK-NEXT: ret <2 x i64> [[VMULL2_I_I]]
//
@@ -3765,13 +2812,9 @@ int64x2_t test_vmull_high_n_s32(int32x4_t a, int32_t b) {
// CHECK-LABEL: @test_vmull_high_n_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B:%.*]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B:%.*]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]])
// CHECK-NEXT: ret <4 x i32> [[VMULL2_I_I]]
//
@@ -3781,11 +2824,9 @@ uint32x4_t test_vmull_high_n_u16(uint16x8_t a, uint16_t b) {
// CHECK-LABEL: @test_vmull_high_n_u32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B:%.*]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B:%.*]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]])
// CHECK-NEXT: ret <2 x i64> [[VMULL2_I_I]]
//
@@ -3795,15 +2836,10 @@ uint64x2_t test_vmull_high_n_u32(uint32x4_t a, uint32_t b) {
// CHECK-LABEL: @test_vqdmull_high_n_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B:%.*]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B:%.*]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[VQDMULL_V2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]])
-// CHECK-NEXT: [[VQDMULL_V3_I_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I_I]]
//
int32x4_t test_vqdmull_high_n_s16(int16x8_t a, int16_t b) {
@@ -3812,13 +2848,10 @@ int32x4_t test_vqdmull_high_n_s16(int16x8_t a, int16_t b) {
// CHECK-LABEL: @test_vqdmull_high_n_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B:%.*]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B:%.*]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[VQDMULL_V2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]])
-// CHECK-NEXT: [[VQDMULL_V3_I_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I_I]] to <16 x i8>
// CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I_I]]
//
int64x2_t test_vqdmull_high_n_s32(int32x4_t a, int32_t b) {
@@ -3827,13 +2860,9 @@ int64x2_t test_vqdmull_high_n_s32(int32x4_t a, int32_t b) {
// CHECK-LABEL: @test_vmlal_high_n_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]])
// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
@@ -3844,11 +2873,9 @@ int32x4_t test_vmlal_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {
// CHECK-LABEL: @test_vmlal_high_n_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]])
// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I_I]]
// CHECK-NEXT: ret <2 x i64> [[ADD_I]]
@@ -3859,13 +2886,9 @@ int64x2_t test_vmlal_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) {
// CHECK-LABEL: @test_vmlal_high_n_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]])
// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
@@ -3876,11 +2899,9 @@ uint32x4_t test_vmlal_high_n_u16(uint32x4_t a, uint16x8_t b, uint16_t c) {
// CHECK-LABEL: @test_vmlal_high_n_u32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]])
// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I_I]]
// CHECK-NEXT: ret <2 x i64> [[ADD_I]]
@@ -3891,16 +2912,11 @@ uint64x2_t test_vmlal_high_n_u32(uint64x2_t a, uint32x4_t b, uint32_t c) {
// CHECK-LABEL: @test_vqdmlal_high_n_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]])
-// CHECK-NEXT: [[VQDMLAL_V3_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I_I]])
+// CHECK-NEXT: [[VQDMLAL_V3_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[VQDMLAL2_I_I]])
// CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I_I]]
//
int32x4_t test_vqdmlal_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {
@@ -3909,14 +2925,11 @@ int32x4_t test_vqdmlal_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {
// CHECK-LABEL: @test_vqdmlal_high_n_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]])
-// CHECK-NEXT: [[VQDMLAL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I_I]])
+// CHECK-NEXT: [[VQDMLAL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A:%.*]], <2 x i64> [[VQDMLAL2_I_I]])
// CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I_I]]
//
int64x2_t test_vqdmlal_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) {
@@ -3925,13 +2938,9 @@ int64x2_t test_vqdmlal_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) {
// CHECK-LABEL: @test_vmlsl_high_n_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]])
// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
@@ -3942,11 +2951,9 @@ int32x4_t test_vmlsl_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {
// CHECK-LABEL: @test_vmlsl_high_n_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]])
// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I_I]]
// CHECK-NEXT: ret <2 x i64> [[SUB_I]]
@@ -3957,13 +2964,9 @@ int64x2_t test_vmlsl_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) {
// CHECK-LABEL: @test_vmlsl_high_n_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]])
// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
@@ -3974,11 +2977,9 @@ uint32x4_t test_vmlsl_high_n_u16(uint32x4_t a, uint16x8_t b, uint16_t c) {
// CHECK-LABEL: @test_vmlsl_high_n_u32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]])
// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I_I]]
// CHECK-NEXT: ret <2 x i64> [[SUB_I]]
@@ -3989,16 +2990,11 @@ uint64x2_t test_vmlsl_high_n_u32(uint64x2_t a, uint32x4_t b, uint32_t c) {
// CHECK-LABEL: @test_vqdmlsl_high_n_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]])
-// CHECK-NEXT: [[VQDMLSL_V3_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I_I]])
+// CHECK-NEXT: [[VQDMLSL_V3_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[VQDMLAL2_I_I]])
// CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I_I]]
//
int32x4_t test_vqdmlsl_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {
@@ -4007,14 +3003,11 @@ int32x4_t test_vqdmlsl_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {
// CHECK-LABEL: @test_vqdmlsl_high_n_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]])
-// CHECK-NEXT: [[VQDMLSL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I_I]])
+// CHECK-NEXT: [[VQDMLSL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A:%.*]], <2 x i64> [[VQDMLAL2_I_I]])
// CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I_I]]
//
int64x2_t test_vqdmlsl_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) {
@@ -4023,8 +3016,8 @@ int64x2_t test_vqdmlsl_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) {
// CHECK-LABEL: @test_vmul_n_f32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x float> poison, float [[B:%.*]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float [[B]], i32 1
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x float> poison, float [[B:%.*]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x float> [[VECINIT_I]], <2 x float> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[MUL_I:%.*]] = fmul <2 x float> [[A:%.*]], [[VECINIT1_I]]
// CHECK-NEXT: ret <2 x float> [[MUL_I]]
//
@@ -4034,10 +3027,8 @@ float32x2_t test_vmul_n_f32(float32x2_t a, float32_t b) {
// CHECK-LABEL: @test_vmulq_n_f32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float [[B:%.*]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float [[B]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float [[B]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float [[B]], i32 3
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float [[B:%.*]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x float> [[VECINIT_I]], <4 x float> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL_I:%.*]] = fmul <4 x float> [[A:%.*]], [[VECINIT3_I]]
// CHECK-NEXT: ret <4 x float> [[MUL_I]]
//
@@ -4047,8 +3038,8 @@ float32x4_t test_vmulq_n_f32(float32x4_t a, float32_t b) {
// CHECK-LABEL: @test_vmulq_n_f64(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x double> poison, double [[B:%.*]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double [[B]], i32 1
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x double> poison, double [[B:%.*]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x double> [[VECINIT_I]], <2 x double> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[MUL_I:%.*]] = fmul <2 x double> [[A:%.*]], [[VECINIT1_I]]
// CHECK-NEXT: ret <2 x double> [[MUL_I]]
//
@@ -4058,13 +3049,10 @@ float64x2_t test_vmulq_n_f64(float64x2_t a, float64_t b) {
// CHECK-LABEL: @test_vfma_n_f32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x float> poison, float [[N:%.*]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float [[N]], i32 1
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[VECINIT1_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[B]], <2 x float> [[VECINIT1_I]], <2 x float> [[A]])
-// CHECK-NEXT: ret <2 x float> [[TMP3]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x float> poison, float [[N:%.*]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x float> [[VECINIT_I]], <2 x float> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[B:%.*]], <2 x float> [[VECINIT1_I]], <2 x float> [[A:%.*]])
+// CHECK-NEXT: ret <2 x float> [[TMP0]]
//
float32x2_t test_vfma_n_f32(float32x2_t a, float32x2_t b, float32_t n) {
return vfma_n_f32(a, b, n);
@@ -4072,12 +3060,9 @@ float32x2_t test_vfma_n_f32(float32x2_t a, float32x2_t b, float32_t n) {
// CHECK-LABEL: @test_vfma_n_f64(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <1 x double> poison, double [[N:%.*]], i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x double> [[VECINIT_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[B]], <1 x double> [[VECINIT_I]], <1 x double> [[A]])
-// CHECK-NEXT: ret <1 x double> [[TMP3]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <1 x double> poison, double [[N:%.*]], i64 0
+// CHECK-NEXT: [[TMP0:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[B:%.*]], <1 x double> [[VECINIT_I]], <1 x double> [[A:%.*]])
+// CHECK-NEXT: ret <1 x double> [[TMP0]]
//
float64x1_t test_vfma_n_f64(float64x1_t a, float64x1_t b, float64_t n) {
return vfma_n_f64(a, b, n);
@@ -4085,15 +3070,10 @@ float64x1_t test_vfma_n_f64(float64x1_t a, float64x1_t b, float64_t n) {
// CHECK-LABEL: @test_vfmaq_n_f32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float [[N:%.*]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float [[N]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float [[N]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float [[N]], i32 3
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[VECINIT3_I]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[B]], <4 x float> [[VECINIT3_I]], <4 x float> [[A]])
-// CHECK-NEXT: ret <4 x float> [[TMP3]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float [[N:%.*]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x float> [[VECINIT_I]], <4 x float> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[B:%.*]], <4 x float> [[VECINIT3_I]], <4 x float> [[A:%.*]])
+// CHECK-NEXT: ret <4 x float> [[TMP0]]
//
float32x4_t test_vfmaq_n_f32(float32x4_t a, float32x4_t b, float32_t n) {
return vfmaq_n_f32(a, b, n);
@@ -4102,13 +3082,10 @@ float32x4_t test_vfmaq_n_f32(float32x4_t a, float32x4_t b, float32_t n) {
// CHECK-LABEL: @test_vfms_n_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[FNEG_I:%.*]] = fneg <2 x float> [[B:%.*]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x float> poison, float [[N:%.*]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float [[N]], i32 1
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[FNEG_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[VECINIT1_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FNEG_I]], <2 x float> [[VECINIT1_I]], <2 x float> [[A]])
-// CHECK-NEXT: ret <2 x float> [[TMP3]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x float> poison, float [[N:%.*]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x float> [[VECINIT_I]], <2 x float> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FNEG_I]], <2 x float> [[VECINIT1_I]], <2 x float> [[A:%.*]])
+// CHECK-NEXT: ret <2 x float> [[TMP0]]
//
float32x2_t test_vfms_n_f32(float32x2_t a, float32x2_t b, float32_t n) {
return vfms_n_f32(a, b, n);
@@ -4117,12 +3094,9 @@ float32x2_t test_vfms_n_f32(float32x2_t a, float32x2_t b, float32_t n) {
// CHECK-LABEL: @test_vfms_n_f64(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[FNEG_I:%.*]] = fneg <1 x double> [[B:%.*]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <1 x double> poison, double [[N:%.*]], i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[FNEG_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x double> [[VECINIT_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[FNEG_I]], <1 x double> [[VECINIT_I]], <1 x double> [[A]])
-// CHECK-NEXT: ret <1 x double> [[TMP3]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <1 x double> poison, double [[N:%.*]], i64 0
+// CHECK-NEXT: [[TMP0:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[FNEG_I]], <1 x double> [[VECINIT_I]], <1 x double> [[A:%.*]])
+// CHECK-NEXT: ret <1 x double> [[TMP0]]
//
float64x1_t test_vfms_n_f64(float64x1_t a, float64x1_t b, float64_t n) {
return vfms_n_f64(a, b, n);
@@ -4131,15 +3105,10 @@ float64x1_t test_vfms_n_f64(float64x1_t a, float64x1_t b, float64_t n) {
// CHECK-LABEL: @test_vfmsq_n_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[FNEG_I:%.*]] = fneg <4 x float> [[B:%.*]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float [[N:%.*]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float [[N]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float [[N]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float [[N]], i32 3
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[FNEG_I]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[VECINIT3_I]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FNEG_I]], <4 x float> [[VECINIT3_I]], <4 x float> [[A]])
-// CHECK-NEXT: ret <4 x float> [[TMP3]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float [[N:%.*]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x float> [[VECINIT_I]], <4 x float> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FNEG_I]], <4 x float> [[VECINIT3_I]], <4 x float> [[A:%.*]])
+// CHECK-NEXT: ret <4 x float> [[TMP0]]
//
float32x4_t test_vfmsq_n_f32(float32x4_t a, float32x4_t b, float32_t n) {
return vfmsq_n_f32(a, b, n);
@@ -4147,10 +3116,8 @@ float32x4_t test_vfmsq_n_f32(float32x4_t a, float32x4_t b, float32_t n) {
// CHECK-LABEL: @test_vmul_n_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B:%.*]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B:%.*]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i16> [[A:%.*]], [[VECINIT3_I]]
// CHECK-NEXT: ret <4 x i16> [[MUL_I]]
//
@@ -4160,14 +3127,8 @@ int16x4_t test_vmul_n_s16(int16x4_t a, int16_t b) {
// CHECK-LABEL: @test_vmulq_n_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[B:%.*]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[B]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
-// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[B]], i32 4
-// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[B]], i32 5
-// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[B]], i32 6
-// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[B]], i32 7
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[B:%.*]], i64 0
+// CHECK-NEXT: [[VECINIT7_I:%.*]] = shufflevector <8 x i16> [[VECINIT_I]], <8 x i16> poison, <8 x i32> zeroinitializer
// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i16> [[A:%.*]], [[VECINIT7_I]]
// CHECK-NEXT: ret <8 x i16> [[MUL_I]]
//
@@ -4177,8 +3138,8 @@ int16x8_t test_vmulq_n_s16(int16x8_t a, int16_t b) {
// CHECK-LABEL: @test_vmul_n_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B:%.*]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B:%.*]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[MUL_I:%.*]] = mul <2 x i32> [[A:%.*]], [[VECINIT1_I]]
// CHECK-NEXT: ret <2 x i32> [[MUL_I]]
//
@@ -4188,10 +3149,8 @@ int32x2_t test_vmul_n_s32(int32x2_t a, int32_t b) {
// CHECK-LABEL: @test_vmulq_n_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[B:%.*]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[B]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[B]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[B]], i32 3
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[B:%.*]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i32> [[VECINIT_I]], <4 x i32> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i32> [[A:%.*]], [[VECINIT3_I]]
// CHECK-NEXT: ret <4 x i32> [[MUL_I]]
//
@@ -4201,10 +3160,8 @@ int32x4_t test_vmulq_n_s32(int32x4_t a, int32_t b) {
// CHECK-LABEL: @test_vmul_n_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B:%.*]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B:%.*]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i16> [[A:%.*]], [[VECINIT3_I]]
// CHECK-NEXT: ret <4 x i16> [[MUL_I]]
//
@@ -4214,14 +3171,8 @@ uint16x4_t test_vmul_n_u16(uint16x4_t a, uint16_t b) {
// CHECK-LABEL: @test_vmulq_n_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[B:%.*]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[B]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
-// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[B]], i32 4
-// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[B]], i32 5
-// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[B]], i32 6
-// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[B]], i32 7
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[B:%.*]], i64 0
+// CHECK-NEXT: [[VECINIT7_I:%.*]] = shufflevector <8 x i16> [[VECINIT_I]], <8 x i16> poison, <8 x i32> zeroinitializer
// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i16> [[A:%.*]], [[VECINIT7_I]]
// CHECK-NEXT: ret <8 x i16> [[MUL_I]]
//
@@ -4231,8 +3182,8 @@ uint16x8_t test_vmulq_n_u16(uint16x8_t a, uint16_t b) {
// CHECK-LABEL: @test_vmul_n_u32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B:%.*]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B:%.*]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[MUL_I:%.*]] = mul <2 x i32> [[A:%.*]], [[VECINIT1_I]]
// CHECK-NEXT: ret <2 x i32> [[MUL_I]]
//
@@ -4242,10 +3193,8 @@ uint32x2_t test_vmul_n_u32(uint32x2_t a, uint32_t b) {
// CHECK-LABEL: @test_vmulq_n_u32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[B:%.*]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[B]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[B]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[B]], i32 3
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[B:%.*]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i32> [[VECINIT_I]], <4 x i32> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i32> [[A:%.*]], [[VECINIT3_I]]
// CHECK-NEXT: ret <4 x i32> [[MUL_I]]
//
@@ -4255,13 +3204,9 @@ uint32x4_t test_vmulq_n_u32(uint32x4_t a, uint32_t b) {
// CHECK-LABEL: @test_vmull_n_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B:%.*]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[VECINIT3_I]])
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B:%.*]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A:%.*]], <4 x i16> [[VECINIT3_I]])
// CHECK-NEXT: ret <4 x i32> [[VMULL2_I_I]]
//
int32x4_t test_vmull_n_s16(int16x4_t a, int16_t b) {
@@ -4270,11 +3215,9 @@ int32x4_t test_vmull_n_s16(int16x4_t a, int16_t b) {
// CHECK-LABEL: @test_vmull_n_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B:%.*]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[VECINIT1_I]])
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B:%.*]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A:%.*]], <2 x i32> [[VECINIT1_I]])
// CHECK-NEXT: ret <2 x i64> [[VMULL2_I_I]]
//
int64x2_t test_vmull_n_s32(int32x2_t a, int32_t b) {
@@ -4283,13 +3226,9 @@ int64x2_t test_vmull_n_s32(int32x2_t a, int32_t b) {
// CHECK-LABEL: @test_vmull_n_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B:%.*]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[VECINIT3_I]])
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B:%.*]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A:%.*]], <4 x i16> [[VECINIT3_I]])
// CHECK-NEXT: ret <4 x i32> [[VMULL2_I_I]]
//
uint32x4_t test_vmull_n_u16(uint16x4_t a, uint16_t b) {
@@ -4298,11 +3237,9 @@ uint32x4_t test_vmull_n_u16(uint16x4_t a, uint16_t b) {
// CHECK-LABEL: @test_vmull_n_u32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B:%.*]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[VECINIT1_I]])
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B:%.*]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A:%.*]], <2 x i32> [[VECINIT1_I]])
// CHECK-NEXT: ret <2 x i64> [[VMULL2_I_I]]
//
uint64x2_t test_vmull_n_u32(uint32x2_t a, uint32_t b) {
@@ -4311,14 +3248,9 @@ uint64x2_t test_vmull_n_u32(uint32x2_t a, uint32_t b) {
// CHECK-LABEL: @test_vqdmull_n_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B:%.*]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
-// CHECK-NEXT: [[VQDMULL_V2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[VECINIT3_I]])
-// CHECK-NEXT: [[VQDMULL_V3_I_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I_I]] to <16 x i8>
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B:%.*]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[VQDMULL_V2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A:%.*]], <4 x i16> [[VECINIT3_I]])
// CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I_I]]
//
int32x4_t test_vqdmull_n_s16(int16x4_t a, int16_t b) {
@@ -4327,12 +3259,9 @@ int32x4_t test_vqdmull_n_s16(int16x4_t a, int16_t b) {
// CHECK-LABEL: @test_vqdmull_n_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B:%.*]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
-// CHECK-NEXT: [[VQDMULL_V2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[VECINIT1_I]])
-// CHECK-NEXT: [[VQDMULL_V3_I_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I_I]] to <16 x i8>
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B:%.*]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[VQDMULL_V2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A:%.*]], <2 x i32> [[VECINIT1_I]])
// CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I_I]]
//
int64x2_t test_vqdmull_n_s32(int32x2_t a, int32_t b) {
@@ -4341,14 +3270,9 @@ int64x2_t test_vqdmull_n_s32(int32x2_t a, int32_t b) {
// CHECK-LABEL: @test_vqdmulh_n_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B:%.*]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
-// CHECK-NEXT: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[VECINIT3_I]])
-// CHECK-NEXT: [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B:%.*]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[VECINIT3_I]])
// CHECK-NEXT: ret <4 x i16> [[VQDMULH_V2_I]]
//
int16x4_t test_vqdmulh_n_s16(int16x4_t a, int16_t b) {
@@ -4357,18 +3281,9 @@ int16x4_t test_vqdmulh_n_s16(int16x4_t a, int16_t b) {
// CHECK-LABEL: @test_vqdmulhq_n_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[B:%.*]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[B]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
-// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[B]], i32 4
-// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[B]], i32 5
-// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[B]], i32 6
-// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[B]], i32 7
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to <16 x i8>
-// CHECK-NEXT: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[VECINIT7_I]])
-// CHECK-NEXT: [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[B:%.*]], i64 0
+// CHECK-NEXT: [[VECINIT7_I:%.*]] = shufflevector <8 x i16> [[VECINIT_I]], <8 x i16> poison, <8 x i32> zeroinitializer
+// CHECK-NEXT: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[VECINIT7_I]])
// CHECK-NEXT: ret <8 x i16> [[VQDMULHQ_V2_I]]
//
int16x8_t test_vqdmulhq_n_s16(int16x8_t a, int16_t b) {
@@ -4377,12 +3292,9 @@ int16x8_t test_vqdmulhq_n_s16(int16x8_t a, int16_t b) {
// CHECK-LABEL: @test_vqdmulh_n_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B:%.*]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
-// CHECK-NEXT: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[VECINIT1_I]])
-// CHECK-NEXT: [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B:%.*]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[VECINIT1_I]])
// CHECK-NEXT: ret <2 x i32> [[VQDMULH_V2_I]]
//
int32x2_t test_vqdmulh_n_s32(int32x2_t a, int32_t b) {
@@ -4391,14 +3303,9 @@ int32x2_t test_vqdmulh_n_s32(int32x2_t a, int32_t b) {
// CHECK-LABEL: @test_vqdmulhq_n_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[B:%.*]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[B]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[B]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[B]], i32 3
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT3_I]] to <16 x i8>
-// CHECK-NEXT: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[VECINIT3_I]])
-// CHECK-NEXT: [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[B:%.*]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i32> [[VECINIT_I]], <4 x i32> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[VECINIT3_I]])
// CHECK-NEXT: ret <4 x i32> [[VQDMULHQ_V2_I]]
//
int32x4_t test_vqdmulhq_n_s32(int32x4_t a, int32_t b) {
@@ -4407,14 +3314,9 @@ int32x4_t test_vqdmulhq_n_s32(int32x4_t a, int32_t b) {
// CHECK-LABEL: @test_vqrdmulh_n_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B:%.*]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
-// CHECK-NEXT: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[VECINIT3_I]])
-// CHECK-NEXT: [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B:%.*]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[VECINIT3_I]])
// CHECK-NEXT: ret <4 x i16> [[VQRDMULH_V2_I]]
//
int16x4_t test_vqrdmulh_n_s16(int16x4_t a, int16_t b) {
@@ -4423,18 +3325,9 @@ int16x4_t test_vqrdmulh_n_s16(int16x4_t a, int16_t b) {
// CHECK-LABEL: @test_vqrdmulhq_n_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[B:%.*]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[B]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
-// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[B]], i32 4
-// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[B]], i32 5
-// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[B]], i32 6
-// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[B]], i32 7
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to <16 x i8>
-// CHECK-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[VECINIT7_I]])
-// CHECK-NEXT: [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[B:%.*]], i64 0
+// CHECK-NEXT: [[VECINIT7_I:%.*]] = shufflevector <8 x i16> [[VECINIT_I]], <8 x i16> poison, <8 x i32> zeroinitializer
+// CHECK-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[VECINIT7_I]])
// CHECK-NEXT: ret <8 x i16> [[VQRDMULHQ_V2_I]]
//
int16x8_t test_vqrdmulhq_n_s16(int16x8_t a, int16_t b) {
@@ -4443,12 +3336,9 @@ int16x8_t test_vqrdmulhq_n_s16(int16x8_t a, int16_t b) {
// CHECK-LABEL: @test_vqrdmulh_n_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B:%.*]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
-// CHECK-NEXT: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[VECINIT1_I]])
-// CHECK-NEXT: [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B:%.*]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[VECINIT1_I]])
// CHECK-NEXT: ret <2 x i32> [[VQRDMULH_V2_I]]
//
int32x2_t test_vqrdmulh_n_s32(int32x2_t a, int32_t b) {
@@ -4457,14 +3347,9 @@ int32x2_t test_vqrdmulh_n_s32(int32x2_t a, int32_t b) {
// CHECK-LABEL: @test_vqrdmulhq_n_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[B:%.*]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[B]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[B]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[B]], i32 3
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT3_I]] to <16 x i8>
-// CHECK-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[VECINIT3_I]])
-// CHECK-NEXT: [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[B:%.*]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i32> [[VECINIT_I]], <4 x i32> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[VECINIT3_I]])
// CHECK-NEXT: ret <4 x i32> [[VQRDMULHQ_V2_I]]
//
int32x4_t test_vqrdmulhq_n_s32(int32x4_t a, int32_t b) {
@@ -4473,10 +3358,8 @@ int32x4_t test_vqrdmulhq_n_s32(int32x4_t a, int32_t b) {
// CHECK-LABEL: @test_vmla_n_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i16> [[B:%.*]], [[VECINIT3_I]]
// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i16> [[A:%.*]], [[MUL_I]]
// CHECK-NEXT: ret <4 x i16> [[ADD_I]]
@@ -4487,14 +3370,8 @@ int16x4_t test_vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c) {
// CHECK-LABEL: @test_vmlaq_n_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[C:%.*]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[C]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
-// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[C]], i32 4
-// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[C]], i32 5
-// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[C]], i32 6
-// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[C]], i32 7
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[C:%.*]], i64 0
+// CHECK-NEXT: [[VECINIT7_I:%.*]] = shufflevector <8 x i16> [[VECINIT_I]], <8 x i16> poison, <8 x i32> zeroinitializer
// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i16> [[B:%.*]], [[VECINIT7_I]]
// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[A:%.*]], [[MUL_I]]
// CHECK-NEXT: ret <8 x i16> [[ADD_I]]
@@ -4505,8 +3382,8 @@ int16x8_t test_vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c) {
// CHECK-LABEL: @test_vmla_n_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[MUL_I:%.*]] = mul <2 x i32> [[B:%.*]], [[VECINIT1_I]]
// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i32> [[A:%.*]], [[MUL_I]]
// CHECK-NEXT: ret <2 x i32> [[ADD_I]]
@@ -4517,10 +3394,8 @@ int32x2_t test_vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c) {
// CHECK-LABEL: @test_vmlaq_n_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[C:%.*]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[C]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[C]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[C]], i32 3
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[C:%.*]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i32> [[VECINIT_I]], <4 x i32> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i32> [[B:%.*]], [[VECINIT3_I]]
// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A:%.*]], [[MUL_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
@@ -4531,10 +3406,8 @@ int32x4_t test_vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c) {
// CHECK-LABEL: @test_vmla_n_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i16> [[B:%.*]], [[VECINIT3_I]]
// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i16> [[A:%.*]], [[MUL_I]]
// CHECK-NEXT: ret <4 x i16> [[ADD_I]]
@@ -4545,14 +3418,8 @@ uint16x4_t test_vmla_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) {
// CHECK-LABEL: @test_vmlaq_n_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[C:%.*]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[C]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
-// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[C]], i32 4
-// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[C]], i32 5
-// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[C]], i32 6
-// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[C]], i32 7
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[C:%.*]], i64 0
+// CHECK-NEXT: [[VECINIT7_I:%.*]] = shufflevector <8 x i16> [[VECINIT_I]], <8 x i16> poison, <8 x i32> zeroinitializer
// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i16> [[B:%.*]], [[VECINIT7_I]]
// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[A:%.*]], [[MUL_I]]
// CHECK-NEXT: ret <8 x i16> [[ADD_I]]
@@ -4563,8 +3430,8 @@ uint16x8_t test_vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) {
// CHECK-LABEL: @test_vmla_n_u32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[MUL_I:%.*]] = mul <2 x i32> [[B:%.*]], [[VECINIT1_I]]
// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i32> [[A:%.*]], [[MUL_I]]
// CHECK-NEXT: ret <2 x i32> [[ADD_I]]
@@ -4575,10 +3442,8 @@ uint32x2_t test_vmla_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) {
// CHECK-LABEL: @test_vmlaq_n_u32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[C:%.*]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[C]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[C]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[C]], i32 3
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[C:%.*]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i32> [[VECINIT_I]], <4 x i32> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i32> [[B:%.*]], [[VECINIT3_I]]
// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A:%.*]], [[MUL_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
@@ -4589,13 +3454,9 @@ uint32x4_t test_vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) {
// CHECK-LABEL: @test_vmlal_n_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]])
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B:%.*]], <4 x i16> [[VECINIT3_I]])
// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
//
@@ -4605,11 +3466,9 @@ int32x4_t test_vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
// CHECK-LABEL: @test_vmlal_n_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]])
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B:%.*]], <2 x i32> [[VECINIT1_I]])
// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I_I]]
// CHECK-NEXT: ret <2 x i64> [[ADD_I]]
//
@@ -4619,13 +3478,9 @@ int64x2_t test_vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
// CHECK-LABEL: @test_vmlal_n_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]])
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B:%.*]], <4 x i16> [[VECINIT3_I]])
// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
//
@@ -4635,11 +3490,9 @@ uint32x4_t test_vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) {
// CHECK-LABEL: @test_vmlal_n_u32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]])
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B:%.*]], <2 x i32> [[VECINIT1_I]])
// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I_I]]
// CHECK-NEXT: ret <2 x i64> [[ADD_I]]
//
@@ -4649,15 +3502,10 @@ uint64x2_t test_vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) {
// CHECK-LABEL: @test_vqdmlal_n_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
-// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]])
-// CHECK-NEXT: [[VQDMLAL_V3_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I_I]])
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B:%.*]], <4 x i16> [[VECINIT3_I]])
+// CHECK-NEXT: [[VQDMLAL_V3_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[VQDMLAL2_I_I]])
// CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I_I]]
//
int32x4_t test_vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
@@ -4666,13 +3514,10 @@ int32x4_t test_vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
// CHECK-LABEL: @test_vqdmlal_n_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
-// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]])
-// CHECK-NEXT: [[VQDMLAL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I_I]])
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B:%.*]], <2 x i32> [[VECINIT1_I]])
+// CHECK-NEXT: [[VQDMLAL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A:%.*]], <2 x i64> [[VQDMLAL2_I_I]])
// CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I_I]]
//
int64x2_t test_vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
@@ -4681,10 +3526,8 @@ int64x2_t test_vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
// CHECK-LABEL: @test_vmls_n_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i16> [[B:%.*]], [[VECINIT3_I]]
// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL_I]]
// CHECK-NEXT: ret <4 x i16> [[SUB_I]]
@@ -4695,14 +3538,8 @@ int16x4_t test_vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c) {
// CHECK-LABEL: @test_vmlsq_n_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[C:%.*]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[C]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
-// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[C]], i32 4
-// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[C]], i32 5
-// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[C]], i32 6
-// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[C]], i32 7
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[C:%.*]], i64 0
+// CHECK-NEXT: [[VECINIT7_I:%.*]] = shufflevector <8 x i16> [[VECINIT_I]], <8 x i16> poison, <8 x i32> zeroinitializer
// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i16> [[B:%.*]], [[VECINIT7_I]]
// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL_I]]
// CHECK-NEXT: ret <8 x i16> [[SUB_I]]
@@ -4713,8 +3550,8 @@ int16x8_t test_vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c) {
// CHECK-LABEL: @test_vmls_n_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[MUL_I:%.*]] = mul <2 x i32> [[B:%.*]], [[VECINIT1_I]]
// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL_I]]
// CHECK-NEXT: ret <2 x i32> [[SUB_I]]
@@ -4725,10 +3562,8 @@ int32x2_t test_vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c) {
// CHECK-LABEL: @test_vmlsq_n_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[C:%.*]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[C]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[C]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[C]], i32 3
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[C:%.*]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i32> [[VECINIT_I]], <4 x i32> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i32> [[B:%.*]], [[VECINIT3_I]]
// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
@@ -4739,10 +3574,8 @@ int32x4_t test_vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c) {
// CHECK-LABEL: @test_vmls_n_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i16> [[B:%.*]], [[VECINIT3_I]]
// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL_I]]
// CHECK-NEXT: ret <4 x i16> [[SUB_I]]
@@ -4753,14 +3586,8 @@ uint16x4_t test_vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) {
// CHECK-LABEL: @test_vmlsq_n_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[C:%.*]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[C]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
-// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[C]], i32 4
-// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[C]], i32 5
-// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[C]], i32 6
-// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[C]], i32 7
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[C:%.*]], i64 0
+// CHECK-NEXT: [[VECINIT7_I:%.*]] = shufflevector <8 x i16> [[VECINIT_I]], <8 x i16> poison, <8 x i32> zeroinitializer
// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i16> [[B:%.*]], [[VECINIT7_I]]
// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL_I]]
// CHECK-NEXT: ret <8 x i16> [[SUB_I]]
@@ -4771,8 +3598,8 @@ uint16x8_t test_vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) {
// CHECK-LABEL: @test_vmls_n_u32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[MUL_I:%.*]] = mul <2 x i32> [[B:%.*]], [[VECINIT1_I]]
// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL_I]]
// CHECK-NEXT: ret <2 x i32> [[SUB_I]]
@@ -4783,10 +3610,8 @@ uint32x2_t test_vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) {
// CHECK-LABEL: @test_vmlsq_n_u32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[C:%.*]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[C]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[C]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[C]], i32 3
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[C:%.*]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i32> [[VECINIT_I]], <4 x i32> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i32> [[B:%.*]], [[VECINIT3_I]]
// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
@@ -4797,13 +3622,9 @@ uint32x4_t test_vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) {
// CHECK-LABEL: @test_vmlsl_n_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]])
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B:%.*]], <4 x i16> [[VECINIT3_I]])
// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
//
@@ -4813,11 +3634,9 @@ int32x4_t test_vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
// CHECK-LABEL: @test_vmlsl_n_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]])
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B:%.*]], <2 x i32> [[VECINIT1_I]])
// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I_I]]
// CHECK-NEXT: ret <2 x i64> [[SUB_I]]
//
@@ -4827,13 +3646,9 @@ int64x2_t test_vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
// CHECK-LABEL: @test_vmlsl_n_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]])
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B:%.*]], <4 x i16> [[VECINIT3_I]])
// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
//
@@ -4843,11 +3658,9 @@ uint32x4_t test_vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) {
// CHECK-LABEL: @test_vmlsl_n_u32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]])
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B:%.*]], <2 x i32> [[VECINIT1_I]])
// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I_I]]
// CHECK-NEXT: ret <2 x i64> [[SUB_I]]
//
@@ -4857,15 +3670,10 @@ uint64x2_t test_vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) {
// CHECK-LABEL: @test_vqdmlsl_n_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
-// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]])
-// CHECK-NEXT: [[VQDMLSL_V3_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I_I]])
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B:%.*]], <4 x i16> [[VECINIT3_I]])
+// CHECK-NEXT: [[VQDMLSL_V3_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[VQDMLAL2_I_I]])
// CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I_I]]
//
int32x4_t test_vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
@@ -4874,13 +3682,10 @@ int32x4_t test_vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
// CHECK-LABEL: @test_vqdmlsl_n_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
-// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]])
-// CHECK-NEXT: [[VQDMLSL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I_I]])
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B:%.*]], <2 x i32> [[VECINIT1_I]])
+// CHECK-NEXT: [[VQDMLSL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A:%.*]], <2 x i64> [[VQDMLAL2_I_I]])
// CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I_I]]
//
int64x2_t test_vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
@@ -4889,9 +3694,7 @@ int64x2_t test_vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
// CHECK-LABEL: @test_vmla_lane_u16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i16> [[ADD]]
@@ -4902,9 +3705,7 @@ uint16x4_t test_vmla_lane_u16_0(uint16x4_t a, uint16x4_t b, uint16x4_t v) {
// CHECK-LABEL: @test_vmlaq_lane_u16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <8 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <8 x i16> [[ADD]]
@@ -4915,9 +3716,7 @@ uint16x8_t test_vmlaq_lane_u16_0(uint16x8_t a, uint16x8_t b, uint16x4_t v) {
// CHECK-LABEL: @test_vmla_lane_u32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <2 x i32> [[ADD]]
@@ -4928,9 +3727,7 @@ uint32x2_t test_vmla_lane_u32_0(uint32x2_t a, uint32x2_t b, uint32x2_t v) {
// CHECK-LABEL: @test_vmlaq_lane_u32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
@@ -4941,9 +3738,7 @@ uint32x4_t test_vmlaq_lane_u32_0(uint32x4_t a, uint32x4_t b, uint32x2_t v) {
// CHECK-LABEL: @test_vmla_laneq_u16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i16> [[ADD]]
@@ -4954,9 +3749,7 @@ uint16x4_t test_vmla_laneq_u16_0(uint16x4_t a, uint16x4_t b, uint16x8_t v) {
// CHECK-LABEL: @test_vmlaq_laneq_u16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <8 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <8 x i16> [[ADD]]
@@ -4967,9 +3760,7 @@ uint16x8_t test_vmlaq_laneq_u16_0(uint16x8_t a, uint16x8_t b, uint16x8_t v) {
// CHECK-LABEL: @test_vmla_laneq_u32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <2 x i32> [[ADD]]
@@ -4980,9 +3771,7 @@ uint32x2_t test_vmla_laneq_u32_0(uint32x2_t a, uint32x2_t b, uint32x4_t v) {
// CHECK-LABEL: @test_vmlaq_laneq_u32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
@@ -4993,14 +3782,9 @@ uint32x4_t test_vmlaq_laneq_u32_0(uint32x4_t a, uint32x4_t b, uint32x4_t v) {
// CHECK-LABEL: @test_vqdmlal_laneq_s16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
-// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B:%.*]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]]
//
int32x4_t test_vqdmlal_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
@@ -5009,14 +3793,9 @@ int32x4_t test_vqdmlal_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
// CHECK-LABEL: @test_vqdmlal_laneq_s32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
-// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B:%.*]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A:%.*]], <2 x i64> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]]
//
int64x2_t test_vqdmlal_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
@@ -5025,15 +3804,10 @@ int64x2_t test_vqdmlal_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
// CHECK-LABEL: @test_vqdmlal_high_laneq_s16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
-// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
+// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]]
//
int32x4_t test_vqdmlal_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
@@ -5042,15 +3816,10 @@ int32x4_t test_vqdmlal_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
// CHECK-LABEL: @test_vqdmlal_high_laneq_s32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
-// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
+// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A:%.*]], <2 x i64> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]]
//
int64x2_t test_vqdmlal_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
@@ -5059,9 +3828,7 @@ int64x2_t test_vqdmlal_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
// CHECK-LABEL: @test_vmls_lane_u16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i16> [[SUB]]
@@ -5072,9 +3839,7 @@ uint16x4_t test_vmls_lane_u16_0(uint16x4_t a, uint16x4_t b, uint16x4_t v) {
// CHECK-LABEL: @test_vmlsq_lane_u16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <8 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <8 x i16> [[SUB]]
@@ -5085,9 +3850,7 @@ uint16x8_t test_vmlsq_lane_u16_0(uint16x8_t a, uint16x8_t b, uint16x4_t v) {
// CHECK-LABEL: @test_vmls_lane_u32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <2 x i32> [[SUB]]
@@ -5098,9 +3861,7 @@ uint32x2_t test_vmls_lane_u32_0(uint32x2_t a, uint32x2_t b, uint32x2_t v) {
// CHECK-LABEL: @test_vmlsq_lane_u32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
@@ -5111,9 +3872,7 @@ uint32x4_t test_vmlsq_lane_u32_0(uint32x4_t a, uint32x4_t b, uint32x2_t v) {
// CHECK-LABEL: @test_vmls_laneq_u16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i16> [[SUB]]
@@ -5124,9 +3883,7 @@ uint16x4_t test_vmls_laneq_u16_0(uint16x4_t a, uint16x4_t b, uint16x8_t v) {
// CHECK-LABEL: @test_vmlsq_laneq_u16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <8 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <8 x i16> [[SUB]]
@@ -5137,9 +3894,7 @@ uint16x8_t test_vmlsq_laneq_u16_0(uint16x8_t a, uint16x8_t b, uint16x8_t v) {
// CHECK-LABEL: @test_vmls_laneq_u32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <2 x i32> [[SUB]]
@@ -5150,9 +3905,7 @@ uint32x2_t test_vmls_laneq_u32_0(uint32x2_t a, uint32x2_t b, uint32x4_t v) {
// CHECK-LABEL: @test_vmlsq_laneq_u32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
@@ -5163,14 +3916,9 @@ uint32x4_t test_vmlsq_laneq_u32_0(uint32x4_t a, uint32x4_t b, uint32x4_t v) {
// CHECK-LABEL: @test_vqdmlsl_laneq_s16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
-// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B:%.*]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]]
//
int32x4_t test_vqdmlsl_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
@@ -5179,14 +3927,9 @@ int32x4_t test_vqdmlsl_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
// CHECK-LABEL: @test_vqdmlsl_laneq_s32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
-// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B:%.*]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A:%.*]], <2 x i64> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]]
//
int64x2_t test_vqdmlsl_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
@@ -5195,15 +3938,10 @@ int64x2_t test_vqdmlsl_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
// CHECK-LABEL: @test_vqdmlsl_high_laneq_s16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
-// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
+// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]]
//
int32x4_t test_vqdmlsl_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
@@ -5212,15 +3950,10 @@ int32x4_t test_vqdmlsl_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
// CHECK-LABEL: @test_vqdmlsl_high_laneq_s32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
-// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
+// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A:%.*]], <2 x i64> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]]
//
int64x2_t test_vqdmlsl_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
@@ -5229,11 +3962,7 @@ int64x2_t test_vqdmlsl_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
// CHECK-LABEL: @test_vqdmulh_laneq_s16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[VQDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[VQDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK-NEXT: [[VQDMULH_LANEQ_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v4i16.v8i16(<4 x i16> [[VQDMULH_LANEQ_V]], <8 x i16> [[VQDMULH_LANEQ_V1]], i32 0)
+// CHECK-NEXT: [[VQDMULH_LANEQ_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v4i16.v8i16(<4 x i16> [[A:%.*]], <8 x i16> [[V:%.*]], i32 0)
// CHECK-NEXT: ret <4 x i16> [[VQDMULH_LANEQ_V2]]
//
int16x4_t test_vqdmulh_laneq_s16_0(int16x4_t a, int16x8_t v) {
@@ -5242,11 +3971,7 @@ int16x4_t test_vqdmulh_laneq_s16_0(int16x4_t a, int16x8_t v) {
// CHECK-LABEL: @test_vqdmulhq_laneq_s16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[VQDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[VQDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK-NEXT: [[VQDMULHQ_LANEQ_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v8i16.v8i16(<8 x i16> [[VQDMULHQ_LANEQ_V]], <8 x i16> [[VQDMULHQ_LANEQ_V1]], i32 0)
+// CHECK-NEXT: [[VQDMULHQ_LANEQ_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v8i16.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[V:%.*]], i32 0)
// CHECK-NEXT: ret <8 x i16> [[VQDMULHQ_LANEQ_V2]]
//
int16x8_t test_vqdmulhq_laneq_s16_0(int16x8_t a, int16x8_t v) {
@@ -5255,11 +3980,7 @@ int16x8_t test_vqdmulhq_laneq_s16_0(int16x8_t a, int16x8_t v) {
// CHECK-LABEL: @test_vqdmulh_laneq_s32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[VQDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[VQDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK-NEXT: [[VQDMULH_LANEQ_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v2i32.v4i32(<2 x i32> [[VQDMULH_LANEQ_V]], <4 x i32> [[VQDMULH_LANEQ_V1]], i32 0)
+// CHECK-NEXT: [[VQDMULH_LANEQ_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v2i32.v4i32(<2 x i32> [[A:%.*]], <4 x i32> [[V:%.*]], i32 0)
// CHECK-NEXT: ret <2 x i32> [[VQDMULH_LANEQ_V2]]
//
int32x2_t test_vqdmulh_laneq_s32_0(int32x2_t a, int32x4_t v) {
@@ -5268,11 +3989,7 @@ int32x2_t test_vqdmulh_laneq_s32_0(int32x2_t a, int32x4_t v) {
// CHECK-LABEL: @test_vqdmulhq_laneq_s32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[VQDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[VQDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK-NEXT: [[VQDMULHQ_LANEQ_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v4i32.v4i32(<4 x i32> [[VQDMULHQ_LANEQ_V]], <4 x i32> [[VQDMULHQ_LANEQ_V1]], i32 0)
+// CHECK-NEXT: [[VQDMULHQ_LANEQ_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v4i32.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[V:%.*]], i32 0)
// CHECK-NEXT: ret <4 x i32> [[VQDMULHQ_LANEQ_V2]]
//
int32x4_t test_vqdmulhq_laneq_s32_0(int32x4_t a, int32x4_t v) {
@@ -5281,11 +3998,7 @@ int32x4_t test_vqdmulhq_laneq_s32_0(int32x4_t a, int32x4_t v) {
// CHECK-LABEL: @test_vqrdmulh_laneq_s16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[VQRDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[VQRDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK-NEXT: [[VQRDMULH_LANEQ_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v4i16.v8i16(<4 x i16> [[VQRDMULH_LANEQ_V]], <8 x i16> [[VQRDMULH_LANEQ_V1]], i32 0)
+// CHECK-NEXT: [[VQRDMULH_LANEQ_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v4i16.v8i16(<4 x i16> [[A:%.*]], <8 x i16> [[V:%.*]], i32 0)
// CHECK-NEXT: ret <4 x i16> [[VQRDMULH_LANEQ_V2]]
//
int16x4_t test_vqrdmulh_laneq_s16_0(int16x4_t a, int16x8_t v) {
@@ -5294,11 +4007,7 @@ int16x4_t test_vqrdmulh_laneq_s16_0(int16x4_t a, int16x8_t v) {
// CHECK-LABEL: @test_vqrdmulhq_laneq_s16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v8i16.v8i16(<8 x i16> [[VQRDMULHQ_LANEQ_V]], <8 x i16> [[VQRDMULHQ_LANEQ_V1]], i32 0)
+// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v8i16.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[V:%.*]], i32 0)
// CHECK-NEXT: ret <8 x i16> [[VQRDMULHQ_LANEQ_V2]]
//
int16x8_t test_vqrdmulhq_laneq_s16_0(int16x8_t a, int16x8_t v) {
@@ -5307,11 +4016,7 @@ int16x8_t test_vqrdmulhq_laneq_s16_0(int16x8_t a, int16x8_t v) {
// CHECK-LABEL: @test_vqrdmulh_laneq_s32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[VQRDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[VQRDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK-NEXT: [[VQRDMULH_LANEQ_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v2i32.v4i32(<2 x i32> [[VQRDMULH_LANEQ_V]], <4 x i32> [[VQRDMULH_LANEQ_V1]], i32 0)
+// CHECK-NEXT: [[VQRDMULH_LANEQ_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v2i32.v4i32(<2 x i32> [[A:%.*]], <4 x i32> [[V:%.*]], i32 0)
// CHECK-NEXT: ret <2 x i32> [[VQRDMULH_LANEQ_V2]]
//
int32x2_t test_vqrdmulh_laneq_s32_0(int32x2_t a, int32x4_t v) {
@@ -5320,11 +4025,7 @@ int32x2_t test_vqrdmulh_laneq_s32_0(int32x2_t a, int32x4_t v) {
// CHECK-LABEL: @test_vqrdmulhq_laneq_s32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v4i32.v4i32(<4 x i32> [[VQRDMULHQ_LANEQ_V]], <4 x i32> [[VQRDMULHQ_LANEQ_V1]], i32 0)
+// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v4i32.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[V:%.*]], i32 0)
// CHECK-NEXT: ret <4 x i32> [[VQRDMULHQ_LANEQ_V2]]
//
int32x4_t test_vqrdmulhq_laneq_s32_0(int32x4_t a, int32x4_t v) {
@@ -5333,9 +4034,7 @@ int32x4_t test_vqrdmulhq_laneq_s32_0(int32x4_t a, int32x4_t v) {
// CHECK-LABEL: @test_vmla_lane_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i16> [[ADD]]
@@ -5346,9 +4045,7 @@ uint16x4_t test_vmla_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v) {
// CHECK-LABEL: @test_vmlaq_lane_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <8 x i16> [[ADD]]
@@ -5359,9 +4056,7 @@ uint16x8_t test_vmlaq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v) {
// CHECK-LABEL: @test_vmla_lane_u32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <2 x i32> [[ADD]]
@@ -5372,9 +4067,7 @@ uint32x2_t test_vmla_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v) {
// CHECK-LABEL: @test_vmlaq_lane_u32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
@@ -5385,9 +4078,7 @@ uint32x4_t test_vmlaq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v) {
// CHECK-LABEL: @test_vmla_laneq_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i16> [[ADD]]
@@ -5398,9 +4089,7 @@ uint16x4_t test_vmla_laneq_u16(uint16x4_t a, uint16x4_t b, uint16x8_t v) {
// CHECK-LABEL: @test_vmlaq_laneq_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <8 x i16> [[ADD]]
@@ -5411,9 +4100,7 @@ uint16x8_t test_vmlaq_laneq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t v) {
// CHECK-LABEL: @test_vmla_laneq_u32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32> <i32 3, i32 3>
// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <2 x i32> [[ADD]]
@@ -5424,9 +4111,7 @@ uint32x2_t test_vmla_laneq_u32(uint32x2_t a, uint32x2_t b, uint32x4_t v) {
// CHECK-LABEL: @test_vmlaq_laneq_u32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
@@ -5437,14 +4122,9 @@ uint32x4_t test_vmlaq_laneq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t v) {
// CHECK-LABEL: @test_vqdmlal_laneq_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
-// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B:%.*]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]]
//
int32x4_t test_vqdmlal_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
@@ -5453,14 +4133,9 @@ int32x4_t test_vqdmlal_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
// CHECK-LABEL: @test_vqdmlal_laneq_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
-// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32> <i32 3, i32 3>
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B:%.*]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A:%.*]], <2 x i64> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]]
//
int64x2_t test_vqdmlal_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
@@ -5469,15 +4144,10 @@ int64x2_t test_vqdmlal_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
// CHECK-LABEL: @test_vqdmlal_high_laneq_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
-// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
+// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]]
//
int32x4_t test_vqdmlal_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
@@ -5486,15 +4156,10 @@ int32x4_t test_vqdmlal_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
// CHECK-LABEL: @test_vqdmlal_high_laneq_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32> <i32 3, i32 3>
// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
-// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
+// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A:%.*]], <2 x i64> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]]
//
int64x2_t test_vqdmlal_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {
@@ -5503,9 +4168,7 @@ int64x2_t test_vqdmlal_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {
// CHECK-LABEL: @test_vmls_lane_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i16> [[SUB]]
@@ -5516,9 +4179,7 @@ uint16x4_t test_vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v) {
// CHECK-LABEL: @test_vmlsq_lane_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <8 x i16> [[SUB]]
@@ -5529,9 +4190,7 @@ uint16x8_t test_vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v) {
// CHECK-LABEL: @test_vmls_lane_u32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <2 x i32> [[SUB]]
@@ -5542,9 +4201,7 @@ uint32x2_t test_vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v) {
// CHECK-LABEL: @test_vmlsq_lane_u32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
@@ -5555,9 +4212,7 @@ uint32x4_t test_vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v) {
// CHECK-LABEL: @test_vmls_laneq_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i16> [[SUB]]
@@ -5568,9 +4223,7 @@ uint16x4_t test_vmls_laneq_u16(uint16x4_t a, uint16x4_t b, uint16x8_t v) {
// CHECK-LABEL: @test_vmlsq_laneq_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <8 x i16> [[SUB]]
@@ -5581,9 +4234,7 @@ uint16x8_t test_vmlsq_laneq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t v) {
// CHECK-LABEL: @test_vmls_laneq_u32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32> <i32 3, i32 3>
// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <2 x i32> [[SUB]]
@@ -5594,9 +4245,7 @@ uint32x2_t test_vmls_laneq_u32(uint32x2_t a, uint32x2_t b, uint32x4_t v) {
// CHECK-LABEL: @test_vmlsq_laneq_u32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
@@ -5607,14 +4256,9 @@ uint32x4_t test_vmlsq_laneq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t v) {
// CHECK-LABEL: @test_vqdmlsl_laneq_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
-// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B:%.*]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]]
//
int32x4_t test_vqdmlsl_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
@@ -5623,14 +4267,9 @@ int32x4_t test_vqdmlsl_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
// CHECK-LABEL: @test_vqdmlsl_laneq_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
-// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32> <i32 3, i32 3>
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B:%.*]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A:%.*]], <2 x i64> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]]
//
int64x2_t test_vqdmlsl_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
@@ -5639,15 +4278,10 @@ int64x2_t test_vqdmlsl_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
// CHECK-LABEL: @test_vqdmlsl_high_laneq_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
-// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
+// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]]
//
int32x4_t test_vqdmlsl_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
@@ -5656,15 +4290,10 @@ int32x4_t test_vqdmlsl_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
// CHECK-LABEL: @test_vqdmlsl_high_laneq_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32> <i32 3, i32 3>
// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
-// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
+// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A:%.*]], <2 x i64> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]]
//
int64x2_t test_vqdmlsl_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {
@@ -5673,11 +4302,7 @@ int64x2_t test_vqdmlsl_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {
// CHECK-LABEL: @test_vqdmulh_laneq_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[VQDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[VQDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK-NEXT: [[VQDMULH_LANEQ_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v4i16.v8i16(<4 x i16> [[VQDMULH_LANEQ_V]], <8 x i16> [[VQDMULH_LANEQ_V1]], i32 7)
+// CHECK-NEXT: [[VQDMULH_LANEQ_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v4i16.v8i16(<4 x i16> [[A:%.*]], <8 x i16> [[V:%.*]], i32 7)
// CHECK-NEXT: ret <4 x i16> [[VQDMULH_LANEQ_V2]]
//
int16x4_t test_vqdmulh_laneq_s16(int16x4_t a, int16x8_t v) {
@@ -5686,11 +4311,7 @@ int16x4_t test_vqdmulh_laneq_s16(int16x4_t a, int16x8_t v) {
// CHECK-LABEL: @test_vqdmulhq_laneq_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[VQDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[VQDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK-NEXT: [[VQDMULHQ_LANEQ_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v8i16.v8i16(<8 x i16> [[VQDMULHQ_LANEQ_V]], <8 x i16> [[VQDMULHQ_LANEQ_V1]], i32 7)
+// CHECK-NEXT: [[VQDMULHQ_LANEQ_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v8i16.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[V:%.*]], i32 7)
// CHECK-NEXT: ret <8 x i16> [[VQDMULHQ_LANEQ_V2]]
//
int16x8_t test_vqdmulhq_laneq_s16(int16x8_t a, int16x8_t v) {
@@ -5699,11 +4320,7 @@ int16x8_t test_vqdmulhq_laneq_s16(int16x8_t a, int16x8_t v) {
// CHECK-LABEL: @test_vqdmulh_laneq_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[VQDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[VQDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK-NEXT: [[VQDMULH_LANEQ_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v2i32.v4i32(<2 x i32> [[VQDMULH_LANEQ_V]], <4 x i32> [[VQDMULH_LANEQ_V1]], i32 3)
+// CHECK-NEXT: [[VQDMULH_LANEQ_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v2i32.v4i32(<2 x i32> [[A:%.*]], <4 x i32> [[V:%.*]], i32 3)
// CHECK-NEXT: ret <2 x i32> [[VQDMULH_LANEQ_V2]]
//
int32x2_t test_vqdmulh_laneq_s32(int32x2_t a, int32x4_t v) {
@@ -5712,11 +4329,7 @@ int32x2_t test_vqdmulh_laneq_s32(int32x2_t a, int32x4_t v) {
// CHECK-LABEL: @test_vqdmulhq_laneq_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[VQDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[VQDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK-NEXT: [[VQDMULHQ_LANEQ_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v4i32.v4i32(<4 x i32> [[VQDMULHQ_LANEQ_V]], <4 x i32> [[VQDMULHQ_LANEQ_V1]], i32 3)
+// CHECK-NEXT: [[VQDMULHQ_LANEQ_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v4i32.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[V:%.*]], i32 3)
// CHECK-NEXT: ret <4 x i32> [[VQDMULHQ_LANEQ_V2]]
//
int32x4_t test_vqdmulhq_laneq_s32(int32x4_t a, int32x4_t v) {
@@ -5725,11 +4338,7 @@ int32x4_t test_vqdmulhq_laneq_s32(int32x4_t a, int32x4_t v) {
// CHECK-LABEL: @test_vqrdmulh_laneq_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[VQRDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[VQRDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK-NEXT: [[VQRDMULH_LANEQ_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v4i16.v8i16(<4 x i16> [[VQRDMULH_LANEQ_V]], <8 x i16> [[VQRDMULH_LANEQ_V1]], i32 7)
+// CHECK-NEXT: [[VQRDMULH_LANEQ_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v4i16.v8i16(<4 x i16> [[A:%.*]], <8 x i16> [[V:%.*]], i32 7)
// CHECK-NEXT: ret <4 x i16> [[VQRDMULH_LANEQ_V2]]
//
int16x4_t test_vqrdmulh_laneq_s16(int16x4_t a, int16x8_t v) {
@@ -5738,11 +4347,7 @@ int16x4_t test_vqrdmulh_laneq_s16(int16x4_t a, int16x8_t v) {
// CHECK-LABEL: @test_vqrdmulhq_laneq_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v8i16.v8i16(<8 x i16> [[VQRDMULHQ_LANEQ_V]], <8 x i16> [[VQRDMULHQ_LANEQ_V1]], i32 7)
+// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v8i16.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[V:%.*]], i32 7)
// CHECK-NEXT: ret <8 x i16> [[VQRDMULHQ_LANEQ_V2]]
//
int16x8_t test_vqrdmulhq_laneq_s16(int16x8_t a, int16x8_t v) {
@@ -5751,11 +4356,7 @@ int16x8_t test_vqrdmulhq_laneq_s16(int16x8_t a, int16x8_t v) {
// CHECK-LABEL: @test_vqrdmulh_laneq_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[VQRDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[VQRDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK-NEXT: [[VQRDMULH_LANEQ_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v2i32.v4i32(<2 x i32> [[VQRDMULH_LANEQ_V]], <4 x i32> [[VQRDMULH_LANEQ_V1]], i32 3)
+// CHECK-NEXT: [[VQRDMULH_LANEQ_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v2i32.v4i32(<2 x i32> [[A:%.*]], <4 x i32> [[V:%.*]], i32 3)
// CHECK-NEXT: ret <2 x i32> [[VQRDMULH_LANEQ_V2]]
//
int32x2_t test_vqrdmulh_laneq_s32(int32x2_t a, int32x4_t v) {
@@ -5764,11 +4365,7 @@ int32x2_t test_vqrdmulh_laneq_s32(int32x2_t a, int32x4_t v) {
// CHECK-LABEL: @test_vqrdmulhq_laneq_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v4i32.v4i32(<4 x i32> [[VQRDMULHQ_LANEQ_V]], <4 x i32> [[VQRDMULHQ_LANEQ_V1]], i32 3)
+// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v4i32.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[V:%.*]], i32 3)
// CHECK-NEXT: ret <4 x i32> [[VQRDMULHQ_LANEQ_V2]]
//
int32x4_t test_vqrdmulhq_laneq_s32(int32x4_t a, int32x4_t v) {
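
A note on the pattern in the regenerated checks above: with instcombine added
to the opt pipeline in the RUN lines (visible in the neon-extract.c RUN line
change below), the round-trip bitcasts emitted around the lane intrinsics fold
away, and the lane-splat shuffles are canonicalized to take poison as their
unused second operand. A minimal sketch of the fold (the function name is made
up for illustration, not taken from the patch):

  ; before instcombine (hypothetical example, mirroring the old CHECK lines)
  define <4 x i16> @splat_lane0(<4 x i16> %v) {
    %0 = bitcast <4 x i16> %v to <8 x i8>   ; round-trip through <8 x i8> ...
    %1 = bitcast <8 x i8> %0 to <4 x i16>   ; ... back to the source type
    %lane = shufflevector <4 x i16> %1, <4 x i16> %1, <4 x i32> zeroinitializer
    ret <4 x i16> %lane
  }

Running `opt -S -passes=instcombine` on this reduces it to

  ; after instcombine: bitcast pair gone, unused shuffle operand is poison
  define <4 x i16> @splat_lane0(<4 x i16> %v) {
    %lane = shufflevector <4 x i16> %v, <4 x i16> poison, <4 x i32> zeroinitializer
    ret <4 x i16> %lane
  }

which is the shape the updated CHECK lines expect. The same pipeline change is
what lets trivial shuffles fold entirely, e.g. the vext_s64 test below now
checks for a plain `ret <1 x i64> [[A]]`.
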
diff --git a/clang/test/CodeGen/AArch64/neon-extract.c b/clang/test/CodeGen/AArch64/neon-extract.c
index e5699f813131fa..61312cfdc02b78 100644
--- a/clang/test/CodeGen/AArch64/neon-extract.c
+++ b/clang/test/CodeGen/AArch64/neon-extract.c
@@ -1,246 +1,244 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
-// RUN: -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine | FileCheck %s
// REQUIRES: aarch64-registered-target
#include <arm_neon.h>
-// CHECK-LABEL: define{{.*}} <8 x i8> @test_vext_s8(<8 x i8> noundef %a, <8 x i8> noundef %b) #0 {
-// CHECK: [[VEXT:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
-// CHECK: ret <8 x i8> [[VEXT]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vext_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
+// CHECK-NEXT: ret <8 x i8> [[VEXT]]
+//
int8x8_t test_vext_s8(int8x8_t a, int8x8_t b) {
return vext_s8(a, b, 2);
}
-// CHECK-LABEL: define{{.*}} <4 x i16> @test_vext_s16(<4 x i16> noundef %a, <4 x i16> noundef %b) #0 {
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VEXT:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-// CHECK: ret <4 x i16> [[VEXT]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vext_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+// CHECK-NEXT: ret <4 x i16> [[VEXT]]
+//
int16x4_t test_vext_s16(int16x4_t a, int16x4_t b) {
return vext_s16(a, b, 3);
}
-// CHECK-LABEL: define{{.*}} <2 x i32> @test_vext_s32(<2 x i32> noundef %a, <2 x i32> noundef %b) #0 {
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VEXT:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> <i32 1, i32 2>
-// CHECK: ret <2 x i32> [[VEXT]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vext_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> <i32 1, i32 2>
+// CHECK-NEXT: ret <2 x i32> [[VEXT]]
+//
int32x2_t test_vext_s32(int32x2_t a, int32x2_t b) {
return vext_s32(a, b, 1);
}
-// CHECK-LABEL: define{{.*}} <1 x i64> @test_vext_s64(<1 x i64> noundef %a, <1 x i64> noundef %b) #0 {
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[VEXT:%.*]] = shufflevector <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
-// CHECK: ret <1 x i64> [[VEXT]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vext_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <1 x i64> [[A]]
+//
int64x1_t test_vext_s64(int64x1_t a, int64x1_t b) {
return vext_s64(a, b, 0);
}
-// CHECK-LABEL: define{{.*}} <16 x i8> @test_vextq_s8(<16 x i8> noundef %a, <16 x i8> noundef %b) #0 {
-// CHECK: [[VEXT:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17>
-// CHECK: ret <16 x i8> [[VEXT]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vextq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17>
+// CHECK-NEXT: ret <16 x i8> [[VEXT]]
+//
int8x16_t test_vextq_s8(int8x16_t a, int8x16_t b) {
return vextq_s8(a, b, 2);
}
-// CHECK-LABEL: define{{.*}} <8 x i16> @test_vextq_s16(<8 x i16> noundef %a, <8 x i16> noundef %b) #0 {
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VEXT:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
-// CHECK: ret <8 x i16> [[VEXT]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vextq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
+// CHECK-NEXT: ret <8 x i16> [[VEXT]]
+//
int16x8_t test_vextq_s16(int16x8_t a, int16x8_t b) {
return vextq_s16(a, b, 3);
}
-// CHECK-LABEL: define{{.*}} <4 x i32> @test_vextq_s32(<4 x i32> noundef %a, <4 x i32> noundef %b) #0 {
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VEXT:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> <i32 1, i32 2, i32 3, i32 4>
-// CHECK: ret <4 x i32> [[VEXT]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vextq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 2, i32 3, i32 4>
+// CHECK-NEXT: ret <4 x i32> [[VEXT]]
+//
int32x4_t test_vextq_s32(int32x4_t a, int32x4_t b) {
return vextq_s32(a, b, 1);
}
-// CHECK-LABEL: define{{.*}} <2 x i64> @test_vextq_s64(<2 x i64> noundef %a, <2 x i64> noundef %b) #0 {
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[VEXT:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i32> <i32 1, i32 2>
-// CHECK: ret <2 x i64> [[VEXT]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vextq_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> <i32 1, i32 2>
+// CHECK-NEXT: ret <2 x i64> [[VEXT]]
+//
int64x2_t test_vextq_s64(int64x2_t a, int64x2_t b) {
return vextq_s64(a, b, 1);
}
-// CHECK-LABEL: define{{.*}} <8 x i8> @test_vext_u8(<8 x i8> noundef %a, <8 x i8> noundef %b) #0 {
-// CHECK: [[VEXT:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
-// CHECK: ret <8 x i8> [[VEXT]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vext_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
+// CHECK-NEXT: ret <8 x i8> [[VEXT]]
+//
uint8x8_t test_vext_u8(uint8x8_t a, uint8x8_t b) {
return vext_u8(a, b, 2);
}
-// CHECK-LABEL: define{{.*}} <4 x i16> @test_vext_u16(<4 x i16> noundef %a, <4 x i16> noundef %b) #0 {
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VEXT:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-// CHECK: ret <4 x i16> [[VEXT]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vext_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+// CHECK-NEXT: ret <4 x i16> [[VEXT]]
+//
uint16x4_t test_vext_u16(uint16x4_t a, uint16x4_t b) {
return vext_u16(a, b, 3);
}
-// CHECK-LABEL: define{{.*}} <2 x i32> @test_vext_u32(<2 x i32> noundef %a, <2 x i32> noundef %b) #0 {
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VEXT:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> <i32 1, i32 2>
-// CHECK: ret <2 x i32> [[VEXT]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vext_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> <i32 1, i32 2>
+// CHECK-NEXT: ret <2 x i32> [[VEXT]]
+//
uint32x2_t test_vext_u32(uint32x2_t a, uint32x2_t b) {
return vext_u32(a, b, 1);
}
-// CHECK-LABEL: define{{.*}} <1 x i64> @test_vext_u64(<1 x i64> noundef %a, <1 x i64> noundef %b) #0 {
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[VEXT:%.*]] = shufflevector <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
-// CHECK: ret <1 x i64> [[VEXT]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vext_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <1 x i64> [[A]]
+//
uint64x1_t test_vext_u64(uint64x1_t a, uint64x1_t b) {
return vext_u64(a, b, 0);
}
-// CHECK-LABEL: define{{.*}} <16 x i8> @test_vextq_u8(<16 x i8> noundef %a, <16 x i8> noundef %b) #0 {
-// CHECK: [[VEXT:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17>
-// CHECK: ret <16 x i8> [[VEXT]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vextq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17>
+// CHECK-NEXT: ret <16 x i8> [[VEXT]]
+//
uint8x16_t test_vextq_u8(uint8x16_t a, uint8x16_t b) {
return vextq_u8(a, b, 2);
}
-// CHECK-LABEL: define{{.*}} <8 x i16> @test_vextq_u16(<8 x i16> noundef %a, <8 x i16> noundef %b) #0 {
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VEXT:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
-// CHECK: ret <8 x i16> [[VEXT]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vextq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
+// CHECK-NEXT: ret <8 x i16> [[VEXT]]
+//
uint16x8_t test_vextq_u16(uint16x8_t a, uint16x8_t b) {
return vextq_u16(a, b, 3);
}
-// CHECK-LABEL: define{{.*}} <4 x i32> @test_vextq_u32(<4 x i32> noundef %a, <4 x i32> noundef %b) #0 {
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VEXT:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> <i32 1, i32 2, i32 3, i32 4>
-// CHECK: ret <4 x i32> [[VEXT]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vextq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 2, i32 3, i32 4>
+// CHECK-NEXT: ret <4 x i32> [[VEXT]]
+//
uint32x4_t test_vextq_u32(uint32x4_t a, uint32x4_t b) {
return vextq_u32(a, b, 1);
}
-// CHECK-LABEL: define{{.*}} <2 x i64> @test_vextq_u64(<2 x i64> noundef %a, <2 x i64> noundef %b) #0 {
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[VEXT:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i32> <i32 1, i32 2>
-// CHECK: ret <2 x i64> [[VEXT]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vextq_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> <i32 1, i32 2>
+// CHECK-NEXT: ret <2 x i64> [[VEXT]]
+//
uint64x2_t test_vextq_u64(uint64x2_t a, uint64x2_t b) {
return vextq_u64(a, b, 1);
}
-// CHECK-LABEL: define{{.*}} <2 x float> @test_vext_f32(<2 x float> noundef %a, <2 x float> noundef %b) #0 {
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
-// CHECK: [[VEXT:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP3]], <2 x i32> <i32 1, i32 2>
-// CHECK: ret <2 x float> [[VEXT]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vext_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[B]], <2 x i32> <i32 1, i32 2>
+// CHECK-NEXT: ret <2 x float> [[VEXT]]
+//
float32x2_t test_vext_f32(float32x2_t a, float32x2_t b) {
return vext_f32(a, b, 1);
}
-// CHECK-LABEL: define{{.*}} <1 x double> @test_vext_f64(<1 x double> noundef %a, <1 x double> noundef %b) #0 {
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
-// CHECK: [[VEXT:%.*]] = shufflevector <1 x double> [[TMP2]], <1 x double> [[TMP3]], <1 x i32> zeroinitializer
-// CHECK: ret <1 x double> [[VEXT]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vext_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <1 x double> [[A]]
+//
float64x1_t test_vext_f64(float64x1_t a, float64x1_t b) {
return vext_f64(a, b, 0);
}
-// CHECK-LABEL: define{{.*}} <4 x float> @test_vextq_f32(<4 x float> noundef %a, <4 x float> noundef %b) #0 {
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
-// CHECK: [[VEXT:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP3]], <4 x i32> <i32 1, i32 2, i32 3, i32 4>
-// CHECK: ret <4 x float> [[VEXT]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vextq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 2, i32 3, i32 4>
+// CHECK-NEXT: ret <4 x float> [[VEXT]]
+//
float32x4_t test_vextq_f32(float32x4_t a, float32x4_t b) {
return vextq_f32(a, b, 1);
}
-// CHECK-LABEL: define{{.*}} <2 x double> @test_vextq_f64(<2 x double> noundef %a, <2 x double> noundef %b) #0 {
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
-// CHECK: [[VEXT:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP3]], <2 x i32> <i32 1, i32 2>
-// CHECK: ret <2 x double> [[VEXT]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vextq_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> <i32 1, i32 2>
+// CHECK-NEXT: ret <2 x double> [[VEXT]]
+//
float64x2_t test_vextq_f64(float64x2_t a, float64x2_t b) {
return vextq_f64(a, b, 1);
}
-// CHECK-LABEL: define{{.*}} <8 x i8> @test_vext_p8(<8 x i8> noundef %a, <8 x i8> noundef %b) #0 {
-// CHECK: [[VEXT:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
-// CHECK: ret <8 x i8> [[VEXT]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vext_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
+// CHECK-NEXT: ret <8 x i8> [[VEXT]]
+//
poly8x8_t test_vext_p8(poly8x8_t a, poly8x8_t b) {
return vext_p8(a, b, 2);
}
-// CHECK-LABEL: define{{.*}} <4 x i16> @test_vext_p16(<4 x i16> noundef %a, <4 x i16> noundef %b) #0 {
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VEXT:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-// CHECK: ret <4 x i16> [[VEXT]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vext_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+// CHECK-NEXT: ret <4 x i16> [[VEXT]]
+//
poly16x4_t test_vext_p16(poly16x4_t a, poly16x4_t b) {
return vext_p16(a, b, 3);
}
-// CHECK-LABEL: define{{.*}} <16 x i8> @test_vextq_p8(<16 x i8> noundef %a, <16 x i8> noundef %b) #0 {
-// CHECK: [[VEXT:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17>
-// CHECK: ret <16 x i8> [[VEXT]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vextq_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17>
+// CHECK-NEXT: ret <16 x i8> [[VEXT]]
+//
poly8x16_t test_vextq_p8(poly8x16_t a, poly8x16_t b) {
return vextq_p8(a, b, 2);
}
-// CHECK-LABEL: define{{.*}} <8 x i16> @test_vextq_p16(<8 x i16> noundef %a, <8 x i16> noundef %b) #0 {
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VEXT:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
-// CHECK: ret <8 x i16> [[VEXT]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vextq_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
+// CHECK-NEXT: ret <8 x i16> [[VEXT]]
+//
poly16x8_t test_vextq_p16(poly16x8_t a, poly16x8_t b) {
return vextq_p16(a, b, 3);
}
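
For context on the regenerated checks above: vext concatenates its two input vectors and extracts a contiguous window starting at the immediate lane, which clang models as a single shufflevector whose mask begins at that lane (mask <3, 4, 5, 6> for vext_s16(a, b, 3)); with the redundant bitcasts gone, a lane-0 vext of a one-element vector now folds to returning the first operand outright, as test_vext_s64 and test_vext_f64 show. A minimal self-contained sketch of the semantics, assuming an AArch64 toolchain; the helper name and expected output are illustrative, not part of the patch:

  #include <arm_neon.h>
  #include <stdio.h>

  /* vext_s16(a, b, 3) yields lanes {a[3], b[0], b[1], b[2]},
     i.e. the shufflevector mask <3, 4, 5, 6> in the checks above. */
  static int16x4_t ext3(int16x4_t a, int16x4_t b) {
    return vext_s16(a, b, 3);
  }

  int main(void) {
    const int16_t av[4] = {0, 1, 2, 3}, bv[4] = {4, 5, 6, 7};
    int16_t out[4];
    vst1_s16(out, ext3(vld1_s16(av), vld1_s16(bv)));
    printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]); /* 3 4 5 6 */
    return 0;
  }
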
diff --git a/clang/test/CodeGen/AArch64/neon-fma.c b/clang/test/CodeGen/AArch64/neon-fma.c
index b87c531b8b231c..37c6aa88683056 100644
--- a/clang/test/CodeGen/AArch64/neon-fma.c
+++ b/clang/test/CodeGen/AArch64/neon-fma.c
@@ -1,5 +1,5 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature
-// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine | FileCheck %s
// REQUIRES: aarch64-registered-target || arm-registered-target
@@ -8,8 +8,8 @@
// CHECK-LABEL: define {{[^@]+}}@test_vmla_n_f32
// CHECK-SAME: (<2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]], float noundef [[C:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x float> poison, float [[C]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float [[C]], i32 1
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x float> poison, float [[C]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x float> [[VECINIT_I]], <2 x float> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[MUL_I:%.*]] = fmul <2 x float> [[B]], [[VECINIT1_I]]
// CHECK-NEXT: [[ADD_I:%.*]] = fadd <2 x float> [[A]], [[MUL_I]]
// CHECK-NEXT: ret <2 x float> [[ADD_I]]
@@ -21,10 +21,8 @@ float32x2_t test_vmla_n_f32(float32x2_t a, float32x2_t b, float32_t c) {
// CHECK-LABEL: define {{[^@]+}}@test_vmlaq_n_f32
// CHECK-SAME: (<4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]], float noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float [[C]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float [[C]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float [[C]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float [[C]], i32 3
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float [[C]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x float> [[VECINIT_I]], <4 x float> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL_I:%.*]] = fmul <4 x float> [[B]], [[VECINIT3_I]]
// CHECK-NEXT: [[ADD_I:%.*]] = fadd <4 x float> [[A]], [[MUL_I]]
// CHECK-NEXT: ret <4 x float> [[ADD_I]]
@@ -36,10 +34,8 @@ float32x4_t test_vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c) {
// CHECK-LABEL: define {{[^@]+}}@test_vmlsq_n_f32
// CHECK-SAME: (<4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]], float noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float [[C]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float [[C]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float [[C]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float [[C]], i32 3
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float [[C]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x float> [[VECINIT_I]], <4 x float> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL_I:%.*]] = fmul <4 x float> [[B]], [[VECINIT3_I]]
// CHECK-NEXT: [[SUB_I:%.*]] = fsub <4 x float> [[A]], [[MUL_I]]
// CHECK-NEXT: ret <4 x float> [[SUB_I]]
@@ -51,8 +47,8 @@ float32x4_t test_vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c) {
// CHECK-LABEL: define {{[^@]+}}@test_vmls_n_f32
// CHECK-SAME: (<2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]], float noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x float> poison, float [[C]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float [[C]], i32 1
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x float> poison, float [[C]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x float> [[VECINIT_I]], <2 x float> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[MUL_I:%.*]] = fmul <2 x float> [[B]], [[VECINIT1_I]]
// CHECK-NEXT: [[SUB_I:%.*]] = fsub <2 x float> [[A]], [[MUL_I]]
// CHECK-NEXT: ret <2 x float> [[SUB_I]]
@@ -64,9 +60,7 @@ float32x2_t test_vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c) {
// CHECK-LABEL: define {{[^@]+}}@test_vmla_lane_f32_0
// CHECK-SAME: (<2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]], <2 x float> noundef [[V:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[V]], <2 x float> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[B]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = fadd <2 x float> [[A]], [[MUL]]
// CHECK-NEXT: ret <2 x float> [[ADD]]
@@ -78,9 +72,7 @@ float32x2_t test_vmla_lane_f32_0(float32x2_t a, float32x2_t b, float32x2_t v) {
// CHECK-LABEL: define {{[^@]+}}@test_vmlaq_lane_f32_0
// CHECK-SAME: (<4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]], <2 x float> noundef [[V:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[V]], <2 x float> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = fmul <4 x float> [[B]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = fadd <4 x float> [[A]], [[MUL]]
// CHECK-NEXT: ret <4 x float> [[ADD]]
@@ -92,9 +84,7 @@ float32x4_t test_vmlaq_lane_f32_0(float32x4_t a, float32x4_t b, float32x2_t v) {
// CHECK-LABEL: define {{[^@]+}}@test_vmla_laneq_f32_0
// CHECK-SAME: (<2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]], <4 x float> noundef [[V:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[V]], <4 x float> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[B]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = fadd <2 x float> [[A]], [[MUL]]
// CHECK-NEXT: ret <2 x float> [[ADD]]
@@ -106,9 +96,7 @@ float32x2_t test_vmla_laneq_f32_0(float32x2_t a, float32x2_t b, float32x4_t v) {
// CHECK-LABEL: define {{[^@]+}}@test_vmlaq_laneq_f32_0
// CHECK-SAME: (<4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]], <4 x float> noundef [[V:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[V]], <4 x float> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = fmul <4 x float> [[B]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = fadd <4 x float> [[A]], [[MUL]]
// CHECK-NEXT: ret <4 x float> [[ADD]]
@@ -120,9 +108,7 @@ float32x4_t test_vmlaq_laneq_f32_0(float32x4_t a, float32x4_t b, float32x4_t v)
// CHECK-LABEL: define {{[^@]+}}@test_vmls_lane_f32_0
// CHECK-SAME: (<2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]], <2 x float> noundef [[V:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[V]], <2 x float> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[B]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = fsub <2 x float> [[A]], [[MUL]]
// CHECK-NEXT: ret <2 x float> [[SUB]]
@@ -134,9 +120,7 @@ float32x2_t test_vmls_lane_f32_0(float32x2_t a, float32x2_t b, float32x2_t v) {
// CHECK-LABEL: define {{[^@]+}}@test_vmlsq_lane_f32_0
// CHECK-SAME: (<4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]], <2 x float> noundef [[V:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[V]], <2 x float> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = fmul <4 x float> [[B]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = fsub <4 x float> [[A]], [[MUL]]
// CHECK-NEXT: ret <4 x float> [[SUB]]
@@ -148,9 +132,7 @@ float32x4_t test_vmlsq_lane_f32_0(float32x4_t a, float32x4_t b, float32x2_t v) {
// CHECK-LABEL: define {{[^@]+}}@test_vmls_laneq_f32_0
// CHECK-SAME: (<2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]], <4 x float> noundef [[V:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[V]], <4 x float> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[B]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = fsub <2 x float> [[A]], [[MUL]]
// CHECK-NEXT: ret <2 x float> [[SUB]]
@@ -162,9 +144,7 @@ float32x2_t test_vmls_laneq_f32_0(float32x2_t a, float32x2_t b, float32x4_t v) {
// CHECK-LABEL: define {{[^@]+}}@test_vmlsq_laneq_f32_0
// CHECK-SAME: (<4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]], <4 x float> noundef [[V:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[V]], <4 x float> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = fmul <4 x float> [[B]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = fsub <4 x float> [[A]], [[MUL]]
// CHECK-NEXT: ret <4 x float> [[SUB]]
@@ -176,9 +156,7 @@ float32x4_t test_vmlsq_laneq_f32_0(float32x4_t a, float32x4_t b, float32x4_t v)
// CHECK-LABEL: define {{[^@]+}}@test_vmla_lane_f32
// CHECK-SAME: (<2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]], <2 x float> noundef [[V:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[V]], <2 x float> poison, <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[B]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = fadd <2 x float> [[A]], [[MUL]]
// CHECK-NEXT: ret <2 x float> [[ADD]]
@@ -190,9 +168,7 @@ float32x2_t test_vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) {
// CHECK-LABEL: define {{[^@]+}}@test_vmlaq_lane_f32
// CHECK-SAME: (<4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]], <2 x float> noundef [[V:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[V]], <2 x float> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK-NEXT: [[MUL:%.*]] = fmul <4 x float> [[B]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = fadd <4 x float> [[A]], [[MUL]]
// CHECK-NEXT: ret <4 x float> [[ADD]]
@@ -204,9 +180,7 @@ float32x4_t test_vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) {
// CHECK-LABEL: define {{[^@]+}}@test_vmla_laneq_f32
// CHECK-SAME: (<2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]], <4 x float> noundef [[V:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> <i32 3, i32 3>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[V]], <4 x float> poison, <2 x i32> <i32 3, i32 3>
// CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[B]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = fadd <2 x float> [[A]], [[MUL]]
// CHECK-NEXT: ret <2 x float> [[ADD]]
@@ -218,9 +192,7 @@ float32x2_t test_vmla_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) {
// CHECK-LABEL: define {{[^@]+}}@test_vmlaq_laneq_f32
// CHECK-SAME: (<4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]], <4 x float> noundef [[V:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[V]], <4 x float> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[MUL:%.*]] = fmul <4 x float> [[B]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = fadd <4 x float> [[A]], [[MUL]]
// CHECK-NEXT: ret <4 x float> [[ADD]]
@@ -232,9 +204,7 @@ float32x4_t test_vmlaq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) {
// CHECK-LABEL: define {{[^@]+}}@test_vmls_lane_f32
// CHECK-SAME: (<2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]], <2 x float> noundef [[V:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[V]], <2 x float> poison, <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[B]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = fsub <2 x float> [[A]], [[MUL]]
// CHECK-NEXT: ret <2 x float> [[SUB]]
@@ -246,9 +216,7 @@ float32x2_t test_vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) {
// CHECK-LABEL: define {{[^@]+}}@test_vmlsq_lane_f32
// CHECK-SAME: (<4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]], <2 x float> noundef [[V:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[V]], <2 x float> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK-NEXT: [[MUL:%.*]] = fmul <4 x float> [[B]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = fsub <4 x float> [[A]], [[MUL]]
// CHECK-NEXT: ret <4 x float> [[SUB]]
@@ -259,9 +227,7 @@ float32x4_t test_vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) {
// CHECK-LABEL: define {{[^@]+}}@test_vmls_laneq_f32
// CHECK-SAME: (<2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]], <4 x float> noundef [[V:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> <i32 3, i32 3>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[V]], <4 x float> poison, <2 x i32> <i32 3, i32 3>
// CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[B]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = fsub <2 x float> [[A]], [[MUL]]
// CHECK-NEXT: ret <2 x float> [[SUB]]
@@ -273,9 +239,7 @@ float32x2_t test_vmls_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) {
// CHECK-LABEL: define {{[^@]+}}@test_vmlsq_laneq_f32
// CHECK-SAME: (<4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]], <4 x float> noundef [[V:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[V]], <4 x float> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[MUL:%.*]] = fmul <4 x float> [[B]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = fsub <4 x float> [[A]], [[MUL]]
// CHECK-NEXT: ret <4 x float> [[SUB]]
@@ -287,13 +251,10 @@ float32x4_t test_vmlsq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) {
// CHECK-LABEL: define {{[^@]+}}@test_vfmaq_n_f64
// CHECK-SAME: (<2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]], double noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x double> poison, double [[C]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double [[C]], i32 1
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[B]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[VECINIT1_I]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[B]], <2 x double> [[VECINIT1_I]], <2 x double> [[A]])
-// CHECK-NEXT: ret <2 x double> [[TMP3]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x double> poison, double [[C]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x double> [[VECINIT_I]], <2 x double> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[B]], <2 x double> [[VECINIT1_I]], <2 x double> [[A]])
+// CHECK-NEXT: ret <2 x double> [[TMP0]]
//
float64x2_t test_vfmaq_n_f64(float64x2_t a, float64x2_t b, float64_t c) {
return vfmaq_n_f64(a, b, c);
@@ -303,13 +264,10 @@ float64x2_t test_vfmaq_n_f64(float64x2_t a, float64x2_t b, float64_t c) {
// CHECK-SAME: (<2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]], double noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[FNEG_I:%.*]] = fneg <2 x double> [[B]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x double> poison, double [[C]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double [[C]], i32 1
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[FNEG_I]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[VECINIT1_I]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[FNEG_I]], <2 x double> [[VECINIT1_I]], <2 x double> [[A]])
-// CHECK-NEXT: ret <2 x double> [[TMP3]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x double> poison, double [[C]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x double> [[VECINIT_I]], <2 x double> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[FNEG_I]], <2 x double> [[VECINIT1_I]], <2 x double> [[A]])
+// CHECK-NEXT: ret <2 x double> [[TMP0]]
//
float64x2_t test_vfmsq_n_f64(float64x2_t a, float64x2_t b, float64_t c) {
return vfmsq_n_f64(a, b, c);
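
The neon-fma.c churn above is a canonicalization effect of adding instcombine to the RUN line rather than a behavioural change: the -O0 broadcast of a scalar into every lane (a chain of insertelement instructions) becomes one insertelement into lane 0 followed by a zero-mask shufflevector, and the dead bitcasts around the llvm.fma call disappear. A hedged illustration of the _n_ (broadcast) idiom these tests exercise, again with invented names and assuming an AArch64 target:

  #include <arm_neon.h>
  #include <stdio.h>

  /* vmlaq_n_f32(a, b, c) computes a + b * dup(c); the dup(c) part is the
     insertelement + zero-mask shufflevector pair in the updated checks. */
  static float32x4_t mla_n(float32x4_t a, float32x4_t b, float c) {
    return vmlaq_n_f32(a, b, c);
  }

  int main(void) {
    const float av[4] = {1, 1, 1, 1}, bv[4] = {1, 2, 3, 4};
    float out[4];
    vst1q_f32(out, mla_n(vld1q_f32(av), vld1q_f32(bv), 2.0f));
    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]); /* 3 5 7 9 */
    return 0;
  }
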
diff --git a/clang/test/CodeGen/AArch64/neon-fp16fml.c b/clang/test/CodeGen/AArch64/neon-fp16fml.c
index 976045d6e79f3d..d7687ddca2888c 100644
--- a/clang/test/CodeGen/AArch64/neon-fp16fml.c
+++ b/clang/test/CodeGen/AArch64/neon-fp16fml.c
@@ -1,6 +1,6 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +v8.2a -target-feature +neon -target-feature +fp16fml \
-// RUN: -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine | FileCheck %s
// REQUIRES: aarch64-registered-target
@@ -12,10 +12,7 @@
// CHECK-LABEL: @test_vfmlal_low_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[C:%.*]] to <8 x i8>
-// CHECK-NEXT: [[VFMLAL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[C]])
+// CHECK-NEXT: [[VFMLAL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal.v2f32.v4f16(<2 x float> [[A:%.*]], <4 x half> [[B:%.*]], <4 x half> [[C:%.*]])
// CHECK-NEXT: ret <2 x float> [[VFMLAL_LOW3_I]]
//
float32x2_t test_vfmlal_low_f16(float32x2_t a, float16x4_t b, float16x4_t c) {
@@ -24,10 +21,7 @@ float32x2_t test_vfmlal_low_f16(float32x2_t a, float16x4_t b, float16x4_t c) {
// CHECK-LABEL: @test_vfmlsl_low_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[C:%.*]] to <8 x i8>
-// CHECK-NEXT: [[VFMLSL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[C]])
+// CHECK-NEXT: [[VFMLSL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl.v2f32.v4f16(<2 x float> [[A:%.*]], <4 x half> [[B:%.*]], <4 x half> [[C:%.*]])
// CHECK-NEXT: ret <2 x float> [[VFMLSL_LOW3_I]]
//
float32x2_t test_vfmlsl_low_f16(float32x2_t a, float16x4_t b, float16x4_t c) {
@@ -36,10 +30,7 @@ float32x2_t test_vfmlsl_low_f16(float32x2_t a, float16x4_t b, float16x4_t c) {
// CHECK-LABEL: @test_vfmlal_high_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[C:%.*]] to <8 x i8>
-// CHECK-NEXT: [[VFMLAL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal2.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[C]])
+// CHECK-NEXT: [[VFMLAL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal2.v2f32.v4f16(<2 x float> [[A:%.*]], <4 x half> [[B:%.*]], <4 x half> [[C:%.*]])
// CHECK-NEXT: ret <2 x float> [[VFMLAL_HIGH3_I]]
//
float32x2_t test_vfmlal_high_f16(float32x2_t a, float16x4_t b, float16x4_t c) {
@@ -48,10 +39,7 @@ float32x2_t test_vfmlal_high_f16(float32x2_t a, float16x4_t b, float16x4_t c) {
// CHECK-LABEL: @test_vfmlsl_high_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[C:%.*]] to <8 x i8>
-// CHECK-NEXT: [[VFMLSL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl2.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[C]])
+// CHECK-NEXT: [[VFMLSL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl2.v2f32.v4f16(<2 x float> [[A:%.*]], <4 x half> [[B:%.*]], <4 x half> [[C:%.*]])
// CHECK-NEXT: ret <2 x float> [[VFMLSL_HIGH3_I]]
//
float32x2_t test_vfmlsl_high_f16(float32x2_t a, float16x4_t b, float16x4_t c) {
@@ -60,10 +48,7 @@ float32x2_t test_vfmlsl_high_f16(float32x2_t a, float16x4_t b, float16x4_t c) {
// CHECK-LABEL: @test_vfmlalq_low_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[C:%.*]] to <16 x i8>
-// CHECK-NEXT: [[VFMLAL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[C]])
+// CHECK-NEXT: [[VFMLAL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal.v4f32.v8f16(<4 x float> [[A:%.*]], <8 x half> [[B:%.*]], <8 x half> [[C:%.*]])
// CHECK-NEXT: ret <4 x float> [[VFMLAL_LOW3_I]]
//
float32x4_t test_vfmlalq_low_f16(float32x4_t a, float16x8_t b, float16x8_t c) {
@@ -72,10 +57,7 @@ float32x4_t test_vfmlalq_low_f16(float32x4_t a, float16x8_t b, float16x8_t c) {
// CHECK-LABEL: @test_vfmlslq_low_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[C:%.*]] to <16 x i8>
-// CHECK-NEXT: [[VFMLSL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[C]])
+// CHECK-NEXT: [[VFMLSL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl.v4f32.v8f16(<4 x float> [[A:%.*]], <8 x half> [[B:%.*]], <8 x half> [[C:%.*]])
// CHECK-NEXT: ret <4 x float> [[VFMLSL_LOW3_I]]
//
float32x4_t test_vfmlslq_low_f16(float32x4_t a, float16x8_t b, float16x8_t c) {
@@ -84,10 +66,7 @@ float32x4_t test_vfmlslq_low_f16(float32x4_t a, float16x8_t b, float16x8_t c) {
// CHECK-LABEL: @test_vfmlalq_high_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[C:%.*]] to <16 x i8>
-// CHECK-NEXT: [[VFMLAL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal2.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[C]])
+// CHECK-NEXT: [[VFMLAL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal2.v4f32.v8f16(<4 x float> [[A:%.*]], <8 x half> [[B:%.*]], <8 x half> [[C:%.*]])
// CHECK-NEXT: ret <4 x float> [[VFMLAL_HIGH3_I]]
//
float32x4_t test_vfmlalq_high_f16(float32x4_t a, float16x8_t b, float16x8_t c) {
@@ -96,10 +75,7 @@ float32x4_t test_vfmlalq_high_f16(float32x4_t a, float16x8_t b, float16x8_t c) {
// CHECK-LABEL: @test_vfmlslq_high_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[C:%.*]] to <16 x i8>
-// CHECK-NEXT: [[VFMLSL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl2.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[C]])
+// CHECK-NEXT: [[VFMLSL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl2.v4f32.v8f16(<4 x float> [[A:%.*]], <8 x half> [[B:%.*]], <8 x half> [[C:%.*]])
// CHECK-NEXT: ret <4 x float> [[VFMLSL_HIGH3_I]]
//
float32x4_t test_vfmlslq_high_f16(float32x4_t a, float16x8_t b, float16x8_t c) {
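
The lane-variant hunks below show the biggest cleanup in this file: at -O0 clang materialises the broadcast half-precision operand through per-lane alloca/store/load round-trips (the __REINT_* temporaries), and instcombine collapses the whole sequence into one shufflevector feeding the fmlal/fmlsl intrinsic. A compile-only sketch of the idiom under test, assuming a toolchain with the fp16fml extension enabled (e.g. -march=armv8.2-a+fp16fml); the wrapper name is mine:

  #include <arm_neon.h>

  /* Widening fp16 multiply-accumulate: lane 0 of c is broadcast across a
     <4 x half> vector, which after instcombine is a single zero-mask
     shufflevector rather than the alloca round-trips deleted below. */
  float32x2_t fmlal_lane0(float32x2_t acc, float16x4_t b, float16x4_t c) {
    return vfmlal_lane_low_f16(acc, b, c, 0);
  }
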
@@ -110,42 +86,8 @@ float32x4_t test_vfmlslq_high_f16(float32x4_t a, float16x8_t b, float16x8_t c) {
// CHECK-LABEL: @test_vfmlal_lane_low_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[__REINT_847:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT: [[__REINT1_847:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_8474:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT: [[__REINT1_8475:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_84714:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT: [[__REINT1_84715:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_84724:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT: [[__REINT1_84725:%.*]] = alloca i16, align 2
-// CHECK-NEXT: store <4 x half> [[C:%.*]], ptr [[__REINT_847]], align 8
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__REINT_847]], align 8
-// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP0]], i32 0
-// CHECK-NEXT: store i16 [[VGET_LANE]], ptr [[__REINT1_847]], align 2
-// CHECK-NEXT: [[TMP1:%.*]] = load half, ptr [[__REINT1_847]], align 2
-// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[TMP1]], i32 0
-// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_8474]], align 8
-// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[__REINT_8474]], align 8
-// CHECK-NEXT: [[VGET_LANE8:%.*]] = extractelement <4 x i16> [[TMP2]], i32 0
-// CHECK-NEXT: store i16 [[VGET_LANE8]], ptr [[__REINT1_8475]], align 2
-// CHECK-NEXT: [[TMP3:%.*]] = load half, ptr [[__REINT1_8475]], align 2
-// CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP3]], i32 1
-// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84714]], align 8
-// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i16>, ptr [[__REINT_84714]], align 8
-// CHECK-NEXT: [[VGET_LANE18:%.*]] = extractelement <4 x i16> [[TMP4]], i32 0
-// CHECK-NEXT: store i16 [[VGET_LANE18]], ptr [[__REINT1_84715]], align 2
-// CHECK-NEXT: [[TMP5:%.*]] = load half, ptr [[__REINT1_84715]], align 2
-// CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <4 x half> [[VECINIT11]], half [[TMP5]], i32 2
-// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84724]], align 8
-// CHECK-NEXT: [[TMP6:%.*]] = load <4 x i16>, ptr [[__REINT_84724]], align 8
-// CHECK-NEXT: [[VGET_LANE28:%.*]] = extractelement <4 x i16> [[TMP6]], i32 0
-// CHECK-NEXT: store i16 [[VGET_LANE28]], ptr [[__REINT1_84725]], align 2
-// CHECK-NEXT: [[TMP7:%.*]] = load half, ptr [[__REINT1_84725]], align 2
-// CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <4 x half> [[VECINIT21]], half [[TMP7]], i32 3
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x half> [[VECINIT31]] to <8 x i8>
-// CHECK-NEXT: [[VFMLAL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]])
+// CHECK-NEXT: [[VECINIT31:%.*]] = shufflevector <4 x half> [[C:%.*]], <4 x half> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[VFMLAL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal.v2f32.v4f16(<2 x float> [[A:%.*]], <4 x half> [[B:%.*]], <4 x half> [[VECINIT31]])
// CHECK-NEXT: ret <2 x float> [[VFMLAL_LOW3_I]]
//
float32x2_t test_vfmlal_lane_low_f16(float32x2_t a, float16x4_t b, float16x4_t c) {
@@ -154,42 +96,8 @@ float32x2_t test_vfmlal_lane_low_f16(float32x2_t a, float16x4_t b, float16x4_t c
// CHECK-LABEL: @test_vfmlal_lane_high_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[__REINT_847:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT: [[__REINT1_847:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_8474:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT: [[__REINT1_8475:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_84714:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT: [[__REINT1_84715:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_84724:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT: [[__REINT1_84725:%.*]] = alloca i16, align 2
-// CHECK-NEXT: store <4 x half> [[C:%.*]], ptr [[__REINT_847]], align 8
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__REINT_847]], align 8
-// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP0]], i32 1
-// CHECK-NEXT: store i16 [[VGET_LANE]], ptr [[__REINT1_847]], align 2
-// CHECK-NEXT: [[TMP1:%.*]] = load half, ptr [[__REINT1_847]], align 2
-// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[TMP1]], i32 0
-// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_8474]], align 8
-// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[__REINT_8474]], align 8
-// CHECK-NEXT: [[VGET_LANE8:%.*]] = extractelement <4 x i16> [[TMP2]], i32 1
-// CHECK-NEXT: store i16 [[VGET_LANE8]], ptr [[__REINT1_8475]], align 2
-// CHECK-NEXT: [[TMP3:%.*]] = load half, ptr [[__REINT1_8475]], align 2
-// CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP3]], i32 1
-// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84714]], align 8
-// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i16>, ptr [[__REINT_84714]], align 8
-// CHECK-NEXT: [[VGET_LANE18:%.*]] = extractelement <4 x i16> [[TMP4]], i32 1
-// CHECK-NEXT: store i16 [[VGET_LANE18]], ptr [[__REINT1_84715]], align 2
-// CHECK-NEXT: [[TMP5:%.*]] = load half, ptr [[__REINT1_84715]], align 2
-// CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <4 x half> [[VECINIT11]], half [[TMP5]], i32 2
-// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84724]], align 8
-// CHECK-NEXT: [[TMP6:%.*]] = load <4 x i16>, ptr [[__REINT_84724]], align 8
-// CHECK-NEXT: [[VGET_LANE28:%.*]] = extractelement <4 x i16> [[TMP6]], i32 1
-// CHECK-NEXT: store i16 [[VGET_LANE28]], ptr [[__REINT1_84725]], align 2
-// CHECK-NEXT: [[TMP7:%.*]] = load half, ptr [[__REINT1_84725]], align 2
-// CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <4 x half> [[VECINIT21]], half [[TMP7]], i32 3
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x half> [[VECINIT31]] to <8 x i8>
-// CHECK-NEXT: [[VFMLAL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal2.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]])
+// CHECK-NEXT: [[VECINIT31:%.*]] = shufflevector <4 x half> [[C:%.*]], <4 x half> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK-NEXT: [[VFMLAL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal2.v2f32.v4f16(<2 x float> [[A:%.*]], <4 x half> [[B:%.*]], <4 x half> [[VECINIT31]])
// CHECK-NEXT: ret <2 x float> [[VFMLAL_HIGH3_I]]
//
float32x2_t test_vfmlal_lane_high_f16(float32x2_t a, float16x4_t b, float16x4_t c) {
@@ -198,74 +106,8 @@ float32x2_t test_vfmlal_lane_high_f16(float32x2_t a, float16x4_t b, float16x4_t
// CHECK-LABEL: @test_vfmlalq_lane_low_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[__REINT_847:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT: [[__REINT1_847:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_8474:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT: [[__REINT1_8475:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_84714:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT: [[__REINT1_84715:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_84724:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT: [[__REINT1_84725:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_84734:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT: [[__REINT1_84735:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_84744:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT: [[__REINT1_84745:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_84754:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT: [[__REINT1_84755:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_84764:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT: [[__REINT1_84765:%.*]] = alloca i16, align 2
-// CHECK-NEXT: store <4 x half> [[C:%.*]], ptr [[__REINT_847]], align 8
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__REINT_847]], align 8
-// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP0]], i32 2
-// CHECK-NEXT: store i16 [[VGET_LANE]], ptr [[__REINT1_847]], align 2
-// CHECK-NEXT: [[TMP1:%.*]] = load half, ptr [[__REINT1_847]], align 2
-// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[TMP1]], i32 0
-// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_8474]], align 8
-// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[__REINT_8474]], align 8
-// CHECK-NEXT: [[VGET_LANE8:%.*]] = extractelement <4 x i16> [[TMP2]], i32 2
-// CHECK-NEXT: store i16 [[VGET_LANE8]], ptr [[__REINT1_8475]], align 2
-// CHECK-NEXT: [[TMP3:%.*]] = load half, ptr [[__REINT1_8475]], align 2
-// CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP3]], i32 1
-// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84714]], align 8
-// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i16>, ptr [[__REINT_84714]], align 8
-// CHECK-NEXT: [[VGET_LANE18:%.*]] = extractelement <4 x i16> [[TMP4]], i32 2
-// CHECK-NEXT: store i16 [[VGET_LANE18]], ptr [[__REINT1_84715]], align 2
-// CHECK-NEXT: [[TMP5:%.*]] = load half, ptr [[__REINT1_84715]], align 2
-// CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <8 x half> [[VECINIT11]], half [[TMP5]], i32 2
-// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84724]], align 8
-// CHECK-NEXT: [[TMP6:%.*]] = load <4 x i16>, ptr [[__REINT_84724]], align 8
-// CHECK-NEXT: [[VGET_LANE28:%.*]] = extractelement <4 x i16> [[TMP6]], i32 2
-// CHECK-NEXT: store i16 [[VGET_LANE28]], ptr [[__REINT1_84725]], align 2
-// CHECK-NEXT: [[TMP7:%.*]] = load half, ptr [[__REINT1_84725]], align 2
-// CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <8 x half> [[VECINIT21]], half [[TMP7]], i32 3
-// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84734]], align 8
-// CHECK-NEXT: [[TMP8:%.*]] = load <4 x i16>, ptr [[__REINT_84734]], align 8
-// CHECK-NEXT: [[VGET_LANE38:%.*]] = extractelement <4 x i16> [[TMP8]], i32 2
-// CHECK-NEXT: store i16 [[VGET_LANE38]], ptr [[__REINT1_84735]], align 2
-// CHECK-NEXT: [[TMP9:%.*]] = load half, ptr [[__REINT1_84735]], align 2
-// CHECK-NEXT: [[VECINIT41:%.*]] = insertelement <8 x half> [[VECINIT31]], half [[TMP9]], i32 4
-// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84744]], align 8
-// CHECK-NEXT: [[TMP10:%.*]] = load <4 x i16>, ptr [[__REINT_84744]], align 8
-// CHECK-NEXT: [[VGET_LANE48:%.*]] = extractelement <4 x i16> [[TMP10]], i32 2
-// CHECK-NEXT: store i16 [[VGET_LANE48]], ptr [[__REINT1_84745]], align 2
-// CHECK-NEXT: [[TMP11:%.*]] = load half, ptr [[__REINT1_84745]], align 2
-// CHECK-NEXT: [[VECINIT51:%.*]] = insertelement <8 x half> [[VECINIT41]], half [[TMP11]], i32 5
-// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84754]], align 8
-// CHECK-NEXT: [[TMP12:%.*]] = load <4 x i16>, ptr [[__REINT_84754]], align 8
-// CHECK-NEXT: [[VGET_LANE58:%.*]] = extractelement <4 x i16> [[TMP12]], i32 2
-// CHECK-NEXT: store i16 [[VGET_LANE58]], ptr [[__REINT1_84755]], align 2
-// CHECK-NEXT: [[TMP13:%.*]] = load half, ptr [[__REINT1_84755]], align 2
-// CHECK-NEXT: [[VECINIT61:%.*]] = insertelement <8 x half> [[VECINIT51]], half [[TMP13]], i32 6
-// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84764]], align 8
-// CHECK-NEXT: [[TMP14:%.*]] = load <4 x i16>, ptr [[__REINT_84764]], align 8
-// CHECK-NEXT: [[VGET_LANE68:%.*]] = extractelement <4 x i16> [[TMP14]], i32 2
-// CHECK-NEXT: store i16 [[VGET_LANE68]], ptr [[__REINT1_84765]], align 2
-// CHECK-NEXT: [[TMP15:%.*]] = load half, ptr [[__REINT1_84765]], align 2
-// CHECK-NEXT: [[VECINIT71:%.*]] = insertelement <8 x half> [[VECINIT61]], half [[TMP15]], i32 7
-// CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP17:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP18:%.*]] = bitcast <8 x half> [[VECINIT71]] to <16 x i8>
-// CHECK-NEXT: [[VFMLAL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]])
+// CHECK-NEXT: [[VECINIT71:%.*]] = shufflevector <4 x half> [[C:%.*]], <4 x half> poison, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+// CHECK-NEXT: [[VFMLAL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal.v4f32.v8f16(<4 x float> [[A:%.*]], <8 x half> [[B:%.*]], <8 x half> [[VECINIT71]])
// CHECK-NEXT: ret <4 x float> [[VFMLAL_LOW3_I]]
//
float32x4_t test_vfmlalq_lane_low_f16(float32x4_t a, float16x8_t b, float16x4_t c) {
@@ -274,74 +116,8 @@ float32x4_t test_vfmlalq_lane_low_f16(float32x4_t a, float16x8_t b, float16x4_t
// CHECK-LABEL: @test_vfmlalq_lane_high_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[__REINT_847:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT: [[__REINT1_847:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_8474:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT: [[__REINT1_8475:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_84714:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT: [[__REINT1_84715:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_84724:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT: [[__REINT1_84725:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_84734:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT: [[__REINT1_84735:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_84744:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT: [[__REINT1_84745:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_84754:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT: [[__REINT1_84755:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_84764:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT: [[__REINT1_84765:%.*]] = alloca i16, align 2
-// CHECK-NEXT: store <4 x half> [[C:%.*]], ptr [[__REINT_847]], align 8
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__REINT_847]], align 8
-// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP0]], i32 3
-// CHECK-NEXT: store i16 [[VGET_LANE]], ptr [[__REINT1_847]], align 2
-// CHECK-NEXT: [[TMP1:%.*]] = load half, ptr [[__REINT1_847]], align 2
-// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[TMP1]], i32 0
-// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_8474]], align 8
-// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[__REINT_8474]], align 8
-// CHECK-NEXT: [[VGET_LANE8:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
-// CHECK-NEXT: store i16 [[VGET_LANE8]], ptr [[__REINT1_8475]], align 2
-// CHECK-NEXT: [[TMP3:%.*]] = load half, ptr [[__REINT1_8475]], align 2
-// CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP3]], i32 1
-// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84714]], align 8
-// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i16>, ptr [[__REINT_84714]], align 8
-// CHECK-NEXT: [[VGET_LANE18:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3
-// CHECK-NEXT: store i16 [[VGET_LANE18]], ptr [[__REINT1_84715]], align 2
-// CHECK-NEXT: [[TMP5:%.*]] = load half, ptr [[__REINT1_84715]], align 2
-// CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <8 x half> [[VECINIT11]], half [[TMP5]], i32 2
-// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84724]], align 8
-// CHECK-NEXT: [[TMP6:%.*]] = load <4 x i16>, ptr [[__REINT_84724]], align 8
-// CHECK-NEXT: [[VGET_LANE28:%.*]] = extractelement <4 x i16> [[TMP6]], i32 3
-// CHECK-NEXT: store i16 [[VGET_LANE28]], ptr [[__REINT1_84725]], align 2
-// CHECK-NEXT: [[TMP7:%.*]] = load half, ptr [[__REINT1_84725]], align 2
-// CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <8 x half> [[VECINIT21]], half [[TMP7]], i32 3
-// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84734]], align 8
-// CHECK-NEXT: [[TMP8:%.*]] = load <4 x i16>, ptr [[__REINT_84734]], align 8
-// CHECK-NEXT: [[VGET_LANE38:%.*]] = extractelement <4 x i16> [[TMP8]], i32 3
-// CHECK-NEXT: store i16 [[VGET_LANE38]], ptr [[__REINT1_84735]], align 2
-// CHECK-NEXT: [[TMP9:%.*]] = load half, ptr [[__REINT1_84735]], align 2
-// CHECK-NEXT: [[VECINIT41:%.*]] = insertelement <8 x half> [[VECINIT31]], half [[TMP9]], i32 4
-// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84744]], align 8
-// CHECK-NEXT: [[TMP10:%.*]] = load <4 x i16>, ptr [[__REINT_84744]], align 8
-// CHECK-NEXT: [[VGET_LANE48:%.*]] = extractelement <4 x i16> [[TMP10]], i32 3
-// CHECK-NEXT: store i16 [[VGET_LANE48]], ptr [[__REINT1_84745]], align 2
-// CHECK-NEXT: [[TMP11:%.*]] = load half, ptr [[__REINT1_84745]], align 2
-// CHECK-NEXT: [[VECINIT51:%.*]] = insertelement <8 x half> [[VECINIT41]], half [[TMP11]], i32 5
-// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84754]], align 8
-// CHECK-NEXT: [[TMP12:%.*]] = load <4 x i16>, ptr [[__REINT_84754]], align 8
-// CHECK-NEXT: [[VGET_LANE58:%.*]] = extractelement <4 x i16> [[TMP12]], i32 3
-// CHECK-NEXT: store i16 [[VGET_LANE58]], ptr [[__REINT1_84755]], align 2
-// CHECK-NEXT: [[TMP13:%.*]] = load half, ptr [[__REINT1_84755]], align 2
-// CHECK-NEXT: [[VECINIT61:%.*]] = insertelement <8 x half> [[VECINIT51]], half [[TMP13]], i32 6
-// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84764]], align 8
-// CHECK-NEXT: [[TMP14:%.*]] = load <4 x i16>, ptr [[__REINT_84764]], align 8
-// CHECK-NEXT: [[VGET_LANE68:%.*]] = extractelement <4 x i16> [[TMP14]], i32 3
-// CHECK-NEXT: store i16 [[VGET_LANE68]], ptr [[__REINT1_84765]], align 2
-// CHECK-NEXT: [[TMP15:%.*]] = load half, ptr [[__REINT1_84765]], align 2
-// CHECK-NEXT: [[VECINIT71:%.*]] = insertelement <8 x half> [[VECINIT61]], half [[TMP15]], i32 7
-// CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP17:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP18:%.*]] = bitcast <8 x half> [[VECINIT71]] to <16 x i8>
-// CHECK-NEXT: [[VFMLAL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal2.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]])
+// CHECK-NEXT: [[VECINIT71:%.*]] = shufflevector <4 x half> [[C:%.*]], <4 x half> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[VFMLAL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal2.v4f32.v8f16(<4 x float> [[A:%.*]], <8 x half> [[B:%.*]], <8 x half> [[VECINIT71]])
// CHECK-NEXT: ret <4 x float> [[VFMLAL_HIGH3_I]]
//
float32x4_t test_vfmlalq_lane_high_f16(float32x4_t a, float16x8_t b, float16x4_t c) {
@@ -350,42 +126,8 @@ float32x4_t test_vfmlalq_lane_high_f16(float32x4_t a, float16x8_t b, float16x4_t
// CHECK-LABEL: @test_vfmlal_laneq_low_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[__REINT_850:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT: [[__REINT1_850:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_8504:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT: [[__REINT1_8505:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_85014:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT: [[__REINT1_85015:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_85024:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT: [[__REINT1_85025:%.*]] = alloca i16, align 2
-// CHECK-NEXT: store <8 x half> [[C:%.*]], ptr [[__REINT_850]], align 16
-// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[__REINT_850]], align 16
-// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP0]], i32 4
-// CHECK-NEXT: store i16 [[VGETQ_LANE]], ptr [[__REINT1_850]], align 2
-// CHECK-NEXT: [[TMP1:%.*]] = load half, ptr [[__REINT1_850]], align 2
-// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[TMP1]], i32 0
-// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_8504]], align 16
-// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[__REINT_8504]], align 16
-// CHECK-NEXT: [[VGETQ_LANE8:%.*]] = extractelement <8 x i16> [[TMP2]], i32 4
-// CHECK-NEXT: store i16 [[VGETQ_LANE8]], ptr [[__REINT1_8505]], align 2
-// CHECK-NEXT: [[TMP3:%.*]] = load half, ptr [[__REINT1_8505]], align 2
-// CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP3]], i32 1
-// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85014]], align 16
-// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr [[__REINT_85014]], align 16
-// CHECK-NEXT: [[VGETQ_LANE18:%.*]] = extractelement <8 x i16> [[TMP4]], i32 4
-// CHECK-NEXT: store i16 [[VGETQ_LANE18]], ptr [[__REINT1_85015]], align 2
-// CHECK-NEXT: [[TMP5:%.*]] = load half, ptr [[__REINT1_85015]], align 2
-// CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <4 x half> [[VECINIT11]], half [[TMP5]], i32 2
-// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85024]], align 16
-// CHECK-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr [[__REINT_85024]], align 16
-// CHECK-NEXT: [[VGETQ_LANE28:%.*]] = extractelement <8 x i16> [[TMP6]], i32 4
-// CHECK-NEXT: store i16 [[VGETQ_LANE28]], ptr [[__REINT1_85025]], align 2
-// CHECK-NEXT: [[TMP7:%.*]] = load half, ptr [[__REINT1_85025]], align 2
-// CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <4 x half> [[VECINIT21]], half [[TMP7]], i32 3
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x half> [[VECINIT31]] to <8 x i8>
-// CHECK-NEXT: [[VFMLAL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]])
+// CHECK-NEXT: [[VECINIT31:%.*]] = shufflevector <8 x half> [[C:%.*]], <8 x half> poison, <4 x i32> <i32 4, i32 4, i32 4, i32 4>
+// CHECK-NEXT: [[VFMLAL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal.v2f32.v4f16(<2 x float> [[A:%.*]], <4 x half> [[B:%.*]], <4 x half> [[VECINIT31]])
// CHECK-NEXT: ret <2 x float> [[VFMLAL_LOW3_I]]
//
float32x2_t test_vfmlal_laneq_low_f16(float32x2_t a, float16x4_t b, float16x8_t c) {
@@ -394,42 +136,8 @@ float32x2_t test_vfmlal_laneq_low_f16(float32x2_t a, float16x4_t b, float16x8_t
// CHECK-LABEL: @test_vfmlal_laneq_high_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[__REINT_850:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT: [[__REINT1_850:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_8504:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT: [[__REINT1_8505:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_85014:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT: [[__REINT1_85015:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_85024:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT: [[__REINT1_85025:%.*]] = alloca i16, align 2
-// CHECK-NEXT: store <8 x half> [[C:%.*]], ptr [[__REINT_850]], align 16
-// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[__REINT_850]], align 16
-// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP0]], i32 5
-// CHECK-NEXT: store i16 [[VGETQ_LANE]], ptr [[__REINT1_850]], align 2
-// CHECK-NEXT: [[TMP1:%.*]] = load half, ptr [[__REINT1_850]], align 2
-// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[TMP1]], i32 0
-// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_8504]], align 16
-// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[__REINT_8504]], align 16
-// CHECK-NEXT: [[VGETQ_LANE8:%.*]] = extractelement <8 x i16> [[TMP2]], i32 5
-// CHECK-NEXT: store i16 [[VGETQ_LANE8]], ptr [[__REINT1_8505]], align 2
-// CHECK-NEXT: [[TMP3:%.*]] = load half, ptr [[__REINT1_8505]], align 2
-// CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP3]], i32 1
-// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85014]], align 16
-// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr [[__REINT_85014]], align 16
-// CHECK-NEXT: [[VGETQ_LANE18:%.*]] = extractelement <8 x i16> [[TMP4]], i32 5
-// CHECK-NEXT: store i16 [[VGETQ_LANE18]], ptr [[__REINT1_85015]], align 2
-// CHECK-NEXT: [[TMP5:%.*]] = load half, ptr [[__REINT1_85015]], align 2
-// CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <4 x half> [[VECINIT11]], half [[TMP5]], i32 2
-// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85024]], align 16
-// CHECK-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr [[__REINT_85024]], align 16
-// CHECK-NEXT: [[VGETQ_LANE28:%.*]] = extractelement <8 x i16> [[TMP6]], i32 5
-// CHECK-NEXT: store i16 [[VGETQ_LANE28]], ptr [[__REINT1_85025]], align 2
-// CHECK-NEXT: [[TMP7:%.*]] = load half, ptr [[__REINT1_85025]], align 2
-// CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <4 x half> [[VECINIT21]], half [[TMP7]], i32 3
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x half> [[VECINIT31]] to <8 x i8>
-// CHECK-NEXT: [[VFMLAL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal2.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]])
+// CHECK-NEXT: [[VECINIT31:%.*]] = shufflevector <8 x half> [[C:%.*]], <8 x half> poison, <4 x i32> <i32 5, i32 5, i32 5, i32 5>
+// CHECK-NEXT: [[VFMLAL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal2.v2f32.v4f16(<2 x float> [[A:%.*]], <4 x half> [[B:%.*]], <4 x half> [[VECINIT31]])
// CHECK-NEXT: ret <2 x float> [[VFMLAL_HIGH3_I]]
//
float32x2_t test_vfmlal_laneq_high_f16(float32x2_t a, float16x4_t b, float16x8_t c) {
@@ -438,74 +146,8 @@ float32x2_t test_vfmlal_laneq_high_f16(float32x2_t a, float16x4_t b, float16x8_t
// CHECK-LABEL: @test_vfmlalq_laneq_low_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[__REINT_850:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT: [[__REINT1_850:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_8504:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT: [[__REINT1_8505:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_85014:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT: [[__REINT1_85015:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_85024:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT: [[__REINT1_85025:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_85034:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT: [[__REINT1_85035:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_85044:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT: [[__REINT1_85045:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_85054:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT: [[__REINT1_85055:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_85064:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT: [[__REINT1_85065:%.*]] = alloca i16, align 2
-// CHECK-NEXT: store <8 x half> [[C:%.*]], ptr [[__REINT_850]], align 16
-// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[__REINT_850]], align 16
-// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP0]], i32 6
-// CHECK-NEXT: store i16 [[VGETQ_LANE]], ptr [[__REINT1_850]], align 2
-// CHECK-NEXT: [[TMP1:%.*]] = load half, ptr [[__REINT1_850]], align 2
-// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[TMP1]], i32 0
-// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_8504]], align 16
-// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[__REINT_8504]], align 16
-// CHECK-NEXT: [[VGETQ_LANE8:%.*]] = extractelement <8 x i16> [[TMP2]], i32 6
-// CHECK-NEXT: store i16 [[VGETQ_LANE8]], ptr [[__REINT1_8505]], align 2
-// CHECK-NEXT: [[TMP3:%.*]] = load half, ptr [[__REINT1_8505]], align 2
-// CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP3]], i32 1
-// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85014]], align 16
-// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr [[__REINT_85014]], align 16
-// CHECK-NEXT: [[VGETQ_LANE18:%.*]] = extractelement <8 x i16> [[TMP4]], i32 6
-// CHECK-NEXT: store i16 [[VGETQ_LANE18]], ptr [[__REINT1_85015]], align 2
-// CHECK-NEXT: [[TMP5:%.*]] = load half, ptr [[__REINT1_85015]], align 2
-// CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <8 x half> [[VECINIT11]], half [[TMP5]], i32 2
-// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85024]], align 16
-// CHECK-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr [[__REINT_85024]], align 16
-// CHECK-NEXT: [[VGETQ_LANE28:%.*]] = extractelement <8 x i16> [[TMP6]], i32 6
-// CHECK-NEXT: store i16 [[VGETQ_LANE28]], ptr [[__REINT1_85025]], align 2
-// CHECK-NEXT: [[TMP7:%.*]] = load half, ptr [[__REINT1_85025]], align 2
-// CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <8 x half> [[VECINIT21]], half [[TMP7]], i32 3
-// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85034]], align 16
-// CHECK-NEXT: [[TMP8:%.*]] = load <8 x i16>, ptr [[__REINT_85034]], align 16
-// CHECK-NEXT: [[VGETQ_LANE38:%.*]] = extractelement <8 x i16> [[TMP8]], i32 6
-// CHECK-NEXT: store i16 [[VGETQ_LANE38]], ptr [[__REINT1_85035]], align 2
-// CHECK-NEXT: [[TMP9:%.*]] = load half, ptr [[__REINT1_85035]], align 2
-// CHECK-NEXT: [[VECINIT41:%.*]] = insertelement <8 x half> [[VECINIT31]], half [[TMP9]], i32 4
-// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85044]], align 16
-// CHECK-NEXT: [[TMP10:%.*]] = load <8 x i16>, ptr [[__REINT_85044]], align 16
-// CHECK-NEXT: [[VGETQ_LANE48:%.*]] = extractelement <8 x i16> [[TMP10]], i32 6
-// CHECK-NEXT: store i16 [[VGETQ_LANE48]], ptr [[__REINT1_85045]], align 2
-// CHECK-NEXT: [[TMP11:%.*]] = load half, ptr [[__REINT1_85045]], align 2
-// CHECK-NEXT: [[VECINIT51:%.*]] = insertelement <8 x half> [[VECINIT41]], half [[TMP11]], i32 5
-// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85054]], align 16
-// CHECK-NEXT: [[TMP12:%.*]] = load <8 x i16>, ptr [[__REINT_85054]], align 16
-// CHECK-NEXT: [[VGETQ_LANE58:%.*]] = extractelement <8 x i16> [[TMP12]], i32 6
-// CHECK-NEXT: store i16 [[VGETQ_LANE58]], ptr [[__REINT1_85055]], align 2
-// CHECK-NEXT: [[TMP13:%.*]] = load half, ptr [[__REINT1_85055]], align 2
-// CHECK-NEXT: [[VECINIT61:%.*]] = insertelement <8 x half> [[VECINIT51]], half [[TMP13]], i32 6
-// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85064]], align 16
-// CHECK-NEXT: [[TMP14:%.*]] = load <8 x i16>, ptr [[__REINT_85064]], align 16
-// CHECK-NEXT: [[VGETQ_LANE68:%.*]] = extractelement <8 x i16> [[TMP14]], i32 6
-// CHECK-NEXT: store i16 [[VGETQ_LANE68]], ptr [[__REINT1_85065]], align 2
-// CHECK-NEXT: [[TMP15:%.*]] = load half, ptr [[__REINT1_85065]], align 2
-// CHECK-NEXT: [[VECINIT71:%.*]] = insertelement <8 x half> [[VECINIT61]], half [[TMP15]], i32 7
-// CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP17:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP18:%.*]] = bitcast <8 x half> [[VECINIT71]] to <16 x i8>
-// CHECK-NEXT: [[VFMLAL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]])
+// CHECK-NEXT: [[VECINIT71:%.*]] = shufflevector <8 x half> [[C:%.*]], <8 x half> poison, <8 x i32> <i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6>
+// CHECK-NEXT: [[VFMLAL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal.v4f32.v8f16(<4 x float> [[A:%.*]], <8 x half> [[B:%.*]], <8 x half> [[VECINIT71]])
// CHECK-NEXT: ret <4 x float> [[VFMLAL_LOW3_I]]
//
float32x4_t test_vfmlalq_laneq_low_f16(float32x4_t a, float16x8_t b, float16x8_t c) {
@@ -514,74 +156,8 @@ float32x4_t test_vfmlalq_laneq_low_f16(float32x4_t a, float16x8_t b, float16x8_t
// CHECK-LABEL: @test_vfmlalq_laneq_high_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[__REINT_850:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT: [[__REINT1_850:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_8504:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT: [[__REINT1_8505:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_85014:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT: [[__REINT1_85015:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_85024:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT: [[__REINT1_85025:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_85034:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT: [[__REINT1_85035:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_85044:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT: [[__REINT1_85045:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_85054:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT: [[__REINT1_85055:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_85064:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT: [[__REINT1_85065:%.*]] = alloca i16, align 2
-// CHECK-NEXT: store <8 x half> [[C:%.*]], ptr [[__REINT_850]], align 16
-// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[__REINT_850]], align 16
-// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP0]], i32 7
-// CHECK-NEXT: store i16 [[VGETQ_LANE]], ptr [[__REINT1_850]], align 2
-// CHECK-NEXT: [[TMP1:%.*]] = load half, ptr [[__REINT1_850]], align 2
-// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[TMP1]], i32 0
-// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_8504]], align 16
-// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[__REINT_8504]], align 16
-// CHECK-NEXT: [[VGETQ_LANE8:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7
-// CHECK-NEXT: store i16 [[VGETQ_LANE8]], ptr [[__REINT1_8505]], align 2
-// CHECK-NEXT: [[TMP3:%.*]] = load half, ptr [[__REINT1_8505]], align 2
-// CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP3]], i32 1
-// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85014]], align 16
-// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr [[__REINT_85014]], align 16
-// CHECK-NEXT: [[VGETQ_LANE18:%.*]] = extractelement <8 x i16> [[TMP4]], i32 7
-// CHECK-NEXT: store i16 [[VGETQ_LANE18]], ptr [[__REINT1_85015]], align 2
-// CHECK-NEXT: [[TMP5:%.*]] = load half, ptr [[__REINT1_85015]], align 2
-// CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <8 x half> [[VECINIT11]], half [[TMP5]], i32 2
-// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85024]], align 16
-// CHECK-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr [[__REINT_85024]], align 16
-// CHECK-NEXT: [[VGETQ_LANE28:%.*]] = extractelement <8 x i16> [[TMP6]], i32 7
-// CHECK-NEXT: store i16 [[VGETQ_LANE28]], ptr [[__REINT1_85025]], align 2
-// CHECK-NEXT: [[TMP7:%.*]] = load half, ptr [[__REINT1_85025]], align 2
-// CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <8 x half> [[VECINIT21]], half [[TMP7]], i32 3
-// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85034]], align 16
-// CHECK-NEXT: [[TMP8:%.*]] = load <8 x i16>, ptr [[__REINT_85034]], align 16
-// CHECK-NEXT: [[VGETQ_LANE38:%.*]] = extractelement <8 x i16> [[TMP8]], i32 7
-// CHECK-NEXT: store i16 [[VGETQ_LANE38]], ptr [[__REINT1_85035]], align 2
-// CHECK-NEXT: [[TMP9:%.*]] = load half, ptr [[__REINT1_85035]], align 2
-// CHECK-NEXT: [[VECINIT41:%.*]] = insertelement <8 x half> [[VECINIT31]], half [[TMP9]], i32 4
-// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85044]], align 16
-// CHECK-NEXT: [[TMP10:%.*]] = load <8 x i16>, ptr [[__REINT_85044]], align 16
-// CHECK-NEXT: [[VGETQ_LANE48:%.*]] = extractelement <8 x i16> [[TMP10]], i32 7
-// CHECK-NEXT: store i16 [[VGETQ_LANE48]], ptr [[__REINT1_85045]], align 2
-// CHECK-NEXT: [[TMP11:%.*]] = load half, ptr [[__REINT1_85045]], align 2
-// CHECK-NEXT: [[VECINIT51:%.*]] = insertelement <8 x half> [[VECINIT41]], half [[TMP11]], i32 5
-// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85054]], align 16
-// CHECK-NEXT: [[TMP12:%.*]] = load <8 x i16>, ptr [[__REINT_85054]], align 16
-// CHECK-NEXT: [[VGETQ_LANE58:%.*]] = extractelement <8 x i16> [[TMP12]], i32 7
-// CHECK-NEXT: store i16 [[VGETQ_LANE58]], ptr [[__REINT1_85055]], align 2
-// CHECK-NEXT: [[TMP13:%.*]] = load half, ptr [[__REINT1_85055]], align 2
-// CHECK-NEXT: [[VECINIT61:%.*]] = insertelement <8 x half> [[VECINIT51]], half [[TMP13]], i32 6
-// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85064]], align 16
-// CHECK-NEXT: [[TMP14:%.*]] = load <8 x i16>, ptr [[__REINT_85064]], align 16
-// CHECK-NEXT: [[VGETQ_LANE68:%.*]] = extractelement <8 x i16> [[TMP14]], i32 7
-// CHECK-NEXT: store i16 [[VGETQ_LANE68]], ptr [[__REINT1_85065]], align 2
-// CHECK-NEXT: [[TMP15:%.*]] = load half, ptr [[__REINT1_85065]], align 2
-// CHECK-NEXT: [[VECINIT71:%.*]] = insertelement <8 x half> [[VECINIT61]], half [[TMP15]], i32 7
-// CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP17:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP18:%.*]] = bitcast <8 x half> [[VECINIT71]] to <16 x i8>
-// CHECK-NEXT: [[VFMLAL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal2.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]])
+// CHECK-NEXT: [[VECINIT71:%.*]] = shufflevector <8 x half> [[C:%.*]], <8 x half> poison, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT: [[VFMLAL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal2.v4f32.v8f16(<4 x float> [[A:%.*]], <8 x half> [[B:%.*]], <8 x half> [[VECINIT71]])
// CHECK-NEXT: ret <4 x float> [[VFMLAL_HIGH3_I]]
//
float32x4_t test_vfmlalq_laneq_high_f16(float32x4_t a, float16x8_t b, float16x8_t c) {
@@ -590,42 +166,8 @@ float32x4_t test_vfmlalq_laneq_high_f16(float32x4_t a, float16x8_t b, float16x8_
// CHECK-LABEL: @test_vfmlsl_lane_low_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[__REINT_847:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT: [[__REINT1_847:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_8474:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT: [[__REINT1_8475:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_84714:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT: [[__REINT1_84715:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_84724:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT: [[__REINT1_84725:%.*]] = alloca i16, align 2
-// CHECK-NEXT: store <4 x half> [[C:%.*]], ptr [[__REINT_847]], align 8
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__REINT_847]], align 8
-// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP0]], i32 0
-// CHECK-NEXT: store i16 [[VGET_LANE]], ptr [[__REINT1_847]], align 2
-// CHECK-NEXT: [[TMP1:%.*]] = load half, ptr [[__REINT1_847]], align 2
-// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[TMP1]], i32 0
-// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_8474]], align 8
-// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[__REINT_8474]], align 8
-// CHECK-NEXT: [[VGET_LANE8:%.*]] = extractelement <4 x i16> [[TMP2]], i32 0
-// CHECK-NEXT: store i16 [[VGET_LANE8]], ptr [[__REINT1_8475]], align 2
-// CHECK-NEXT: [[TMP3:%.*]] = load half, ptr [[__REINT1_8475]], align 2
-// CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP3]], i32 1
-// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84714]], align 8
-// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i16>, ptr [[__REINT_84714]], align 8
-// CHECK-NEXT: [[VGET_LANE18:%.*]] = extractelement <4 x i16> [[TMP4]], i32 0
-// CHECK-NEXT: store i16 [[VGET_LANE18]], ptr [[__REINT1_84715]], align 2
-// CHECK-NEXT: [[TMP5:%.*]] = load half, ptr [[__REINT1_84715]], align 2
-// CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <4 x half> [[VECINIT11]], half [[TMP5]], i32 2
-// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84724]], align 8
-// CHECK-NEXT: [[TMP6:%.*]] = load <4 x i16>, ptr [[__REINT_84724]], align 8
-// CHECK-NEXT: [[VGET_LANE28:%.*]] = extractelement <4 x i16> [[TMP6]], i32 0
-// CHECK-NEXT: store i16 [[VGET_LANE28]], ptr [[__REINT1_84725]], align 2
-// CHECK-NEXT: [[TMP7:%.*]] = load half, ptr [[__REINT1_84725]], align 2
-// CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <4 x half> [[VECINIT21]], half [[TMP7]], i32 3
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x half> [[VECINIT31]] to <8 x i8>
-// CHECK-NEXT: [[VFMLSL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]])
+// CHECK-NEXT: [[VECINIT31:%.*]] = shufflevector <4 x half> [[C:%.*]], <4 x half> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[VFMLSL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl.v2f32.v4f16(<2 x float> [[A:%.*]], <4 x half> [[B:%.*]], <4 x half> [[VECINIT31]])
// CHECK-NEXT: ret <2 x float> [[VFMLSL_LOW3_I]]
//
float32x2_t test_vfmlsl_lane_low_f16(float32x2_t a, float16x4_t b, float16x4_t c) {
@@ -634,42 +176,8 @@ float32x2_t test_vfmlsl_lane_low_f16(float32x2_t a, float16x4_t b, float16x4_t c
// CHECK-LABEL: @test_vfmlsl_lane_high_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[__REINT_847:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT: [[__REINT1_847:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_8474:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT: [[__REINT1_8475:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_84714:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT: [[__REINT1_84715:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_84724:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT: [[__REINT1_84725:%.*]] = alloca i16, align 2
-// CHECK-NEXT: store <4 x half> [[C:%.*]], ptr [[__REINT_847]], align 8
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__REINT_847]], align 8
-// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP0]], i32 1
-// CHECK-NEXT: store i16 [[VGET_LANE]], ptr [[__REINT1_847]], align 2
-// CHECK-NEXT: [[TMP1:%.*]] = load half, ptr [[__REINT1_847]], align 2
-// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[TMP1]], i32 0
-// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_8474]], align 8
-// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[__REINT_8474]], align 8
-// CHECK-NEXT: [[VGET_LANE8:%.*]] = extractelement <4 x i16> [[TMP2]], i32 1
-// CHECK-NEXT: store i16 [[VGET_LANE8]], ptr [[__REINT1_8475]], align 2
-// CHECK-NEXT: [[TMP3:%.*]] = load half, ptr [[__REINT1_8475]], align 2
-// CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP3]], i32 1
-// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84714]], align 8
-// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i16>, ptr [[__REINT_84714]], align 8
-// CHECK-NEXT: [[VGET_LANE18:%.*]] = extractelement <4 x i16> [[TMP4]], i32 1
-// CHECK-NEXT: store i16 [[VGET_LANE18]], ptr [[__REINT1_84715]], align 2
-// CHECK-NEXT: [[TMP5:%.*]] = load half, ptr [[__REINT1_84715]], align 2
-// CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <4 x half> [[VECINIT11]], half [[TMP5]], i32 2
-// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84724]], align 8
-// CHECK-NEXT: [[TMP6:%.*]] = load <4 x i16>, ptr [[__REINT_84724]], align 8
-// CHECK-NEXT: [[VGET_LANE28:%.*]] = extractelement <4 x i16> [[TMP6]], i32 1
-// CHECK-NEXT: store i16 [[VGET_LANE28]], ptr [[__REINT1_84725]], align 2
-// CHECK-NEXT: [[TMP7:%.*]] = load half, ptr [[__REINT1_84725]], align 2
-// CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <4 x half> [[VECINIT21]], half [[TMP7]], i32 3
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x half> [[VECINIT31]] to <8 x i8>
-// CHECK-NEXT: [[VFMLSL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl2.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]])
+// CHECK-NEXT: [[VECINIT31:%.*]] = shufflevector <4 x half> [[C:%.*]], <4 x half> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK-NEXT: [[VFMLSL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl2.v2f32.v4f16(<2 x float> [[A:%.*]], <4 x half> [[B:%.*]], <4 x half> [[VECINIT31]])
// CHECK-NEXT: ret <2 x float> [[VFMLSL_HIGH3_I]]
//
float32x2_t test_vfmlsl_lane_high_f16(float32x2_t a, float16x4_t b, float16x4_t c) {
@@ -678,74 +186,8 @@ float32x2_t test_vfmlsl_lane_high_f16(float32x2_t a, float16x4_t b, float16x4_t
// CHECK-LABEL: @test_vfmlslq_lane_low_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[__REINT_847:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT: [[__REINT1_847:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_8474:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT: [[__REINT1_8475:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_84714:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT: [[__REINT1_84715:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_84724:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT: [[__REINT1_84725:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_84734:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT: [[__REINT1_84735:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_84744:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT: [[__REINT1_84745:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_84754:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT: [[__REINT1_84755:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_84764:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT: [[__REINT1_84765:%.*]] = alloca i16, align 2
-// CHECK-NEXT: store <4 x half> [[C:%.*]], ptr [[__REINT_847]], align 8
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__REINT_847]], align 8
-// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP0]], i32 2
-// CHECK-NEXT: store i16 [[VGET_LANE]], ptr [[__REINT1_847]], align 2
-// CHECK-NEXT: [[TMP1:%.*]] = load half, ptr [[__REINT1_847]], align 2
-// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[TMP1]], i32 0
-// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_8474]], align 8
-// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[__REINT_8474]], align 8
-// CHECK-NEXT: [[VGET_LANE8:%.*]] = extractelement <4 x i16> [[TMP2]], i32 2
-// CHECK-NEXT: store i16 [[VGET_LANE8]], ptr [[__REINT1_8475]], align 2
-// CHECK-NEXT: [[TMP3:%.*]] = load half, ptr [[__REINT1_8475]], align 2
-// CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP3]], i32 1
-// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84714]], align 8
-// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i16>, ptr [[__REINT_84714]], align 8
-// CHECK-NEXT: [[VGET_LANE18:%.*]] = extractelement <4 x i16> [[TMP4]], i32 2
-// CHECK-NEXT: store i16 [[VGET_LANE18]], ptr [[__REINT1_84715]], align 2
-// CHECK-NEXT: [[TMP5:%.*]] = load half, ptr [[__REINT1_84715]], align 2
-// CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <8 x half> [[VECINIT11]], half [[TMP5]], i32 2
-// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84724]], align 8
-// CHECK-NEXT: [[TMP6:%.*]] = load <4 x i16>, ptr [[__REINT_84724]], align 8
-// CHECK-NEXT: [[VGET_LANE28:%.*]] = extractelement <4 x i16> [[TMP6]], i32 2
-// CHECK-NEXT: store i16 [[VGET_LANE28]], ptr [[__REINT1_84725]], align 2
-// CHECK-NEXT: [[TMP7:%.*]] = load half, ptr [[__REINT1_84725]], align 2
-// CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <8 x half> [[VECINIT21]], half [[TMP7]], i32 3
-// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84734]], align 8
-// CHECK-NEXT: [[TMP8:%.*]] = load <4 x i16>, ptr [[__REINT_84734]], align 8
-// CHECK-NEXT: [[VGET_LANE38:%.*]] = extractelement <4 x i16> [[TMP8]], i32 2
-// CHECK-NEXT: store i16 [[VGET_LANE38]], ptr [[__REINT1_84735]], align 2
-// CHECK-NEXT: [[TMP9:%.*]] = load half, ptr [[__REINT1_84735]], align 2
-// CHECK-NEXT: [[VECINIT41:%.*]] = insertelement <8 x half> [[VECINIT31]], half [[TMP9]], i32 4
-// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84744]], align 8
-// CHECK-NEXT: [[TMP10:%.*]] = load <4 x i16>, ptr [[__REINT_84744]], align 8
-// CHECK-NEXT: [[VGET_LANE48:%.*]] = extractelement <4 x i16> [[TMP10]], i32 2
-// CHECK-NEXT: store i16 [[VGET_LANE48]], ptr [[__REINT1_84745]], align 2
-// CHECK-NEXT: [[TMP11:%.*]] = load half, ptr [[__REINT1_84745]], align 2
-// CHECK-NEXT: [[VECINIT51:%.*]] = insertelement <8 x half> [[VECINIT41]], half [[TMP11]], i32 5
-// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84754]], align 8
-// CHECK-NEXT: [[TMP12:%.*]] = load <4 x i16>, ptr [[__REINT_84754]], align 8
-// CHECK-NEXT: [[VGET_LANE58:%.*]] = extractelement <4 x i16> [[TMP12]], i32 2
-// CHECK-NEXT: store i16 [[VGET_LANE58]], ptr [[__REINT1_84755]], align 2
-// CHECK-NEXT: [[TMP13:%.*]] = load half, ptr [[__REINT1_84755]], align 2
-// CHECK-NEXT: [[VECINIT61:%.*]] = insertelement <8 x half> [[VECINIT51]], half [[TMP13]], i32 6
-// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84764]], align 8
-// CHECK-NEXT: [[TMP14:%.*]] = load <4 x i16>, ptr [[__REINT_84764]], align 8
-// CHECK-NEXT: [[VGET_LANE68:%.*]] = extractelement <4 x i16> [[TMP14]], i32 2
-// CHECK-NEXT: store i16 [[VGET_LANE68]], ptr [[__REINT1_84765]], align 2
-// CHECK-NEXT: [[TMP15:%.*]] = load half, ptr [[__REINT1_84765]], align 2
-// CHECK-NEXT: [[VECINIT71:%.*]] = insertelement <8 x half> [[VECINIT61]], half [[TMP15]], i32 7
-// CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP17:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP18:%.*]] = bitcast <8 x half> [[VECINIT71]] to <16 x i8>
-// CHECK-NEXT: [[VFMLSL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]])
+// CHECK-NEXT: [[VECINIT71:%.*]] = shufflevector <4 x half> [[C:%.*]], <4 x half> poison, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+// CHECK-NEXT: [[VFMLSL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl.v4f32.v8f16(<4 x float> [[A:%.*]], <8 x half> [[B:%.*]], <8 x half> [[VECINIT71]])
// CHECK-NEXT: ret <4 x float> [[VFMLSL_LOW3_I]]
//
float32x4_t test_vfmlslq_lane_low_f16(float32x4_t a, float16x8_t b, float16x4_t c) {
@@ -754,74 +196,8 @@ float32x4_t test_vfmlslq_lane_low_f16(float32x4_t a, float16x8_t b, float16x4_t
// CHECK-LABEL: @test_vfmlslq_lane_high_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[__REINT_847:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT: [[__REINT1_847:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_8474:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT: [[__REINT1_8475:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_84714:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT: [[__REINT1_84715:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_84724:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT: [[__REINT1_84725:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_84734:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT: [[__REINT1_84735:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_84744:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT: [[__REINT1_84745:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_84754:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT: [[__REINT1_84755:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_84764:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT: [[__REINT1_84765:%.*]] = alloca i16, align 2
-// CHECK-NEXT: store <4 x half> [[C:%.*]], ptr [[__REINT_847]], align 8
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__REINT_847]], align 8
-// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP0]], i32 3
-// CHECK-NEXT: store i16 [[VGET_LANE]], ptr [[__REINT1_847]], align 2
-// CHECK-NEXT: [[TMP1:%.*]] = load half, ptr [[__REINT1_847]], align 2
-// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[TMP1]], i32 0
-// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_8474]], align 8
-// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[__REINT_8474]], align 8
-// CHECK-NEXT: [[VGET_LANE8:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
-// CHECK-NEXT: store i16 [[VGET_LANE8]], ptr [[__REINT1_8475]], align 2
-// CHECK-NEXT: [[TMP3:%.*]] = load half, ptr [[__REINT1_8475]], align 2
-// CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP3]], i32 1
-// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84714]], align 8
-// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i16>, ptr [[__REINT_84714]], align 8
-// CHECK-NEXT: [[VGET_LANE18:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3
-// CHECK-NEXT: store i16 [[VGET_LANE18]], ptr [[__REINT1_84715]], align 2
-// CHECK-NEXT: [[TMP5:%.*]] = load half, ptr [[__REINT1_84715]], align 2
-// CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <8 x half> [[VECINIT11]], half [[TMP5]], i32 2
-// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84724]], align 8
-// CHECK-NEXT: [[TMP6:%.*]] = load <4 x i16>, ptr [[__REINT_84724]], align 8
-// CHECK-NEXT: [[VGET_LANE28:%.*]] = extractelement <4 x i16> [[TMP6]], i32 3
-// CHECK-NEXT: store i16 [[VGET_LANE28]], ptr [[__REINT1_84725]], align 2
-// CHECK-NEXT: [[TMP7:%.*]] = load half, ptr [[__REINT1_84725]], align 2
-// CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <8 x half> [[VECINIT21]], half [[TMP7]], i32 3
-// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84734]], align 8
-// CHECK-NEXT: [[TMP8:%.*]] = load <4 x i16>, ptr [[__REINT_84734]], align 8
-// CHECK-NEXT: [[VGET_LANE38:%.*]] = extractelement <4 x i16> [[TMP8]], i32 3
-// CHECK-NEXT: store i16 [[VGET_LANE38]], ptr [[__REINT1_84735]], align 2
-// CHECK-NEXT: [[TMP9:%.*]] = load half, ptr [[__REINT1_84735]], align 2
-// CHECK-NEXT: [[VECINIT41:%.*]] = insertelement <8 x half> [[VECINIT31]], half [[TMP9]], i32 4
-// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84744]], align 8
-// CHECK-NEXT: [[TMP10:%.*]] = load <4 x i16>, ptr [[__REINT_84744]], align 8
-// CHECK-NEXT: [[VGET_LANE48:%.*]] = extractelement <4 x i16> [[TMP10]], i32 3
-// CHECK-NEXT: store i16 [[VGET_LANE48]], ptr [[__REINT1_84745]], align 2
-// CHECK-NEXT: [[TMP11:%.*]] = load half, ptr [[__REINT1_84745]], align 2
-// CHECK-NEXT: [[VECINIT51:%.*]] = insertelement <8 x half> [[VECINIT41]], half [[TMP11]], i32 5
-// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84754]], align 8
-// CHECK-NEXT: [[TMP12:%.*]] = load <4 x i16>, ptr [[__REINT_84754]], align 8
-// CHECK-NEXT: [[VGET_LANE58:%.*]] = extractelement <4 x i16> [[TMP12]], i32 3
-// CHECK-NEXT: store i16 [[VGET_LANE58]], ptr [[__REINT1_84755]], align 2
-// CHECK-NEXT: [[TMP13:%.*]] = load half, ptr [[__REINT1_84755]], align 2
-// CHECK-NEXT: [[VECINIT61:%.*]] = insertelement <8 x half> [[VECINIT51]], half [[TMP13]], i32 6
-// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84764]], align 8
-// CHECK-NEXT: [[TMP14:%.*]] = load <4 x i16>, ptr [[__REINT_84764]], align 8
-// CHECK-NEXT: [[VGET_LANE68:%.*]] = extractelement <4 x i16> [[TMP14]], i32 3
-// CHECK-NEXT: store i16 [[VGET_LANE68]], ptr [[__REINT1_84765]], align 2
-// CHECK-NEXT: [[TMP15:%.*]] = load half, ptr [[__REINT1_84765]], align 2
-// CHECK-NEXT: [[VECINIT71:%.*]] = insertelement <8 x half> [[VECINIT61]], half [[TMP15]], i32 7
-// CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP17:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP18:%.*]] = bitcast <8 x half> [[VECINIT71]] to <16 x i8>
-// CHECK-NEXT: [[VFMLSL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl2.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]])
+// CHECK-NEXT: [[VECINIT71:%.*]] = shufflevector <4 x half> [[C:%.*]], <4 x half> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[VFMLSL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl2.v4f32.v8f16(<4 x float> [[A:%.*]], <8 x half> [[B:%.*]], <8 x half> [[VECINIT71]])
// CHECK-NEXT: ret <4 x float> [[VFMLSL_HIGH3_I]]
//
float32x4_t test_vfmlslq_lane_high_f16(float32x4_t a, float16x8_t b, float16x4_t c) {
@@ -830,42 +206,8 @@ float32x4_t test_vfmlslq_lane_high_f16(float32x4_t a, float16x8_t b, float16x4_t
// CHECK-LABEL: @test_vfmlsl_laneq_low_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[__REINT_850:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT: [[__REINT1_850:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_8504:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT: [[__REINT1_8505:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_85014:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT: [[__REINT1_85015:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_85024:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT: [[__REINT1_85025:%.*]] = alloca i16, align 2
-// CHECK-NEXT: store <8 x half> [[C:%.*]], ptr [[__REINT_850]], align 16
-// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[__REINT_850]], align 16
-// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP0]], i32 4
-// CHECK-NEXT: store i16 [[VGETQ_LANE]], ptr [[__REINT1_850]], align 2
-// CHECK-NEXT: [[TMP1:%.*]] = load half, ptr [[__REINT1_850]], align 2
-// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[TMP1]], i32 0
-// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_8504]], align 16
-// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[__REINT_8504]], align 16
-// CHECK-NEXT: [[VGETQ_LANE8:%.*]] = extractelement <8 x i16> [[TMP2]], i32 4
-// CHECK-NEXT: store i16 [[VGETQ_LANE8]], ptr [[__REINT1_8505]], align 2
-// CHECK-NEXT: [[TMP3:%.*]] = load half, ptr [[__REINT1_8505]], align 2
-// CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP3]], i32 1
-// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85014]], align 16
-// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr [[__REINT_85014]], align 16
-// CHECK-NEXT: [[VGETQ_LANE18:%.*]] = extractelement <8 x i16> [[TMP4]], i32 4
-// CHECK-NEXT: store i16 [[VGETQ_LANE18]], ptr [[__REINT1_85015]], align 2
-// CHECK-NEXT: [[TMP5:%.*]] = load half, ptr [[__REINT1_85015]], align 2
-// CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <4 x half> [[VECINIT11]], half [[TMP5]], i32 2
-// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85024]], align 16
-// CHECK-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr [[__REINT_85024]], align 16
-// CHECK-NEXT: [[VGETQ_LANE28:%.*]] = extractelement <8 x i16> [[TMP6]], i32 4
-// CHECK-NEXT: store i16 [[VGETQ_LANE28]], ptr [[__REINT1_85025]], align 2
-// CHECK-NEXT: [[TMP7:%.*]] = load half, ptr [[__REINT1_85025]], align 2
-// CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <4 x half> [[VECINIT21]], half [[TMP7]], i32 3
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x half> [[VECINIT31]] to <8 x i8>
-// CHECK-NEXT: [[VFMLSL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]])
+// CHECK-NEXT: [[VECINIT31:%.*]] = shufflevector <8 x half> [[C:%.*]], <8 x half> poison, <4 x i32> <i32 4, i32 4, i32 4, i32 4>
+// CHECK-NEXT: [[VFMLSL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl.v2f32.v4f16(<2 x float> [[A:%.*]], <4 x half> [[B:%.*]], <4 x half> [[VECINIT31]])
// CHECK-NEXT: ret <2 x float> [[VFMLSL_LOW3_I]]
//
float32x2_t test_vfmlsl_laneq_low_f16(float32x2_t a, float16x4_t b, float16x8_t c) {
@@ -874,42 +216,8 @@ float32x2_t test_vfmlsl_laneq_low_f16(float32x2_t a, float16x4_t b, float16x8_t
// CHECK-LABEL: @test_vfmlsl_laneq_high_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[__REINT_850:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT: [[__REINT1_850:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_8504:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT: [[__REINT1_8505:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_85014:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT: [[__REINT1_85015:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_85024:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT: [[__REINT1_85025:%.*]] = alloca i16, align 2
-// CHECK-NEXT: store <8 x half> [[C:%.*]], ptr [[__REINT_850]], align 16
-// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[__REINT_850]], align 16
-// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP0]], i32 5
-// CHECK-NEXT: store i16 [[VGETQ_LANE]], ptr [[__REINT1_850]], align 2
-// CHECK-NEXT: [[TMP1:%.*]] = load half, ptr [[__REINT1_850]], align 2
-// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[TMP1]], i32 0
-// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_8504]], align 16
-// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[__REINT_8504]], align 16
-// CHECK-NEXT: [[VGETQ_LANE8:%.*]] = extractelement <8 x i16> [[TMP2]], i32 5
-// CHECK-NEXT: store i16 [[VGETQ_LANE8]], ptr [[__REINT1_8505]], align 2
-// CHECK-NEXT: [[TMP3:%.*]] = load half, ptr [[__REINT1_8505]], align 2
-// CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP3]], i32 1
-// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85014]], align 16
-// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr [[__REINT_85014]], align 16
-// CHECK-NEXT: [[VGETQ_LANE18:%.*]] = extractelement <8 x i16> [[TMP4]], i32 5
-// CHECK-NEXT: store i16 [[VGETQ_LANE18]], ptr [[__REINT1_85015]], align 2
-// CHECK-NEXT: [[TMP5:%.*]] = load half, ptr [[__REINT1_85015]], align 2
-// CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <4 x half> [[VECINIT11]], half [[TMP5]], i32 2
-// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85024]], align 16
-// CHECK-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr [[__REINT_85024]], align 16
-// CHECK-NEXT: [[VGETQ_LANE28:%.*]] = extractelement <8 x i16> [[TMP6]], i32 5
-// CHECK-NEXT: store i16 [[VGETQ_LANE28]], ptr [[__REINT1_85025]], align 2
-// CHECK-NEXT: [[TMP7:%.*]] = load half, ptr [[__REINT1_85025]], align 2
-// CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <4 x half> [[VECINIT21]], half [[TMP7]], i32 3
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x half> [[VECINIT31]] to <8 x i8>
-// CHECK-NEXT: [[VFMLSL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl2.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]])
+// CHECK-NEXT: [[VECINIT31:%.*]] = shufflevector <8 x half> [[C:%.*]], <8 x half> poison, <4 x i32> <i32 5, i32 5, i32 5, i32 5>
+// CHECK-NEXT: [[VFMLSL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl2.v2f32.v4f16(<2 x float> [[A:%.*]], <4 x half> [[B:%.*]], <4 x half> [[VECINIT31]])
// CHECK-NEXT: ret <2 x float> [[VFMLSL_HIGH3_I]]
//
float32x2_t test_vfmlsl_laneq_high_f16(float32x2_t a, float16x4_t b, float16x8_t c) {
@@ -918,74 +226,8 @@ float32x2_t test_vfmlsl_laneq_high_f16(float32x2_t a, float16x4_t b, float16x8_t
// CHECK-LABEL: @test_vfmlslq_laneq_low_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[__REINT_850:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT: [[__REINT1_850:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_8504:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT: [[__REINT1_8505:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_85014:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT: [[__REINT1_85015:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_85024:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT: [[__REINT1_85025:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_85034:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT: [[__REINT1_85035:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_85044:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT: [[__REINT1_85045:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_85054:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT: [[__REINT1_85055:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_85064:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT: [[__REINT1_85065:%.*]] = alloca i16, align 2
-// CHECK-NEXT: store <8 x half> [[C:%.*]], ptr [[__REINT_850]], align 16
-// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[__REINT_850]], align 16
-// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP0]], i32 6
-// CHECK-NEXT: store i16 [[VGETQ_LANE]], ptr [[__REINT1_850]], align 2
-// CHECK-NEXT: [[TMP1:%.*]] = load half, ptr [[__REINT1_850]], align 2
-// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[TMP1]], i32 0
-// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_8504]], align 16
-// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[__REINT_8504]], align 16
-// CHECK-NEXT: [[VGETQ_LANE8:%.*]] = extractelement <8 x i16> [[TMP2]], i32 6
-// CHECK-NEXT: store i16 [[VGETQ_LANE8]], ptr [[__REINT1_8505]], align 2
-// CHECK-NEXT: [[TMP3:%.*]] = load half, ptr [[__REINT1_8505]], align 2
-// CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP3]], i32 1
-// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85014]], align 16
-// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr [[__REINT_85014]], align 16
-// CHECK-NEXT: [[VGETQ_LANE18:%.*]] = extractelement <8 x i16> [[TMP4]], i32 6
-// CHECK-NEXT: store i16 [[VGETQ_LANE18]], ptr [[__REINT1_85015]], align 2
-// CHECK-NEXT: [[TMP5:%.*]] = load half, ptr [[__REINT1_85015]], align 2
-// CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <8 x half> [[VECINIT11]], half [[TMP5]], i32 2
-// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85024]], align 16
-// CHECK-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr [[__REINT_85024]], align 16
-// CHECK-NEXT: [[VGETQ_LANE28:%.*]] = extractelement <8 x i16> [[TMP6]], i32 6
-// CHECK-NEXT: store i16 [[VGETQ_LANE28]], ptr [[__REINT1_85025]], align 2
-// CHECK-NEXT: [[TMP7:%.*]] = load half, ptr [[__REINT1_85025]], align 2
-// CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <8 x half> [[VECINIT21]], half [[TMP7]], i32 3
-// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85034]], align 16
-// CHECK-NEXT: [[TMP8:%.*]] = load <8 x i16>, ptr [[__REINT_85034]], align 16
-// CHECK-NEXT: [[VGETQ_LANE38:%.*]] = extractelement <8 x i16> [[TMP8]], i32 6
-// CHECK-NEXT: store i16 [[VGETQ_LANE38]], ptr [[__REINT1_85035]], align 2
-// CHECK-NEXT: [[TMP9:%.*]] = load half, ptr [[__REINT1_85035]], align 2
-// CHECK-NEXT: [[VECINIT41:%.*]] = insertelement <8 x half> [[VECINIT31]], half [[TMP9]], i32 4
-// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85044]], align 16
-// CHECK-NEXT: [[TMP10:%.*]] = load <8 x i16>, ptr [[__REINT_85044]], align 16
-// CHECK-NEXT: [[VGETQ_LANE48:%.*]] = extractelement <8 x i16> [[TMP10]], i32 6
-// CHECK-NEXT: store i16 [[VGETQ_LANE48]], ptr [[__REINT1_85045]], align 2
-// CHECK-NEXT: [[TMP11:%.*]] = load half, ptr [[__REINT1_85045]], align 2
-// CHECK-NEXT: [[VECINIT51:%.*]] = insertelement <8 x half> [[VECINIT41]], half [[TMP11]], i32 5
-// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85054]], align 16
-// CHECK-NEXT: [[TMP12:%.*]] = load <8 x i16>, ptr [[__REINT_85054]], align 16
-// CHECK-NEXT: [[VGETQ_LANE58:%.*]] = extractelement <8 x i16> [[TMP12]], i32 6
-// CHECK-NEXT: store i16 [[VGETQ_LANE58]], ptr [[__REINT1_85055]], align 2
-// CHECK-NEXT: [[TMP13:%.*]] = load half, ptr [[__REINT1_85055]], align 2
-// CHECK-NEXT: [[VECINIT61:%.*]] = insertelement <8 x half> [[VECINIT51]], half [[TMP13]], i32 6
-// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85064]], align 16
-// CHECK-NEXT: [[TMP14:%.*]] = load <8 x i16>, ptr [[__REINT_85064]], align 16
-// CHECK-NEXT: [[VGETQ_LANE68:%.*]] = extractelement <8 x i16> [[TMP14]], i32 6
-// CHECK-NEXT: store i16 [[VGETQ_LANE68]], ptr [[__REINT1_85065]], align 2
-// CHECK-NEXT: [[TMP15:%.*]] = load half, ptr [[__REINT1_85065]], align 2
-// CHECK-NEXT: [[VECINIT71:%.*]] = insertelement <8 x half> [[VECINIT61]], half [[TMP15]], i32 7
-// CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP17:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP18:%.*]] = bitcast <8 x half> [[VECINIT71]] to <16 x i8>
-// CHECK-NEXT: [[VFMLSL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]])
+// CHECK-NEXT: [[VECINIT71:%.*]] = shufflevector <8 x half> [[C:%.*]], <8 x half> poison, <8 x i32> <i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6>
+// CHECK-NEXT: [[VFMLSL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl.v4f32.v8f16(<4 x float> [[A:%.*]], <8 x half> [[B:%.*]], <8 x half> [[VECINIT71]])
// CHECK-NEXT: ret <4 x float> [[VFMLSL_LOW3_I]]
//
float32x4_t test_vfmlslq_laneq_low_f16(float32x4_t a, float16x8_t b, float16x8_t c) {
@@ -994,74 +236,8 @@ float32x4_t test_vfmlslq_laneq_low_f16(float32x4_t a, float16x8_t b, float16x8_t
// CHECK-LABEL: @test_vfmlslq_laneq_high_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[__REINT_850:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT: [[__REINT1_850:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_8504:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT: [[__REINT1_8505:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_85014:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT: [[__REINT1_85015:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_85024:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT: [[__REINT1_85025:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_85034:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT: [[__REINT1_85035:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_85044:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT: [[__REINT1_85045:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_85054:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT: [[__REINT1_85055:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_85064:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT: [[__REINT1_85065:%.*]] = alloca i16, align 2
-// CHECK-NEXT: store <8 x half> [[C:%.*]], ptr [[__REINT_850]], align 16
-// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[__REINT_850]], align 16
-// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP0]], i32 7
-// CHECK-NEXT: store i16 [[VGETQ_LANE]], ptr [[__REINT1_850]], align 2
-// CHECK-NEXT: [[TMP1:%.*]] = load half, ptr [[__REINT1_850]], align 2
-// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[TMP1]], i32 0
-// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_8504]], align 16
-// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[__REINT_8504]], align 16
-// CHECK-NEXT: [[VGETQ_LANE8:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7
-// CHECK-NEXT: store i16 [[VGETQ_LANE8]], ptr [[__REINT1_8505]], align 2
-// CHECK-NEXT: [[TMP3:%.*]] = load half, ptr [[__REINT1_8505]], align 2
-// CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP3]], i32 1
-// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85014]], align 16
-// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr [[__REINT_85014]], align 16
-// CHECK-NEXT: [[VGETQ_LANE18:%.*]] = extractelement <8 x i16> [[TMP4]], i32 7
-// CHECK-NEXT: store i16 [[VGETQ_LANE18]], ptr [[__REINT1_85015]], align 2
-// CHECK-NEXT: [[TMP5:%.*]] = load half, ptr [[__REINT1_85015]], align 2
-// CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <8 x half> [[VECINIT11]], half [[TMP5]], i32 2
-// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85024]], align 16
-// CHECK-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr [[__REINT_85024]], align 16
-// CHECK-NEXT: [[VGETQ_LANE28:%.*]] = extractelement <8 x i16> [[TMP6]], i32 7
-// CHECK-NEXT: store i16 [[VGETQ_LANE28]], ptr [[__REINT1_85025]], align 2
-// CHECK-NEXT: [[TMP7:%.*]] = load half, ptr [[__REINT1_85025]], align 2
-// CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <8 x half> [[VECINIT21]], half [[TMP7]], i32 3
-// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85034]], align 16
-// CHECK-NEXT: [[TMP8:%.*]] = load <8 x i16>, ptr [[__REINT_85034]], align 16
-// CHECK-NEXT: [[VGETQ_LANE38:%.*]] = extractelement <8 x i16> [[TMP8]], i32 7
-// CHECK-NEXT: store i16 [[VGETQ_LANE38]], ptr [[__REINT1_85035]], align 2
-// CHECK-NEXT: [[TMP9:%.*]] = load half, ptr [[__REINT1_85035]], align 2
-// CHECK-NEXT: [[VECINIT41:%.*]] = insertelement <8 x half> [[VECINIT31]], half [[TMP9]], i32 4
-// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85044]], align 16
-// CHECK-NEXT: [[TMP10:%.*]] = load <8 x i16>, ptr [[__REINT_85044]], align 16
-// CHECK-NEXT: [[VGETQ_LANE48:%.*]] = extractelement <8 x i16> [[TMP10]], i32 7
-// CHECK-NEXT: store i16 [[VGETQ_LANE48]], ptr [[__REINT1_85045]], align 2
-// CHECK-NEXT: [[TMP11:%.*]] = load half, ptr [[__REINT1_85045]], align 2
-// CHECK-NEXT: [[VECINIT51:%.*]] = insertelement <8 x half> [[VECINIT41]], half [[TMP11]], i32 5
-// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85054]], align 16
-// CHECK-NEXT: [[TMP12:%.*]] = load <8 x i16>, ptr [[__REINT_85054]], align 16
-// CHECK-NEXT: [[VGETQ_LANE58:%.*]] = extractelement <8 x i16> [[TMP12]], i32 7
-// CHECK-NEXT: store i16 [[VGETQ_LANE58]], ptr [[__REINT1_85055]], align 2
-// CHECK-NEXT: [[TMP13:%.*]] = load half, ptr [[__REINT1_85055]], align 2
-// CHECK-NEXT: [[VECINIT61:%.*]] = insertelement <8 x half> [[VECINIT51]], half [[TMP13]], i32 6
-// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85064]], align 16
-// CHECK-NEXT: [[TMP14:%.*]] = load <8 x i16>, ptr [[__REINT_85064]], align 16
-// CHECK-NEXT: [[VGETQ_LANE68:%.*]] = extractelement <8 x i16> [[TMP14]], i32 7
-// CHECK-NEXT: store i16 [[VGETQ_LANE68]], ptr [[__REINT1_85065]], align 2
-// CHECK-NEXT: [[TMP15:%.*]] = load half, ptr [[__REINT1_85065]], align 2
-// CHECK-NEXT: [[VECINIT71:%.*]] = insertelement <8 x half> [[VECINIT61]], half [[TMP15]], i32 7
-// CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP17:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP18:%.*]] = bitcast <8 x half> [[VECINIT71]] to <16 x i8>
-// CHECK-NEXT: [[VFMLSL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl2.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]])
+// CHECK-NEXT: [[VECINIT71:%.*]] = shufflevector <8 x half> [[C:%.*]], <8 x half> poison, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT: [[VFMLSL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl2.v4f32.v8f16(<4 x float> [[A:%.*]], <8 x half> [[B:%.*]], <8 x half> [[VECINIT71]])
// CHECK-NEXT: ret <4 x float> [[VFMLSL_HIGH3_I]]
//
float32x4_t test_vfmlslq_laneq_high_f16(float32x4_t a, float16x8_t b, float16x8_t c) {
diff --git a/clang/test/CodeGen/AArch64/neon-intrinsics-constrained.c b/clang/test/CodeGen/AArch64/neon-intrinsics-constrained.c
index 15ae7eea820e80..d8300a0df6d322 100644
--- a/clang/test/CodeGen/AArch64/neon-intrinsics-constrained.c
+++ b/clang/test/CodeGen/AArch64/neon-intrinsics-constrained.c
@@ -1,12 +1,13 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
// RUN: -disable-O0-optnone \
-// RUN: -flax-vector-conversions=none -emit-llvm -o - %s | opt -S -passes=mem2reg \
-// RUN: | FileCheck --check-prefixes=COMMON,COMMONIR,UNCONSTRAINED %s
+// RUN: -flax-vector-conversions=none -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine \
+// RUN: | FileCheck --check-prefixes=UNCONSTRAINED %s
// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
// RUN: -disable-O0-optnone \
// RUN: -ffp-exception-behavior=strict \
-// RUN: -flax-vector-conversions=none -emit-llvm -o - %s | opt -S -passes=mem2reg \
-// RUN: | FileCheck --check-prefixes=COMMON,COMMONIR,CONSTRAINED %s
+// RUN: -flax-vector-conversions=none -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine \
+// RUN: | FileCheck --check-prefixes=CONSTRAINED %s
// REQUIRES: aarch64-registered-target
@@ -14,804 +15,1523 @@
#include <arm_neon.h>
-// COMMON-LABEL: test_vadd_f32
-// UNCONSTRAINED: [[ADD_I:%.*]] = fadd <2 x float> %v1, %v2
-// CONSTRAINED: [[ADD_I:%.*]] = call <2 x float> @llvm.experimental.constrained.fadd.v2f32(<2 x float> %v1, <2 x float> %v2, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// COMMONIR: ret <2 x float> [[ADD_I]]
+// UNCONSTRAINED-LABEL: define dso_local <2 x float> @test_vadd_f32(
+// UNCONSTRAINED-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0:[0-9]+]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[ADD_I:%.*]] = fadd <2 x float> [[V1]], [[V2]]
+// UNCONSTRAINED-NEXT: ret <2 x float> [[ADD_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <2 x float> @test_vadd_f32(
+// CONSTRAINED-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0:[0-9]+]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[ADD_I:%.*]] = call <2 x float> @llvm.experimental.constrained.fadd.v2f32(<2 x float> [[V1]], <2 x float> [[V2]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3:[0-9]+]]
+// CONSTRAINED-NEXT: ret <2 x float> [[ADD_I]]
+//
float32x2_t test_vadd_f32(float32x2_t v1, float32x2_t v2) {
return vadd_f32(v1, v2);
}
-// COMMON-LABEL: test_vaddq_f32
-// UNCONSTRAINED: [[ADD_I:%.*]] = fadd <4 x float> %v1, %v2
-// CONSTRAINED: [[ADD_I:%.*]] = call <4 x float> @llvm.experimental.constrained.fadd.v4f32(<4 x float> %v1, <4 x float> %v2, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// COMMONIR: ret <4 x float> [[ADD_I]]
+// UNCONSTRAINED-LABEL: define dso_local <4 x float> @test_vaddq_f32(
+// UNCONSTRAINED-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[ADD_I:%.*]] = fadd <4 x float> [[V1]], [[V2]]
+// UNCONSTRAINED-NEXT: ret <4 x float> [[ADD_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <4 x float> @test_vaddq_f32(
+// CONSTRAINED-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[ADD_I:%.*]] = call <4 x float> @llvm.experimental.constrained.fadd.v4f32(<4 x float> [[V1]], <4 x float> [[V2]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: ret <4 x float> [[ADD_I]]
+//
float32x4_t test_vaddq_f32(float32x4_t v1, float32x4_t v2) {
return vaddq_f32(v1, v2);
}
-// COMMON-LABEL: test_vsub_f32
-// UNCONSTRAINED: [[SUB_I:%.*]] = fsub <2 x float> %v1, %v2
-// CONSTRAINED: [[SUB_I:%.*]] = call <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float> %v1, <2 x float> %v2, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// COMMONIR: ret <2 x float> [[SUB_I]]
+// UNCONSTRAINED-LABEL: define dso_local <2 x float> @test_vsub_f32(
+// UNCONSTRAINED-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[SUB_I:%.*]] = fsub <2 x float> [[V1]], [[V2]]
+// UNCONSTRAINED-NEXT: ret <2 x float> [[SUB_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <2 x float> @test_vsub_f32(
+// CONSTRAINED-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[SUB_I:%.*]] = call <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float> [[V1]], <2 x float> [[V2]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: ret <2 x float> [[SUB_I]]
+//
float32x2_t test_vsub_f32(float32x2_t v1, float32x2_t v2) {
return vsub_f32(v1, v2);
}
-// COMMON-LABEL: test_vsubq_f32
-// UNCONSTRAINED: [[SUB_I:%.*]] = fsub <4 x float> %v1, %v2
-// CONSTRAINED: [[SUB_I:%.*]] = call <4 x float> @llvm.experimental.constrained.fsub.v4f32(<4 x float> %v1, <4 x float> %v2, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// COMMONIR: ret <4 x float> [[SUB_I]]
+// UNCONSTRAINED-LABEL: define dso_local <4 x float> @test_vsubq_f32(
+// UNCONSTRAINED-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[SUB_I:%.*]] = fsub <4 x float> [[V1]], [[V2]]
+// UNCONSTRAINED-NEXT: ret <4 x float> [[SUB_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <4 x float> @test_vsubq_f32(
+// CONSTRAINED-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[SUB_I:%.*]] = call <4 x float> @llvm.experimental.constrained.fsub.v4f32(<4 x float> [[V1]], <4 x float> [[V2]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: ret <4 x float> [[SUB_I]]
+//
float32x4_t test_vsubq_f32(float32x4_t v1, float32x4_t v2) {
return vsubq_f32(v1, v2);
}
-// COMMON-LABEL: test_vsubq_f64
-// UNCONSTRAINED: [[SUB_I:%.*]] = fsub <2 x double> %v1, %v2
-// CONSTRAINED: [[SUB_I:%.*]] = call <2 x double> @llvm.experimental.constrained.fsub.v2f64(<2 x double> %v1, <2 x double> %v2, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// COMMONIR: ret <2 x double> [[SUB_I]]
+// UNCONSTRAINED-LABEL: define dso_local <2 x double> @test_vsubq_f64(
+// UNCONSTRAINED-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[SUB_I:%.*]] = fsub <2 x double> [[V1]], [[V2]]
+// UNCONSTRAINED-NEXT: ret <2 x double> [[SUB_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <2 x double> @test_vsubq_f64(
+// CONSTRAINED-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[SUB_I:%.*]] = call <2 x double> @llvm.experimental.constrained.fsub.v2f64(<2 x double> [[V1]], <2 x double> [[V2]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: ret <2 x double> [[SUB_I]]
+//
float64x2_t test_vsubq_f64(float64x2_t v1, float64x2_t v2) {
return vsubq_f64(v1, v2);
}
-// COMMON-LABEL: test_vmul_f32
-// UNCONSTRAINED: [[MUL_I:%.*]] = fmul <2 x float> %v1, %v2
-// CONSTRAINED: [[MUL_I:%.*]] = call <2 x float> @llvm.experimental.constrained.fmul.v2f32(<2 x float> %v1, <2 x float> %v2, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// COMMONIR: ret <2 x float> [[MUL_I]]
+// UNCONSTRAINED-LABEL: define dso_local <2 x float> @test_vmul_f32(
+// UNCONSTRAINED-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[MUL_I:%.*]] = fmul <2 x float> [[V1]], [[V2]]
+// UNCONSTRAINED-NEXT: ret <2 x float> [[MUL_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <2 x float> @test_vmul_f32(
+// CONSTRAINED-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[MUL_I:%.*]] = call <2 x float> @llvm.experimental.constrained.fmul.v2f32(<2 x float> [[V1]], <2 x float> [[V2]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: ret <2 x float> [[MUL_I]]
+//
float32x2_t test_vmul_f32(float32x2_t v1, float32x2_t v2) {
return vmul_f32(v1, v2);
}
-// COMMON-LABEL: test_vmulq_f32
-// UNCONSTRAINED: [[MUL_I:%.*]] = fmul <4 x float> %v1, %v2
-// CONSTRAINED: [[MUL_I:%.*]] = call <4 x float> @llvm.experimental.constrained.fmul.v4f32(<4 x float> %v1, <4 x float> %v2, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// COMMONIR: ret <4 x float> [[MUL_I]]
+// UNCONSTRAINED-LABEL: define dso_local <4 x float> @test_vmulq_f32(
+// UNCONSTRAINED-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[MUL_I:%.*]] = fmul <4 x float> [[V1]], [[V2]]
+// UNCONSTRAINED-NEXT: ret <4 x float> [[MUL_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <4 x float> @test_vmulq_f32(
+// CONSTRAINED-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[MUL_I:%.*]] = call <4 x float> @llvm.experimental.constrained.fmul.v4f32(<4 x float> [[V1]], <4 x float> [[V2]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: ret <4 x float> [[MUL_I]]
+//
float32x4_t test_vmulq_f32(float32x4_t v1, float32x4_t v2) {
return vmulq_f32(v1, v2);
}
-// COMMON-LABEL: test_vmulq_f64
-// UNCONSTRAINED: [[MUL_I:%.*]] = fmul <2 x double> %v1, %v2
-// CONSTRAINED: [[MUL_I:%.*]] = call <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double> %v1, <2 x double> %v2, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// COMMONIR: ret <2 x double> [[MUL_I]]
+// UNCONSTRAINED-LABEL: define dso_local <2 x double> @test_vmulq_f64(
+// UNCONSTRAINED-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[MUL_I:%.*]] = fmul <2 x double> [[V1]], [[V2]]
+// UNCONSTRAINED-NEXT: ret <2 x double> [[MUL_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <2 x double> @test_vmulq_f64(
+// CONSTRAINED-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[MUL_I:%.*]] = call <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double> [[V1]], <2 x double> [[V2]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: ret <2 x double> [[MUL_I]]
+//
float64x2_t test_vmulq_f64(float64x2_t v1, float64x2_t v2) {
return vmulq_f64(v1, v2);
}
-// COMMON-LABEL: test_vmla_f32
-// UNCONSTRAINED: [[MUL_I:%.*]] = fmul <2 x float> %v2, %v3
-// CONSTRAINED: [[MUL_I:%.*]] = call <2 x float> @llvm.experimental.constrained.fmul.v2f32(<2 x float> %v2, <2 x float> %v3, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// UNCONSTRAINED: [[ADD_I:%.*]] = fadd <2 x float> %v1, [[MUL_I]]
-// CONSTRAINED: [[ADD_I:%.*]] = call <2 x float> @llvm.experimental.constrained.fadd.v2f32(<2 x float> %v1, <2 x float> [[MUL_I]], metadata !"round.tonearest", metadata !"fpexcept.strict")
-// COMMONIR: ret <2 x float> [[ADD_I]]
+// UNCONSTRAINED-LABEL: define dso_local <2 x float> @test_vmla_f32(
+// UNCONSTRAINED-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]], <2 x float> noundef [[V3:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[MUL_I:%.*]] = fmul <2 x float> [[V2]], [[V3]]
+// UNCONSTRAINED-NEXT: [[ADD_I:%.*]] = fadd <2 x float> [[V1]], [[MUL_I]]
+// UNCONSTRAINED-NEXT: ret <2 x float> [[ADD_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <2 x float> @test_vmla_f32(
+// CONSTRAINED-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]], <2 x float> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[MUL_I:%.*]] = call <2 x float> @llvm.experimental.constrained.fmul.v2f32(<2 x float> [[V2]], <2 x float> [[V3]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[ADD_I:%.*]] = call <2 x float> @llvm.experimental.constrained.fadd.v2f32(<2 x float> [[V1]], <2 x float> [[MUL_I]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: ret <2 x float> [[ADD_I]]
+//
float32x2_t test_vmla_f32(float32x2_t v1, float32x2_t v2, float32x2_t v3) {
return vmla_f32(v1, v2, v3);
}
-// COMMON-LABEL: test_vmlaq_f32
-// UNCONSTRAINED: [[MUL_I:%.*]] = fmul <4 x float> %v2, %v3
-// CONSTRAINED: [[MUL_I:%.*]] = call <4 x float> @llvm.experimental.constrained.fmul.v4f32(<4 x float> %v2, <4 x float> %v3, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// UNCONSTRAINED: [[ADD_I:%.*]] = fadd <4 x float> %v1, [[MUL_I]]
-// CONSTRAINED: [[ADD_I:%.*]] = call <4 x float> @llvm.experimental.constrained.fadd.v4f32(<4 x float> %v1, <4 x float> [[MUL_I]], metadata !"round.tonearest", metadata !"fpexcept.strict")
-// COMMONIR: ret <4 x float> [[ADD_I]]
+// UNCONSTRAINED-LABEL: define dso_local <4 x float> @test_vmlaq_f32(
+// UNCONSTRAINED-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]], <4 x float> noundef [[V3:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[MUL_I:%.*]] = fmul <4 x float> [[V2]], [[V3]]
+// UNCONSTRAINED-NEXT: [[ADD_I:%.*]] = fadd <4 x float> [[V1]], [[MUL_I]]
+// UNCONSTRAINED-NEXT: ret <4 x float> [[ADD_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <4 x float> @test_vmlaq_f32(
+// CONSTRAINED-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]], <4 x float> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[MUL_I:%.*]] = call <4 x float> @llvm.experimental.constrained.fmul.v4f32(<4 x float> [[V2]], <4 x float> [[V3]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[ADD_I:%.*]] = call <4 x float> @llvm.experimental.constrained.fadd.v4f32(<4 x float> [[V1]], <4 x float> [[MUL_I]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: ret <4 x float> [[ADD_I]]
+//
float32x4_t test_vmlaq_f32(float32x4_t v1, float32x4_t v2, float32x4_t v3) {
return vmlaq_f32(v1, v2, v3);
}
-// COMMON-LABEL: test_vmlaq_f64
-// UNCONSTRAINED: [[MUL_I:%.*]] = fmul <2 x double> %v2, %v3
-// CONSTRAINED: [[MUL_I:%.*]] = call <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double> %v2, <2 x double> %v3, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// UNCONSTRAINED: [[ADD_I:%.*]] = fadd <2 x double> %v1, [[MUL_I]]
-// CONSTRAINED: [[ADD_I:%.*]] = call <2 x double> @llvm.experimental.constrained.fadd.v2f64(<2 x double> %v1, <2 x double> [[MUL_I]], metadata !"round.tonearest", metadata !"fpexcept.strict")
-// COMMONIR: ret <2 x double> [[ADD_I]]
+// UNCONSTRAINED-LABEL: define dso_local <2 x double> @test_vmlaq_f64(
+// UNCONSTRAINED-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]], <2 x double> noundef [[V3:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[MUL_I:%.*]] = fmul <2 x double> [[V2]], [[V3]]
+// UNCONSTRAINED-NEXT: [[ADD_I:%.*]] = fadd <2 x double> [[V1]], [[MUL_I]]
+// UNCONSTRAINED-NEXT: ret <2 x double> [[ADD_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <2 x double> @test_vmlaq_f64(
+// CONSTRAINED-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]], <2 x double> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[MUL_I:%.*]] = call <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double> [[V2]], <2 x double> [[V3]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[ADD_I:%.*]] = call <2 x double> @llvm.experimental.constrained.fadd.v2f64(<2 x double> [[V1]], <2 x double> [[MUL_I]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: ret <2 x double> [[ADD_I]]
+//
float64x2_t test_vmlaq_f64(float64x2_t v1, float64x2_t v2, float64x2_t v3) {
return vmlaq_f64(v1, v2, v3);
}
-// COMMON-LABEL: test_vmls_f32
-// UNCONSTRAINED: [[MUL_I:%.*]] = fmul <2 x float> %v2, %v3
-// CONSTRAINED: [[MUL_I:%.*]] = call <2 x float> @llvm.experimental.constrained.fmul.v2f32(<2 x float> %v2, <2 x float> %v3, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// UNCONSTRAINED: [[SUB_I:%.*]] = fsub <2 x float> %v1, [[MUL_I]]
-// CONSTRAINED: [[SUB_I:%.*]] = call <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float> %v1, <2 x float> [[MUL_I]], metadata !"round.tonearest", metadata !"fpexcept.strict")
-// COMMONIR: ret <2 x float> [[SUB_I]]
+// UNCONSTRAINED-LABEL: define dso_local <2 x float> @test_vmls_f32(
+// UNCONSTRAINED-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]], <2 x float> noundef [[V3:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[MUL_I:%.*]] = fmul <2 x float> [[V2]], [[V3]]
+// UNCONSTRAINED-NEXT: [[SUB_I:%.*]] = fsub <2 x float> [[V1]], [[MUL_I]]
+// UNCONSTRAINED-NEXT: ret <2 x float> [[SUB_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <2 x float> @test_vmls_f32(
+// CONSTRAINED-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]], <2 x float> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[MUL_I:%.*]] = call <2 x float> @llvm.experimental.constrained.fmul.v2f32(<2 x float> [[V2]], <2 x float> [[V3]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[SUB_I:%.*]] = call <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float> [[V1]], <2 x float> [[MUL_I]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: ret <2 x float> [[SUB_I]]
+//
float32x2_t test_vmls_f32(float32x2_t v1, float32x2_t v2, float32x2_t v3) {
return vmls_f32(v1, v2, v3);
}
-// COMMON-LABEL: test_vmlsq_f32
-// UNCONSTRAINED: [[MUL_I:%.*]] = fmul <4 x float> %v2, %v3
-// CONSTRAINED: [[MUL_I:%.*]] = call <4 x float> @llvm.experimental.constrained.fmul.v4f32(<4 x float> %v2, <4 x float> %v3, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// UNCONSTRAINED: [[SUB_I:%.*]] = fsub <4 x float> %v1, [[MUL_I]]
-// CONSTRAINED: [[SUB_I:%.*]] = call <4 x float> @llvm.experimental.constrained.fsub.v4f32(<4 x float> %v1, <4 x float> [[MUL_I]], metadata !"round.tonearest", metadata !"fpexcept.strict")
-// COMMONIR: ret <4 x float> [[SUB_I]]
+// UNCONSTRAINED-LABEL: define dso_local <4 x float> @test_vmlsq_f32(
+// UNCONSTRAINED-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]], <4 x float> noundef [[V3:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[MUL_I:%.*]] = fmul <4 x float> [[V2]], [[V3]]
+// UNCONSTRAINED-NEXT: [[SUB_I:%.*]] = fsub <4 x float> [[V1]], [[MUL_I]]
+// UNCONSTRAINED-NEXT: ret <4 x float> [[SUB_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <4 x float> @test_vmlsq_f32(
+// CONSTRAINED-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]], <4 x float> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[MUL_I:%.*]] = call <4 x float> @llvm.experimental.constrained.fmul.v4f32(<4 x float> [[V2]], <4 x float> [[V3]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[SUB_I:%.*]] = call <4 x float> @llvm.experimental.constrained.fsub.v4f32(<4 x float> [[V1]], <4 x float> [[MUL_I]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: ret <4 x float> [[SUB_I]]
+//
float32x4_t test_vmlsq_f32(float32x4_t v1, float32x4_t v2, float32x4_t v3) {
return vmlsq_f32(v1, v2, v3);
}
-// COMMON-LABEL: test_vmlsq_f64
-// UNCONSTRAINED: [[MUL_I:%.*]] = fmul <2 x double> %v2, %v3
-// CONSTRAINED: [[MUL_I:%.*]] = call <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double> %v2, <2 x double> %v3, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// UNCONSTRAINED: [[SUB_I:%.*]] = fsub <2 x double> %v1, [[MUL_I]]
-// CONSTRAINED: [[SUB_I:%.*]] = call <2 x double> @llvm.experimental.constrained.fsub.v2f64(<2 x double> %v1, <2 x double> [[MUL_I]], metadata !"round.tonearest", metadata !"fpexcept.strict")
-// COMMONIR: ret <2 x double> [[SUB_I]]
+// UNCONSTRAINED-LABEL: define dso_local <2 x double> @test_vmlsq_f64(
+// UNCONSTRAINED-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]], <2 x double> noundef [[V3:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[MUL_I:%.*]] = fmul <2 x double> [[V2]], [[V3]]
+// UNCONSTRAINED-NEXT: [[SUB_I:%.*]] = fsub <2 x double> [[V1]], [[MUL_I]]
+// UNCONSTRAINED-NEXT: ret <2 x double> [[SUB_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <2 x double> @test_vmlsq_f64(
+// CONSTRAINED-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]], <2 x double> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[MUL_I:%.*]] = call <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double> [[V2]], <2 x double> [[V3]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[SUB_I:%.*]] = call <2 x double> @llvm.experimental.constrained.fsub.v2f64(<2 x double> [[V1]], <2 x double> [[MUL_I]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: ret <2 x double> [[SUB_I]]
+//
float64x2_t test_vmlsq_f64(float64x2_t v1, float64x2_t v2, float64x2_t v3) {
return vmlsq_f64(v1, v2, v3);
}
-// COMMON-LABEL: test_vfma_f32
-// COMMONIR: [[TMP0:%.*]] = bitcast <2 x float> %v1 to <8 x i8>
-// COMMONIR: [[TMP1:%.*]] = bitcast <2 x float> %v2 to <8 x i8>
-// COMMONIR: [[TMP2:%.*]] = bitcast <2 x float> %v3 to <8 x i8>
-// UNCONSTRAINED: [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> %v2, <2 x float> %v3, <2 x float> %v1)
-// CONSTRAINED: [[TMP3:%.*]] = call <2 x float> @llvm.experimental.constrained.fma.v2f32(<2 x float> %v2, <2 x float> %v3, <2 x float> %v1, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// COMMONIR: ret <2 x float> [[TMP3]]
+// UNCONSTRAINED-LABEL: define dso_local <2 x float> @test_vfma_f32(
+// UNCONSTRAINED-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]], <2 x float> noundef [[V3:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[V2]], <2 x float> [[V3]], <2 x float> [[V1]])
+// UNCONSTRAINED-NEXT: ret <2 x float> [[TMP0]]
+//
+// CONSTRAINED-LABEL: define dso_local <2 x float> @test_vfma_f32(
+// CONSTRAINED-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]], <2 x float> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call <2 x float> @llvm.experimental.constrained.fma.v2f32(<2 x float> [[V2]], <2 x float> [[V3]], <2 x float> [[V1]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: ret <2 x float> [[TMP0]]
+//
float32x2_t test_vfma_f32(float32x2_t v1, float32x2_t v2, float32x2_t v3) {
return vfma_f32(v1, v2, v3);
}
-// COMMON-LABEL: test_vfmaq_f32
-// COMMONIR: [[TMP0:%.*]] = bitcast <4 x float> %v1 to <16 x i8>
-// COMMONIR: [[TMP1:%.*]] = bitcast <4 x float> %v2 to <16 x i8>
-// COMMONIR: [[TMP2:%.*]] = bitcast <4 x float> %v3 to <16 x i8>
-// UNCONSTRAINED: [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> %v2, <4 x float> %v3, <4 x float> %v1)
-// CONSTRAINED: [[TMP3:%.*]] = call <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float> %v2, <4 x float> %v3, <4 x float> %v1, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// COMMONIR: ret <4 x float> [[TMP3]]
+// UNCONSTRAINED-LABEL: define dso_local <4 x float> @test_vfmaq_f32(
+// UNCONSTRAINED-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]], <4 x float> noundef [[V3:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[V2]], <4 x float> [[V3]], <4 x float> [[V1]])
+// UNCONSTRAINED-NEXT: ret <4 x float> [[TMP0]]
+//
+// CONSTRAINED-LABEL: define dso_local <4 x float> @test_vfmaq_f32(
+// CONSTRAINED-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]], <4 x float> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float> [[V2]], <4 x float> [[V3]], <4 x float> [[V1]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: ret <4 x float> [[TMP0]]
+//
float32x4_t test_vfmaq_f32(float32x4_t v1, float32x4_t v2, float32x4_t v3) {
return vfmaq_f32(v1, v2, v3);
}
-// COMMON-LABEL: test_vfmaq_f64
-// COMMONIR: [[TMP0:%.*]] = bitcast <2 x double> %v1 to <16 x i8>
-// COMMONIR: [[TMP1:%.*]] = bitcast <2 x double> %v2 to <16 x i8>
-// COMMONIR: [[TMP2:%.*]] = bitcast <2 x double> %v3 to <16 x i8>
-// UNCONSTRAINED: [[TMP3:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %v2, <2 x double> %v3, <2 x double> %v1)
-// CONSTRAINED: [[TMP3:%.*]] = call <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double> %v2, <2 x double> %v3, <2 x double> %v1, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// COMMONIR: ret <2 x double> [[TMP3]]
+// UNCONSTRAINED-LABEL: define dso_local <2 x double> @test_vfmaq_f64(
+// UNCONSTRAINED-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]], <2 x double> noundef [[V3:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[V2]], <2 x double> [[V3]], <2 x double> [[V1]])
+// UNCONSTRAINED-NEXT: ret <2 x double> [[TMP0]]
+//
+// CONSTRAINED-LABEL: define dso_local <2 x double> @test_vfmaq_f64(
+// CONSTRAINED-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]], <2 x double> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double> [[V2]], <2 x double> [[V3]], <2 x double> [[V1]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: ret <2 x double> [[TMP0]]
+//
float64x2_t test_vfmaq_f64(float64x2_t v1, float64x2_t v2, float64x2_t v3) {
return vfmaq_f64(v1, v2, v3);
}
-// COMMON-LABEL: test_vfms_f32
-// COMMONIR: [[SUB_I:%.*]] = fneg <2 x float> %v2
-// COMMONIR: [[TMP0:%.*]] = bitcast <2 x float> %v1 to <8 x i8>
-// COMMONIR: [[TMP1:%.*]] = bitcast <2 x float> [[SUB_I]] to <8 x i8>
-// COMMONIR: [[TMP2:%.*]] = bitcast <2 x float> %v3 to <8 x i8>
-// UNCONSTRAINED: [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[SUB_I]], <2 x float> %v3, <2 x float> %v1)
-// CONSTRAINED: [[TMP3:%.*]] = call <2 x float> @llvm.experimental.constrained.fma.v2f32(<2 x float> [[SUB_I]], <2 x float> %v3, <2 x float> %v1, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// COMMONIR: ret <2 x float> [[TMP3]]
+// UNCONSTRAINED-LABEL: define dso_local <2 x float> @test_vfms_f32(
+// UNCONSTRAINED-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]], <2 x float> noundef [[V3:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[FNEG_I:%.*]] = fneg <2 x float> [[V2]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FNEG_I]], <2 x float> [[V3]], <2 x float> [[V1]])
+// UNCONSTRAINED-NEXT: ret <2 x float> [[TMP0]]
+//
+// CONSTRAINED-LABEL: define dso_local <2 x float> @test_vfms_f32(
+// CONSTRAINED-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]], <2 x float> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[FNEG_I:%.*]] = fneg <2 x float> [[V2]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call <2 x float> @llvm.experimental.constrained.fma.v2f32(<2 x float> [[FNEG_I]], <2 x float> [[V3]], <2 x float> [[V1]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: ret <2 x float> [[TMP0]]
+//
float32x2_t test_vfms_f32(float32x2_t v1, float32x2_t v2, float32x2_t v3) {
return vfms_f32(v1, v2, v3);
}
-// COMMON-LABEL: test_vfmsq_f32
-// COMMONIR: [[SUB_I:%.*]] = fneg <4 x float> %v2
-// COMMONIR: [[TMP0:%.*]] = bitcast <4 x float> %v1 to <16 x i8>
-// COMMONIR: [[TMP1:%.*]] = bitcast <4 x float> [[SUB_I]] to <16 x i8>
-// COMMONIR: [[TMP2:%.*]] = bitcast <4 x float> %v3 to <16 x i8>
-// UNCONSTRAINED: [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[SUB_I]], <4 x float> %v3, <4 x float> %v1)
-// CONSTRAINED: [[TMP3:%.*]] = call <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float> [[SUB_I]], <4 x float> %v3, <4 x float> %v1, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// COMMONIR: ret <4 x float> [[TMP3]]
+// UNCONSTRAINED-LABEL: define dso_local <4 x float> @test_vfmsq_f32(
+// UNCONSTRAINED-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]], <4 x float> noundef [[V3:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[FNEG_I:%.*]] = fneg <4 x float> [[V2]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FNEG_I]], <4 x float> [[V3]], <4 x float> [[V1]])
+// UNCONSTRAINED-NEXT: ret <4 x float> [[TMP0]]
+//
+// CONSTRAINED-LABEL: define dso_local <4 x float> @test_vfmsq_f32(
+// CONSTRAINED-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]], <4 x float> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[FNEG_I:%.*]] = fneg <4 x float> [[V2]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float> [[FNEG_I]], <4 x float> [[V3]], <4 x float> [[V1]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: ret <4 x float> [[TMP0]]
+//
float32x4_t test_vfmsq_f32(float32x4_t v1, float32x4_t v2, float32x4_t v3) {
return vfmsq_f32(v1, v2, v3);
}
-// COMMON-LABEL: test_vfmsq_f64
-// COMMONIR: [[SUB_I:%.*]] = fneg <2 x double> %v2
-// COMMONIR: [[TMP0:%.*]] = bitcast <2 x double> %v1 to <16 x i8>
-// COMMONIR: [[TMP1:%.*]] = bitcast <2 x double> [[SUB_I]] to <16 x i8>
-// COMMONIR: [[TMP2:%.*]] = bitcast <2 x double> %v3 to <16 x i8>
-// UNCONSTRAINED: [[TMP3:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[SUB_I]], <2 x double> %v3, <2 x double> %v1)
-// CONSTRAINED: [[TMP3:%.*]] = call <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double> [[SUB_I]], <2 x double> %v3, <2 x double> %v1, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// COMMONIR: ret <2 x double> [[TMP3]]
+// UNCONSTRAINED-LABEL: define dso_local <2 x double> @test_vfmsq_f64(
+// UNCONSTRAINED-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]], <2 x double> noundef [[V3:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[FNEG_I:%.*]] = fneg <2 x double> [[V2]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[FNEG_I]], <2 x double> [[V3]], <2 x double> [[V1]])
+// UNCONSTRAINED-NEXT: ret <2 x double> [[TMP0]]
+//
+// CONSTRAINED-LABEL: define dso_local <2 x double> @test_vfmsq_f64(
+// CONSTRAINED-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]], <2 x double> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[FNEG_I:%.*]] = fneg <2 x double> [[V2]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double> [[FNEG_I]], <2 x double> [[V3]], <2 x double> [[V1]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: ret <2 x double> [[TMP0]]
+//
float64x2_t test_vfmsq_f64(float64x2_t v1, float64x2_t v2, float64x2_t v3) {
return vfmsq_f64(v1, v2, v3);
}
-// COMMON-LABEL: test_vdivq_f64
-// UNCONSTRAINED: [[DIV_I:%.*]] = fdiv <2 x double> %v1, %v2
-// CONSTRAINED: [[DIV_I:%.*]] = call <2 x double> @llvm.experimental.constrained.fdiv.v2f64(<2 x double> %v1, <2 x double> %v2, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// COMMONIR: ret <2 x double> [[DIV_I]]
+// UNCONSTRAINED-LABEL: define dso_local <2 x double> @test_vdivq_f64(
+// UNCONSTRAINED-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[DIV_I:%.*]] = fdiv <2 x double> [[V1]], [[V2]]
+// UNCONSTRAINED-NEXT: ret <2 x double> [[DIV_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <2 x double> @test_vdivq_f64(
+// CONSTRAINED-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[DIV_I:%.*]] = call <2 x double> @llvm.experimental.constrained.fdiv.v2f64(<2 x double> [[V1]], <2 x double> [[V2]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: ret <2 x double> [[DIV_I]]
+//
float64x2_t test_vdivq_f64(float64x2_t v1, float64x2_t v2) {
return vdivq_f64(v1, v2);
}
-// COMMON-LABEL: test_vdivq_f32
-// UNCONSTRAINED: [[DIV_I:%.*]] = fdiv <4 x float> %v1, %v2
-// CONSTRAINED: [[DIV_I:%.*]] = call <4 x float> @llvm.experimental.constrained.fdiv.v4f32(<4 x float> %v1, <4 x float> %v2, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// COMMONIR: ret <4 x float> [[DIV_I]]
+// UNCONSTRAINED-LABEL: define dso_local <4 x float> @test_vdivq_f32(
+// UNCONSTRAINED-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[DIV_I:%.*]] = fdiv <4 x float> [[V1]], [[V2]]
+// UNCONSTRAINED-NEXT: ret <4 x float> [[DIV_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <4 x float> @test_vdivq_f32(
+// CONSTRAINED-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[DIV_I:%.*]] = call <4 x float> @llvm.experimental.constrained.fdiv.v4f32(<4 x float> [[V1]], <4 x float> [[V2]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: ret <4 x float> [[DIV_I]]
+//
float32x4_t test_vdivq_f32(float32x4_t v1, float32x4_t v2) {
return vdivq_f32(v1, v2);
}
-// COMMON-LABEL: test_vdiv_f32
-// UNCONSTRAINED: [[DIV_I:%.*]] = fdiv <2 x float> %v1, %v2
-// CONSTRAINED: [[DIV_I:%.*]] = call <2 x float> @llvm.experimental.constrained.fdiv.v2f32(<2 x float> %v1, <2 x float> %v2, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// COMMONIR: ret <2 x float> [[DIV_I]]
+// UNCONSTRAINED-LABEL: define dso_local <2 x float> @test_vdiv_f32(
+// UNCONSTRAINED-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[DIV_I:%.*]] = fdiv <2 x float> [[V1]], [[V2]]
+// UNCONSTRAINED-NEXT: ret <2 x float> [[DIV_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <2 x float> @test_vdiv_f32(
+// CONSTRAINED-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[DIV_I:%.*]] = call <2 x float> @llvm.experimental.constrained.fdiv.v2f32(<2 x float> [[V1]], <2 x float> [[V2]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: ret <2 x float> [[DIV_I]]
+//
float32x2_t test_vdiv_f32(float32x2_t v1, float32x2_t v2) {
return vdiv_f32(v1, v2);
}
-// COMMON-LABEL: test_vceq_f32
-// UNCONSTRAINED: [[CMP_I:%.*]] = fcmp oeq <2 x float> %v1, %v2
-// CONSTRAINED: [[CMP_I:%.*]] = call <2 x i1> @llvm.experimental.constrained.fcmp.v2f32(<2 x float> %v1, <2 x float> %v2, metadata !"oeq", metadata !"fpexcept.strict")
-// COMMONIR: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
-// COMMONIR: ret <2 x i32> [[SEXT_I]]
+// UNCONSTRAINED-LABEL: define dso_local <2 x i32> @test_vceq_f32(
+// UNCONSTRAINED-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[CMP_I:%.*]] = fcmp oeq <2 x float> [[V1]], [[V2]]
+// UNCONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// UNCONSTRAINED-NEXT: ret <2 x i32> [[SEXT_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <2 x i32> @test_vceq_f32(
+// CONSTRAINED-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[CMP_I:%.*]] = call <2 x i1> @llvm.experimental.constrained.fcmp.v2f32(<2 x float> [[V1]], <2 x float> [[V2]], metadata !"oeq", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CONSTRAINED-NEXT: ret <2 x i32> [[SEXT_I]]
+//
uint32x2_t test_vceq_f32(float32x2_t v1, float32x2_t v2) {
return vceq_f32(v1, v2);
}
-// COMMON-LABEL: test_vceq_f64
-// UNCONSTRAINED: [[CMP_I:%.*]] = fcmp oeq <1 x double> %a, %b
-// CONSTRAINED: [[CMP_I:%.*]] = call <1 x i1> @llvm.experimental.constrained.fcmp.v1f64(<1 x double> %a, <1 x double> %b, metadata !"oeq", metadata !"fpexcept.strict")
-// COMMONIR: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
-// COMMONIR: ret <1 x i64> [[SEXT_I]]
+// UNCONSTRAINED-LABEL: define dso_local <1 x i64> @test_vceq_f64(
+// UNCONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[CMP_I:%.*]] = fcmp oeq <1 x double> [[A]], [[B]]
+// UNCONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
+// UNCONSTRAINED-NEXT: ret <1 x i64> [[SEXT_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <1 x i64> @test_vceq_f64(
+// CONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[CMP_I:%.*]] = call <1 x i1> @llvm.experimental.constrained.fcmp.v1f64(<1 x double> [[A]], <1 x double> [[B]], metadata !"oeq", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
+// CONSTRAINED-NEXT: ret <1 x i64> [[SEXT_I]]
+//
uint64x1_t test_vceq_f64(float64x1_t a, float64x1_t b) {
return vceq_f64(a, b);
}
-// COMMON-LABEL: test_vceqq_f32
-// UNCONSTRAINED: [[CMP_I:%.*]] = fcmp oeq <4 x float> %v1, %v2
-// CONSTRAINED: [[CMP_I:%.*]] = call <4 x i1> @llvm.experimental.constrained.fcmp.v4f32(<4 x float> %v1, <4 x float> %v2, metadata !"oeq", metadata !"fpexcept.strict")
-// COMMONIR: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
-// COMMONIR: ret <4 x i32> [[SEXT_I]]
+// UNCONSTRAINED-LABEL: define dso_local <4 x i32> @test_vceqq_f32(
+// UNCONSTRAINED-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[CMP_I:%.*]] = fcmp oeq <4 x float> [[V1]], [[V2]]
+// UNCONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// UNCONSTRAINED-NEXT: ret <4 x i32> [[SEXT_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <4 x i32> @test_vceqq_f32(
+// CONSTRAINED-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[CMP_I:%.*]] = call <4 x i1> @llvm.experimental.constrained.fcmp.v4f32(<4 x float> [[V1]], <4 x float> [[V2]], metadata !"oeq", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CONSTRAINED-NEXT: ret <4 x i32> [[SEXT_I]]
+//
uint32x4_t test_vceqq_f32(float32x4_t v1, float32x4_t v2) {
return vceqq_f32(v1, v2);
}
-// COMMON-LABEL: test_vceqq_f64
-// UNCONSTRAINED: [[CMP_I:%.*]] = fcmp oeq <2 x double> %v1, %v2
-// CONSTRAINED: [[CMP_I:%.*]] = call <2 x i1> @llvm.experimental.constrained.fcmp.v2f64(<2 x double> %v1, <2 x double> %v2, metadata !"oeq", metadata !"fpexcept.strict")
-// COMMONIR: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
-// COMMONIR: ret <2 x i64> [[SEXT_I]]
+// UNCONSTRAINED-LABEL: define dso_local <2 x i64> @test_vceqq_f64(
+// UNCONSTRAINED-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[CMP_I:%.*]] = fcmp oeq <2 x double> [[V1]], [[V2]]
+// UNCONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
+// UNCONSTRAINED-NEXT: ret <2 x i64> [[SEXT_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <2 x i64> @test_vceqq_f64(
+// CONSTRAINED-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[CMP_I:%.*]] = call <2 x i1> @llvm.experimental.constrained.fcmp.v2f64(<2 x double> [[V1]], <2 x double> [[V2]], metadata !"oeq", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
+// CONSTRAINED-NEXT: ret <2 x i64> [[SEXT_I]]
+//
uint64x2_t test_vceqq_f64(float64x2_t v1, float64x2_t v2) {
return vceqq_f64(v1, v2);
}
-// COMMON-LABEL: test_vcge_f32
-// UNCONSTRAINED: [[CMP_I:%.*]] = fcmp oge <2 x float> %v1, %v2
-// CONSTRAINED: [[CMP_I:%.*]] = call <2 x i1> @llvm.experimental.constrained.fcmps.v2f32(<2 x float> %v1, <2 x float> %v2, metadata !"oge", metadata !"fpexcept.strict")
-// COMMONIR: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
-// COMMONIR: ret <2 x i32> [[SEXT_I]]
+// UNCONSTRAINED-LABEL: define dso_local <2 x i32> @test_vcge_f32(
+// UNCONSTRAINED-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[CMP_I:%.*]] = fcmp oge <2 x float> [[V1]], [[V2]]
+// UNCONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// UNCONSTRAINED-NEXT: ret <2 x i32> [[SEXT_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <2 x i32> @test_vcge_f32(
+// CONSTRAINED-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[CMP_I:%.*]] = call <2 x i1> @llvm.experimental.constrained.fcmps.v2f32(<2 x float> [[V1]], <2 x float> [[V2]], metadata !"oge", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CONSTRAINED-NEXT: ret <2 x i32> [[SEXT_I]]
+//
uint32x2_t test_vcge_f32(float32x2_t v1, float32x2_t v2) {
return vcge_f32(v1, v2);
}
-// COMMON-LABEL: test_vcge_f64
-// UNCONSTRAINED: [[CMP_I:%.*]] = fcmp oge <1 x double> %a, %b
-// CONSTRAINED: [[CMP_I:%.*]] = call <1 x i1> @llvm.experimental.constrained.fcmps.v1f64(<1 x double> %a, <1 x double> %b, metadata !"oge", metadata !"fpexcept.strict")
-// COMMONIR: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
-// COMMONIR: ret <1 x i64> [[SEXT_I]]
+// UNCONSTRAINED-LABEL: define dso_local <1 x i64> @test_vcge_f64(
+// UNCONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[CMP_I:%.*]] = fcmp oge <1 x double> [[A]], [[B]]
+// UNCONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
+// UNCONSTRAINED-NEXT: ret <1 x i64> [[SEXT_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <1 x i64> @test_vcge_f64(
+// CONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[CMP_I:%.*]] = call <1 x i1> @llvm.experimental.constrained.fcmps.v1f64(<1 x double> [[A]], <1 x double> [[B]], metadata !"oge", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
+// CONSTRAINED-NEXT: ret <1 x i64> [[SEXT_I]]
+//
uint64x1_t test_vcge_f64(float64x1_t a, float64x1_t b) {
return vcge_f64(a, b);
}
-// COMMON-LABEL: test_vcgeq_f32
-// UNCONSTRAINED: [[CMP_I:%.*]] = fcmp oge <4 x float> %v1, %v2
-// CONSTRAINED: [[CMP_I:%.*]] = call <4 x i1> @llvm.experimental.constrained.fcmps.v4f32(<4 x float> %v1, <4 x float> %v2, metadata !"oge", metadata !"fpexcept.strict")
-// COMMONIR: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
-// COMMONIR: ret <4 x i32> [[SEXT_I]]
+// UNCONSTRAINED-LABEL: define dso_local <4 x i32> @test_vcgeq_f32(
+// UNCONSTRAINED-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[CMP_I:%.*]] = fcmp oge <4 x float> [[V1]], [[V2]]
+// UNCONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// UNCONSTRAINED-NEXT: ret <4 x i32> [[SEXT_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <4 x i32> @test_vcgeq_f32(
+// CONSTRAINED-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[CMP_I:%.*]] = call <4 x i1> @llvm.experimental.constrained.fcmps.v4f32(<4 x float> [[V1]], <4 x float> [[V2]], metadata !"oge", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CONSTRAINED-NEXT: ret <4 x i32> [[SEXT_I]]
+//
uint32x4_t test_vcgeq_f32(float32x4_t v1, float32x4_t v2) {
return vcgeq_f32(v1, v2);
}
-// COMMON-LABEL: test_vcgeq_f64
-// UNCONSTRAINED: [[CMP_I:%.*]] = fcmp oge <2 x double> %v1, %v2
-// CONSTRAINED: [[CMP_I:%.*]] = call <2 x i1> @llvm.experimental.constrained.fcmps.v2f64(<2 x double> %v1, <2 x double> %v2, metadata !"oge", metadata !"fpexcept.strict")
-// COMMONIR: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
-// COMMONIR: ret <2 x i64> [[SEXT_I]]
+// UNCONSTRAINED-LABEL: define dso_local <2 x i64> @test_vcgeq_f64(
+// UNCONSTRAINED-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[CMP_I:%.*]] = fcmp oge <2 x double> [[V1]], [[V2]]
+// UNCONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
+// UNCONSTRAINED-NEXT: ret <2 x i64> [[SEXT_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <2 x i64> @test_vcgeq_f64(
+// CONSTRAINED-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[CMP_I:%.*]] = call <2 x i1> @llvm.experimental.constrained.fcmps.v2f64(<2 x double> [[V1]], <2 x double> [[V2]], metadata !"oge", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
+// CONSTRAINED-NEXT: ret <2 x i64> [[SEXT_I]]
+//
uint64x2_t test_vcgeq_f64(float64x2_t v1, float64x2_t v2) {
return vcgeq_f64(v1, v2);
}
-// COMMON-LABEL: test_vcle_f32
-// UNCONSTRAINED: [[CMP_I:%.*]] = fcmp ole <2 x float> %v1, %v2
-// CONSTRAINED: [[CMP_I:%.*]] = call <2 x i1> @llvm.experimental.constrained.fcmps.v2f32(<2 x float> %v1, <2 x float> %v2, metadata !"ole", metadata !"fpexcept.strict")
-// COMMONIR: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
-// COMMONIR: ret <2 x i32> [[SEXT_I]]
+// UNCONSTRAINED-LABEL: define dso_local <2 x i32> @test_vcle_f32(
+// UNCONSTRAINED-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[CMP_I:%.*]] = fcmp ole <2 x float> [[V1]], [[V2]]
+// UNCONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// UNCONSTRAINED-NEXT: ret <2 x i32> [[SEXT_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <2 x i32> @test_vcle_f32(
+// CONSTRAINED-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[CMP_I:%.*]] = call <2 x i1> @llvm.experimental.constrained.fcmps.v2f32(<2 x float> [[V1]], <2 x float> [[V2]], metadata !"ole", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CONSTRAINED-NEXT: ret <2 x i32> [[SEXT_I]]
+//
uint32x2_t test_vcle_f32(float32x2_t v1, float32x2_t v2) {
return vcle_f32(v1, v2);
}
-// COMMON-LABEL: test_vcle_f64
-// UNCONSTRAINED: [[CMP_I:%.*]] = fcmp ole <1 x double> %a, %b
-// CONSTRAINED: [[CMP_I:%.*]] = call <1 x i1> @llvm.experimental.constrained.fcmps.v1f64(<1 x double> %a, <1 x double> %b, metadata !"ole", metadata !"fpexcept.strict")
-// COMMONIR: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
-// COMMONIR: ret <1 x i64> [[SEXT_I]]
+// UNCONSTRAINED-LABEL: define dso_local <1 x i64> @test_vcle_f64(
+// UNCONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[CMP_I:%.*]] = fcmp ole <1 x double> [[A]], [[B]]
+// UNCONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
+// UNCONSTRAINED-NEXT: ret <1 x i64> [[SEXT_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <1 x i64> @test_vcle_f64(
+// CONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[CMP_I:%.*]] = call <1 x i1> @llvm.experimental.constrained.fcmps.v1f64(<1 x double> [[A]], <1 x double> [[B]], metadata !"ole", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
+// CONSTRAINED-NEXT: ret <1 x i64> [[SEXT_I]]
+//
uint64x1_t test_vcle_f64(float64x1_t a, float64x1_t b) {
return vcle_f64(a, b);
}
-// COMMON-LABEL: test_vcleq_f32
-// UNCONSTRAINED: [[CMP_I:%.*]] = fcmp ole <4 x float> %v1, %v2
-// CONSTRAINED: [[CMP_I:%.*]] = call <4 x i1> @llvm.experimental.constrained.fcmps.v4f32(<4 x float> %v1, <4 x float> %v2, metadata !"ole", metadata !"fpexcept.strict")
-// COMMONIR: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
-// COMMONIR: ret <4 x i32> [[SEXT_I]]
+// UNCONSTRAINED-LABEL: define dso_local <4 x i32> @test_vcleq_f32(
+// UNCONSTRAINED-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[CMP_I:%.*]] = fcmp ole <4 x float> [[V1]], [[V2]]
+// UNCONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// UNCONSTRAINED-NEXT: ret <4 x i32> [[SEXT_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <4 x i32> @test_vcleq_f32(
+// CONSTRAINED-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[CMP_I:%.*]] = call <4 x i1> @llvm.experimental.constrained.fcmps.v4f32(<4 x float> [[V1]], <4 x float> [[V2]], metadata !"ole", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CONSTRAINED-NEXT: ret <4 x i32> [[SEXT_I]]
+//
uint32x4_t test_vcleq_f32(float32x4_t v1, float32x4_t v2) {
return vcleq_f32(v1, v2);
}
-// COMMON-LABEL: test_vcleq_f64
-// UNCONSTRAINED: [[CMP_I:%.*]] = fcmp ole <2 x double> %v1, %v2
-// CONSTRAINED: [[CMP_I:%.*]] = call <2 x i1> @llvm.experimental.constrained.fcmps.v2f64(<2 x double> %v1, <2 x double> %v2, metadata !"ole", metadata !"fpexcept.strict")
-// COMMONIR: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
-// COMMONIR: ret <2 x i64> [[SEXT_I]]
+// UNCONSTRAINED-LABEL: define dso_local <2 x i64> @test_vcleq_f64(
+// UNCONSTRAINED-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[CMP_I:%.*]] = fcmp ole <2 x double> [[V1]], [[V2]]
+// UNCONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
+// UNCONSTRAINED-NEXT: ret <2 x i64> [[SEXT_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <2 x i64> @test_vcleq_f64(
+// CONSTRAINED-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[CMP_I:%.*]] = call <2 x i1> @llvm.experimental.constrained.fcmps.v2f64(<2 x double> [[V1]], <2 x double> [[V2]], metadata !"ole", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
+// CONSTRAINED-NEXT: ret <2 x i64> [[SEXT_I]]
+//
uint64x2_t test_vcleq_f64(float64x2_t v1, float64x2_t v2) {
return vcleq_f64(v1, v2);
}
-// COMMON-LABEL: test_vcgt_f32
-// UNCONSTRAINED: [[CMP_I:%.*]] = fcmp ogt <2 x float> %v1, %v2
-// CONSTRAINED: [[CMP_I:%.*]] = call <2 x i1> @llvm.experimental.constrained.fcmps.v2f32(<2 x float> %v1, <2 x float> %v2, metadata !"ogt", metadata !"fpexcept.strict")
-// COMMONIR: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
-// COMMONIR: ret <2 x i32> [[SEXT_I]]
+// UNCONSTRAINED-LABEL: define dso_local <2 x i32> @test_vcgt_f32(
+// UNCONSTRAINED-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[CMP_I:%.*]] = fcmp ogt <2 x float> [[V1]], [[V2]]
+// UNCONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// UNCONSTRAINED-NEXT: ret <2 x i32> [[SEXT_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <2 x i32> @test_vcgt_f32(
+// CONSTRAINED-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[CMP_I:%.*]] = call <2 x i1> @llvm.experimental.constrained.fcmps.v2f32(<2 x float> [[V1]], <2 x float> [[V2]], metadata !"ogt", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CONSTRAINED-NEXT: ret <2 x i32> [[SEXT_I]]
+//
uint32x2_t test_vcgt_f32(float32x2_t v1, float32x2_t v2) {
return vcgt_f32(v1, v2);
}
-// COMMON-LABEL: test_vcgt_f64
-// UNCONSTRAINED: [[CMP_I:%.*]] = fcmp ogt <1 x double> %a, %b
-// CONSTRAINED: [[CMP_I:%.*]] = call <1 x i1> @llvm.experimental.constrained.fcmps.v1f64(<1 x double> %a, <1 x double> %b, metadata !"ogt", metadata !"fpexcept.strict")
-// COMMONIR: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
-// COMMONIR: ret <1 x i64> [[SEXT_I]]
+// UNCONSTRAINED-LABEL: define dso_local <1 x i64> @test_vcgt_f64(
+// UNCONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[CMP_I:%.*]] = fcmp ogt <1 x double> [[A]], [[B]]
+// UNCONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
+// UNCONSTRAINED-NEXT: ret <1 x i64> [[SEXT_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <1 x i64> @test_vcgt_f64(
+// CONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[CMP_I:%.*]] = call <1 x i1> @llvm.experimental.constrained.fcmps.v1f64(<1 x double> [[A]], <1 x double> [[B]], metadata !"ogt", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
+// CONSTRAINED-NEXT: ret <1 x i64> [[SEXT_I]]
+//
uint64x1_t test_vcgt_f64(float64x1_t a, float64x1_t b) {
return vcgt_f64(a, b);
}
-// COMMON-LABEL: test_vcgtq_f32
-// UNCONSTRAINED: [[CMP_I:%.*]] = fcmp ogt <4 x float> %v1, %v2
-// CONSTRAINED: [[CMP_I:%.*]] = call <4 x i1> @llvm.experimental.constrained.fcmps.v4f32(<4 x float> %v1, <4 x float> %v2, metadata !"ogt", metadata !"fpexcept.strict")
-// COMMONIR: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
-// COMMONIR: ret <4 x i32> [[SEXT_I]]
+// UNCONSTRAINED-LABEL: define dso_local <4 x i32> @test_vcgtq_f32(
+// UNCONSTRAINED-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[CMP_I:%.*]] = fcmp ogt <4 x float> [[V1]], [[V2]]
+// UNCONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// UNCONSTRAINED-NEXT: ret <4 x i32> [[SEXT_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <4 x i32> @test_vcgtq_f32(
+// CONSTRAINED-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[CMP_I:%.*]] = call <4 x i1> @llvm.experimental.constrained.fcmps.v4f32(<4 x float> [[V1]], <4 x float> [[V2]], metadata !"ogt", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CONSTRAINED-NEXT: ret <4 x i32> [[SEXT_I]]
+//
uint32x4_t test_vcgtq_f32(float32x4_t v1, float32x4_t v2) {
return vcgtq_f32(v1, v2);
}
-// COMMON-LABEL: test_vcgtq_f64
-// UNCONSTRAINED: [[CMP_I:%.*]] = fcmp ogt <2 x double> %v1, %v2
-// CONSTRAINED: [[CMP_I:%.*]] = call <2 x i1> @llvm.experimental.constrained.fcmps.v2f64(<2 x double> %v1, <2 x double> %v2, metadata !"ogt", metadata !"fpexcept.strict")
-// COMMONIR: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
-// COMMONIR: ret <2 x i64> [[SEXT_I]]
+// UNCONSTRAINED-LABEL: define dso_local <2 x i64> @test_vcgtq_f64(
+// UNCONSTRAINED-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[CMP_I:%.*]] = fcmp ogt <2 x double> [[V1]], [[V2]]
+// UNCONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
+// UNCONSTRAINED-NEXT: ret <2 x i64> [[SEXT_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <2 x i64> @test_vcgtq_f64(
+// CONSTRAINED-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[CMP_I:%.*]] = call <2 x i1> @llvm.experimental.constrained.fcmps.v2f64(<2 x double> [[V1]], <2 x double> [[V2]], metadata !"ogt", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
+// CONSTRAINED-NEXT: ret <2 x i64> [[SEXT_I]]
+//
uint64x2_t test_vcgtq_f64(float64x2_t v1, float64x2_t v2) {
return vcgtq_f64(v1, v2);
}
-// COMMON-LABEL: test_vclt_f32
-// UNCONSTRAINED: [[CMP_I:%.*]] = fcmp olt <2 x float> %v1, %v2
-// CONSTRAINED: [[CMP_I:%.*]] = call <2 x i1> @llvm.experimental.constrained.fcmps.v2f32(<2 x float> %v1, <2 x float> %v2, metadata !"olt", metadata !"fpexcept.strict")
-// COMMONIR: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
-// COMMONIR: ret <2 x i32> [[SEXT_I]]
+// UNCONSTRAINED-LABEL: define dso_local <2 x i32> @test_vclt_f32(
+// UNCONSTRAINED-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[CMP_I:%.*]] = fcmp olt <2 x float> [[V1]], [[V2]]
+// UNCONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// UNCONSTRAINED-NEXT: ret <2 x i32> [[SEXT_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <2 x i32> @test_vclt_f32(
+// CONSTRAINED-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[CMP_I:%.*]] = call <2 x i1> @llvm.experimental.constrained.fcmps.v2f32(<2 x float> [[V1]], <2 x float> [[V2]], metadata !"olt", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CONSTRAINED-NEXT: ret <2 x i32> [[SEXT_I]]
+//
uint32x2_t test_vclt_f32(float32x2_t v1, float32x2_t v2) {
return vclt_f32(v1, v2);
}
-// COMMON-LABEL: test_vclt_f64
-// UNCONSTRAINED: [[CMP_I:%.*]] = fcmp olt <1 x double> %a, %b
-// CONSTRAINED: [[CMP_I:%.*]] = call <1 x i1> @llvm.experimental.constrained.fcmps.v1f64(<1 x double> %a, <1 x double> %b, metadata !"olt", metadata !"fpexcept.strict")
-// COMMONIR: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
-// COMMONIR: ret <1 x i64> [[SEXT_I]]
+// UNCONSTRAINED-LABEL: define dso_local <1 x i64> @test_vclt_f64(
+// UNCONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[CMP_I:%.*]] = fcmp olt <1 x double> [[A]], [[B]]
+// UNCONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
+// UNCONSTRAINED-NEXT: ret <1 x i64> [[SEXT_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <1 x i64> @test_vclt_f64(
+// CONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[CMP_I:%.*]] = call <1 x i1> @llvm.experimental.constrained.fcmps.v1f64(<1 x double> [[A]], <1 x double> [[B]], metadata !"olt", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
+// CONSTRAINED-NEXT: ret <1 x i64> [[SEXT_I]]
+//
uint64x1_t test_vclt_f64(float64x1_t a, float64x1_t b) {
return vclt_f64(a, b);
}
-// COMMON-LABEL: test_vcltq_f32
-// UNCONSTRAINED: [[CMP_I:%.*]] = fcmp olt <4 x float> %v1, %v2
-// CONSTRAINED: [[CMP_I:%.*]] = call <4 x i1> @llvm.experimental.constrained.fcmps.v4f32(<4 x float> %v1, <4 x float> %v2, metadata !"olt", metadata !"fpexcept.strict")
-// COMMONIR: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
-// COMMONIR: ret <4 x i32> [[SEXT_I]]
+// UNCONSTRAINED-LABEL: define dso_local <4 x i32> @test_vcltq_f32(
+// UNCONSTRAINED-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[CMP_I:%.*]] = fcmp olt <4 x float> [[V1]], [[V2]]
+// UNCONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// UNCONSTRAINED-NEXT: ret <4 x i32> [[SEXT_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <4 x i32> @test_vcltq_f32(
+// CONSTRAINED-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[CMP_I:%.*]] = call <4 x i1> @llvm.experimental.constrained.fcmps.v4f32(<4 x float> [[V1]], <4 x float> [[V2]], metadata !"olt", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CONSTRAINED-NEXT: ret <4 x i32> [[SEXT_I]]
+//
uint32x4_t test_vcltq_f32(float32x4_t v1, float32x4_t v2) {
return vcltq_f32(v1, v2);
}
-// COMMON-LABEL: test_vcltq_f64
-// UNCONSTRAINED: [[CMP_I:%.*]] = fcmp olt <2 x double> %v1, %v2
-// CONSTRAINED: [[CMP_I:%.*]] = call <2 x i1> @llvm.experimental.constrained.fcmps.v2f64(<2 x double> %v1, <2 x double> %v2, metadata !"olt", metadata !"fpexcept.strict")
-// COMMONIR: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
-// COMMONIR: ret <2 x i64> [[SEXT_I]]
+// UNCONSTRAINED-LABEL: define dso_local <2 x i64> @test_vcltq_f64(
+// UNCONSTRAINED-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[CMP_I:%.*]] = fcmp olt <2 x double> [[V1]], [[V2]]
+// UNCONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
+// UNCONSTRAINED-NEXT: ret <2 x i64> [[SEXT_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <2 x i64> @test_vcltq_f64(
+// CONSTRAINED-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[CMP_I:%.*]] = call <2 x i1> @llvm.experimental.constrained.fcmps.v2f64(<2 x double> [[V1]], <2 x double> [[V2]], metadata !"olt", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
+// CONSTRAINED-NEXT: ret <2 x i64> [[SEXT_I]]
+//
uint64x2_t test_vcltq_f64(float64x2_t v1, float64x2_t v2) {
return vcltq_f64(v1, v2);
}
-// COMMON-LABEL: test_vpadds_f32
-// COMMONIR: [[LANE0_I:%.*]] = extractelement <2 x float> %a, i64 0
-// COMMONIR: [[LANE1_I:%.*]] = extractelement <2 x float> %a, i64 1
-// UNCONSTRAINED: [[VPADDD_I:%.*]] = fadd float [[LANE0_I]], [[LANE1_I]]
-// CONSTRAINED: [[VPADDD_I:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[LANE0_I]], float [[LANE1_I]], metadata !"round.tonearest", metadata !"fpexcept.strict"
-// COMMONIR: ret float [[VPADDD_I]]
+// UNCONSTRAINED-LABEL: define dso_local float @test_vpadds_f32(
+// UNCONSTRAINED-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[LANE0_I:%.*]] = extractelement <2 x float> [[A]], i64 0
+// UNCONSTRAINED-NEXT: [[LANE1_I:%.*]] = extractelement <2 x float> [[A]], i64 1
+// UNCONSTRAINED-NEXT: [[VPADDD_I:%.*]] = fadd float [[LANE0_I]], [[LANE1_I]]
+// UNCONSTRAINED-NEXT: ret float [[VPADDD_I]]
+//
+// CONSTRAINED-LABEL: define dso_local float @test_vpadds_f32(
+// CONSTRAINED-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[LANE0_I:%.*]] = extractelement <2 x float> [[A]], i64 0
+// CONSTRAINED-NEXT: [[LANE1_I:%.*]] = extractelement <2 x float> [[A]], i64 1
+// CONSTRAINED-NEXT: [[VPADDD_I:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[LANE0_I]], float [[LANE1_I]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: ret float [[VPADDD_I]]
+//
float32_t test_vpadds_f32(float32x2_t a) {
return vpadds_f32(a);
}
-// COMMON-LABEL: test_vpaddd_f64
-// COMMONIR: [[LANE0_I:%.*]] = extractelement <2 x double> %a, i64 0
-// COMMONIR: [[LANE1_I:%.*]] = extractelement <2 x double> %a, i64 1
-// UNCONSTRAINED: [[VPADDD_I:%.*]] = fadd double [[LANE0_I]], [[LANE1_I]]
-// CONSTRAINED: [[VPADDD_I:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[LANE0_I]], double [[LANE1_I]], metadata !"round.tonearest", metadata !"fpexcept.strict")
-// COMMONIR: ret double [[VPADDD_I]]
+// UNCONSTRAINED-LABEL: define dso_local double @test_vpaddd_f64(
+// UNCONSTRAINED-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[LANE0_I:%.*]] = extractelement <2 x double> [[A]], i64 0
+// UNCONSTRAINED-NEXT: [[LANE1_I:%.*]] = extractelement <2 x double> [[A]], i64 1
+// UNCONSTRAINED-NEXT: [[VPADDD_I:%.*]] = fadd double [[LANE0_I]], [[LANE1_I]]
+// UNCONSTRAINED-NEXT: ret double [[VPADDD_I]]
+//
+// CONSTRAINED-LABEL: define dso_local double @test_vpaddd_f64(
+// CONSTRAINED-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[LANE0_I:%.*]] = extractelement <2 x double> [[A]], i64 0
+// CONSTRAINED-NEXT: [[LANE1_I:%.*]] = extractelement <2 x double> [[A]], i64 1
+// CONSTRAINED-NEXT: [[VPADDD_I:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[LANE0_I]], double [[LANE1_I]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: ret double [[VPADDD_I]]
+//
float64_t test_vpaddd_f64(float64x2_t a) {
return vpaddd_f64(a);
}
-// COMMON-LABEL: test_vcvts_f32_s32
-// UNCONSTRAINED: [[TMP0:%.*]] = sitofp i32 %a to float
-// CONSTRAINED: [[TMP0:%.*]] = call float @llvm.experimental.constrained.sitofp.f32.i32(i32 %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// COMMONIR: ret float [[TMP0]]
+// UNCONSTRAINED-LABEL: define dso_local float @test_vcvts_f32_s32(
+// UNCONSTRAINED-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = sitofp i32 [[A]] to float
+// UNCONSTRAINED-NEXT: ret float [[TMP0]]
+//
+// CONSTRAINED-LABEL: define dso_local float @test_vcvts_f32_s32(
+// CONSTRAINED-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call float @llvm.experimental.constrained.sitofp.f32.i32(i32 [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: ret float [[TMP0]]
+//
float32_t test_vcvts_f32_s32(int32_t a) {
return vcvts_f32_s32(a);
}
-// COMMON-LABEL: test_vcvtd_f64_s64
-// UNCONSTRAINED: [[TMP0:%.*]] = sitofp i64 %a to double
-// CONSTRAINED: [[TMP0:%.*]] = call double @llvm.experimental.constrained.sitofp.f64.i64(i64 %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// COMMONIR: ret double [[TMP0]]
+// UNCONSTRAINED-LABEL: define dso_local double @test_vcvtd_f64_s64(
+// UNCONSTRAINED-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = sitofp i64 [[A]] to double
+// UNCONSTRAINED-NEXT: ret double [[TMP0]]
+//
+// CONSTRAINED-LABEL: define dso_local double @test_vcvtd_f64_s64(
+// CONSTRAINED-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call double @llvm.experimental.constrained.sitofp.f64.i64(i64 [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: ret double [[TMP0]]
+//
float64_t test_vcvtd_f64_s64(int64_t a) {
return vcvtd_f64_s64(a);
}
-// COMMON-LABEL: test_vcvts_f32_u32
-// UNCONSTRAINED: [[TMP0:%.*]] = uitofp i32 %a to float
-// CONSTRAINED: [[TMP0:%.*]] = call float @llvm.experimental.constrained.uitofp.f32.i32(i32 %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// COMMONIR: ret float [[TMP0]]
+// UNCONSTRAINED-LABEL: define dso_local float @test_vcvts_f32_u32(
+// UNCONSTRAINED-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = uitofp i32 [[A]] to float
+// UNCONSTRAINED-NEXT: ret float [[TMP0]]
+//
+// CONSTRAINED-LABEL: define dso_local float @test_vcvts_f32_u32(
+// CONSTRAINED-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call float @llvm.experimental.constrained.uitofp.f32.i32(i32 [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: ret float [[TMP0]]
+//
float32_t test_vcvts_f32_u32(uint32_t a) {
return vcvts_f32_u32(a);
}
// XXX: should also verify the types of the registers used
-// COMMON-LABEL: test_vcvtd_f64_u64
-// UNCONSTRAINED: [[TMP0:%.*]] = uitofp i64 %a to double
-// CONSTRAINED: [[TMP0:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i64(i64 %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// COMMONIR: ret double [[TMP0]]
+// UNCONSTRAINED-LABEL: define dso_local double @test_vcvtd_f64_u64(
+// UNCONSTRAINED-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = uitofp i64 [[A]] to double
+// UNCONSTRAINED-NEXT: ret double [[TMP0]]
+//
+// CONSTRAINED-LABEL: define dso_local double @test_vcvtd_f64_u64(
+// CONSTRAINED-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i64(i64 [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: ret double [[TMP0]]
+//
float64_t test_vcvtd_f64_u64(uint64_t a) {
return vcvtd_f64_u64(a);
}
-// COMMON-LABEL: test_vceqs_f32
-// UNCONSTRAINED: [[TMP0:%.*]] = fcmp oeq float %a, %b
-// CONSTRAINED: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmp.f32(float %a, float %b, metadata !"oeq", metadata !"fpexcept.strict")
-// COMMONIR: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32
-// COMMONIR: ret i32 [[VCMPD_I]]
+// UNCONSTRAINED-LABEL: define dso_local i32 @test_vceqs_f32(
+// UNCONSTRAINED-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = fcmp oeq float [[A]], [[B]]
+// UNCONSTRAINED-NEXT: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32
+// UNCONSTRAINED-NEXT: ret i32 [[VCMPD_I]]
+//
+// CONSTRAINED-LABEL: define dso_local i32 @test_vceqs_f32(
+// CONSTRAINED-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmp.f32(float [[A]], float [[B]], metadata !"oeq", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32
+// CONSTRAINED-NEXT: ret i32 [[VCMPD_I]]
+//
uint32_t test_vceqs_f32(float32_t a, float32_t b) {
return (uint32_t)vceqs_f32(a, b);
}
-// COMMON-LABEL: test_vceqd_f64
-// UNCONSTRAINED: [[TMP0:%.*]] = fcmp oeq double %a, %b
-// CONSTRAINED: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmp.f64(double %a, double %b, metadata !"oeq", metadata !"fpexcept.strict")
-// COMMONIR: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64
-// COMMONIR: ret i64 [[VCMPD_I]]
+// UNCONSTRAINED-LABEL: define dso_local i64 @test_vceqd_f64(
+// UNCONSTRAINED-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = fcmp oeq double [[A]], [[B]]
+// UNCONSTRAINED-NEXT: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64
+// UNCONSTRAINED-NEXT: ret i64 [[VCMPD_I]]
+//
+// CONSTRAINED-LABEL: define dso_local i64 @test_vceqd_f64(
+// CONSTRAINED-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmp.f64(double [[A]], double [[B]], metadata !"oeq", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64
+// CONSTRAINED-NEXT: ret i64 [[VCMPD_I]]
+//
uint64_t test_vceqd_f64(float64_t a, float64_t b) {
return (uint64_t)vceqd_f64(a, b);
}
-// COMMON-LABEL: test_vceqzs_f32
-// UNCONSTRAINED: [[TMP0:%.*]] = fcmp oeq float %a, 0.000000e+00
-// CONSTRAINED: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmp.f32(float %a, float 0.000000e+00, metadata !"oeq", metadata !"fpexcept.strict")
-// COMMONIR: [[VCEQZ_I:%.*]] = sext i1 [[TMP0]] to i32
-// COMMONIR: ret i32 [[VCEQZ_I]]
+// UNCONSTRAINED-LABEL: define dso_local i32 @test_vceqzs_f32(
+// UNCONSTRAINED-SAME: float noundef [[A:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = fcmp oeq float [[A]], 0.000000e+00
+// UNCONSTRAINED-NEXT: [[VCEQZ_I:%.*]] = sext i1 [[TMP0]] to i32
+// UNCONSTRAINED-NEXT: ret i32 [[VCEQZ_I]]
+//
+// CONSTRAINED-LABEL: define dso_local i32 @test_vceqzs_f32(
+// CONSTRAINED-SAME: float noundef [[A:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmp.f32(float [[A]], float 0.000000e+00, metadata !"oeq", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[VCEQZ_I:%.*]] = sext i1 [[TMP0]] to i32
+// CONSTRAINED-NEXT: ret i32 [[VCEQZ_I]]
+//
uint32_t test_vceqzs_f32(float32_t a) {
return (uint32_t)vceqzs_f32(a);
}
-// COMMON-LABEL: test_vceqzd_f64
-// UNCONSTRAINED: [[TMP0:%.*]] = fcmp oeq double %a, 0.000000e+00
-// CONSTRAINED: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmp.f64(double %a, double 0.000000e+00, metadata !"oeq", metadata !"fpexcept.strict")
-// COMMONIR: [[VCEQZ_I:%.*]] = sext i1 [[TMP0]] to i64
-// COMMONIR: ret i64 [[VCEQZ_I]]
+// UNCONSTRAINED-LABEL: define dso_local i64 @test_vceqzd_f64(
+// UNCONSTRAINED-SAME: double noundef [[A:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = fcmp oeq double [[A]], 0.000000e+00
+// UNCONSTRAINED-NEXT: [[VCEQZ_I:%.*]] = sext i1 [[TMP0]] to i64
+// UNCONSTRAINED-NEXT: ret i64 [[VCEQZ_I]]
+//
+// CONSTRAINED-LABEL: define dso_local i64 @test_vceqzd_f64(
+// CONSTRAINED-SAME: double noundef [[A:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmp.f64(double [[A]], double 0.000000e+00, metadata !"oeq", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[VCEQZ_I:%.*]] = sext i1 [[TMP0]] to i64
+// CONSTRAINED-NEXT: ret i64 [[VCEQZ_I]]
+//
uint64_t test_vceqzd_f64(float64_t a) {
return (uint64_t)vceqzd_f64(a);
}
-// COMMON-LABEL: test_vcges_f32
-// UNCONSTRAINED: [[TMP0:%.*]] = fcmp oge float %a, %b
-// CONSTRAINED: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f32(float %a, float %b, metadata !"oge", metadata !"fpexcept.strict")
-// COMMONIR: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32
-// COMMONIR: ret i32 [[VCMPD_I]]
+// UNCONSTRAINED-LABEL: define dso_local i32 @test_vcges_f32(
+// UNCONSTRAINED-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = fcmp oge float [[A]], [[B]]
+// UNCONSTRAINED-NEXT: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32
+// UNCONSTRAINED-NEXT: ret i32 [[VCMPD_I]]
+//
+// CONSTRAINED-LABEL: define dso_local i32 @test_vcges_f32(
+// CONSTRAINED-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f32(float [[A]], float [[B]], metadata !"oge", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32
+// CONSTRAINED-NEXT: ret i32 [[VCMPD_I]]
+//
uint32_t test_vcges_f32(float32_t a, float32_t b) {
return (uint32_t)vcges_f32(a, b);
}
-// COMMON-LABEL: test_vcged_f64
-// UNCONSTRAINED: [[TMP0:%.*]] = fcmp oge double %a, %b
-// CONSTRAINED: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f64(double %a, double %b, metadata !"oge", metadata !"fpexcept.strict")
-// COMMONIR: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64
-// COMMONIR: ret i64 [[VCMPD_I]]
+// UNCONSTRAINED-LABEL: define dso_local i64 @test_vcged_f64(
+// UNCONSTRAINED-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = fcmp oge double [[A]], [[B]]
+// UNCONSTRAINED-NEXT: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64
+// UNCONSTRAINED-NEXT: ret i64 [[VCMPD_I]]
+//
+// CONSTRAINED-LABEL: define dso_local i64 @test_vcged_f64(
+// CONSTRAINED-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f64(double [[A]], double [[B]], metadata !"oge", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64
+// CONSTRAINED-NEXT: ret i64 [[VCMPD_I]]
+//
uint64_t test_vcged_f64(float64_t a, float64_t b) {
return (uint64_t)vcged_f64(a, b);
}
-// COMMON-LABEL: test_vcgezs_f32
-// UNCONSTRAINED: [[TMP0:%.*]] = fcmp oge float %a, 0.000000e+00
-// CONSTRAINED: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f32(float %a, float 0.000000e+00, metadata !"oge", metadata !"fpexcept.strict")
-// COMMONIR: [[VCGEZ_I:%.*]] = sext i1 [[TMP0]] to i32
-// COMMONIR: ret i32 [[VCGEZ_I]]
+// UNCONSTRAINED-LABEL: define dso_local i32 @test_vcgezs_f32(
+// UNCONSTRAINED-SAME: float noundef [[A:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = fcmp oge float [[A]], 0.000000e+00
+// UNCONSTRAINED-NEXT: [[VCGEZ_I:%.*]] = sext i1 [[TMP0]] to i32
+// UNCONSTRAINED-NEXT: ret i32 [[VCGEZ_I]]
+//
+// CONSTRAINED-LABEL: define dso_local i32 @test_vcgezs_f32(
+// CONSTRAINED-SAME: float noundef [[A:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f32(float [[A]], float 0.000000e+00, metadata !"oge", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[VCGEZ_I:%.*]] = sext i1 [[TMP0]] to i32
+// CONSTRAINED-NEXT: ret i32 [[VCGEZ_I]]
+//
uint32_t test_vcgezs_f32(float32_t a) {
return (uint32_t)vcgezs_f32(a);
}
-// COMMON-LABEL: test_vcgezd_f64
-// UNCONSTRAINED: [[TMP0:%.*]] = fcmp oge double %a, 0.000000e+00
-// CONSTRAINED: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f64(double %a, double 0.000000e+00, metadata !"oge", metadata !"fpexcept.strict")
-// COMMONIR: [[VCGEZ_I:%.*]] = sext i1 [[TMP0]] to i64
-// COMMONIR: ret i64 [[VCGEZ_I]]
+// UNCONSTRAINED-LABEL: define dso_local i64 @test_vcgezd_f64(
+// UNCONSTRAINED-SAME: double noundef [[A:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = fcmp oge double [[A]], 0.000000e+00
+// UNCONSTRAINED-NEXT: [[VCGEZ_I:%.*]] = sext i1 [[TMP0]] to i64
+// UNCONSTRAINED-NEXT: ret i64 [[VCGEZ_I]]
+//
+// CONSTRAINED-LABEL: define dso_local i64 @test_vcgezd_f64(
+// CONSTRAINED-SAME: double noundef [[A:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f64(double [[A]], double 0.000000e+00, metadata !"oge", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[VCGEZ_I:%.*]] = sext i1 [[TMP0]] to i64
+// CONSTRAINED-NEXT: ret i64 [[VCGEZ_I]]
+//
uint64_t test_vcgezd_f64(float64_t a) {
return (uint64_t)vcgezd_f64(a);
}
-// COMMON-LABEL: test_vcgts_f32
-// UNCONSTRAINED: [[TMP0:%.*]] = fcmp ogt float %a, %b
-// CONSTRAINED: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f32(float %a, float %b, metadata !"ogt", metadata !"fpexcept.strict")
-// COMMONIR: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32
-// COMMONIR: ret i32 [[VCMPD_I]]
+// UNCONSTRAINED-LABEL: define dso_local i32 @test_vcgts_f32(
+// UNCONSTRAINED-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = fcmp ogt float [[A]], [[B]]
+// UNCONSTRAINED-NEXT: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32
+// UNCONSTRAINED-NEXT: ret i32 [[VCMPD_I]]
+//
+// CONSTRAINED-LABEL: define dso_local i32 @test_vcgts_f32(
+// CONSTRAINED-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f32(float [[A]], float [[B]], metadata !"ogt", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32
+// CONSTRAINED-NEXT: ret i32 [[VCMPD_I]]
+//
uint32_t test_vcgts_f32(float32_t a, float32_t b) {
return (uint32_t)vcgts_f32(a, b);
}
-// COMMON-LABEL: test_vcgtd_f64
-// UNCONSTRAINED: [[TMP0:%.*]] = fcmp ogt double %a, %b
-// CONSTRAINED: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f64(double %a, double %b, metadata !"ogt", metadata !"fpexcept.strict")
-// COMMONIR: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64
-// COMMONIR: ret i64 [[VCMPD_I]]
+// UNCONSTRAINED-LABEL: define dso_local i64 @test_vcgtd_f64(
+// UNCONSTRAINED-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = fcmp ogt double [[A]], [[B]]
+// UNCONSTRAINED-NEXT: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64
+// UNCONSTRAINED-NEXT: ret i64 [[VCMPD_I]]
+//
+// CONSTRAINED-LABEL: define dso_local i64 @test_vcgtd_f64(
+// CONSTRAINED-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f64(double [[A]], double [[B]], metadata !"ogt", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64
+// CONSTRAINED-NEXT: ret i64 [[VCMPD_I]]
+//
uint64_t test_vcgtd_f64(float64_t a, float64_t b) {
return (uint64_t)vcgtd_f64(a, b);
}
-// COMMON-LABEL: test_vcgtzs_f32
-// UNCONSTRAINED: [[TMP0:%.*]] = fcmp ogt float %a, 0.000000e+00
-// CONSTRAINED: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f32(float %a, float 0.000000e+00, metadata !"ogt", metadata !"fpexcept.strict")
-// COMMONIR: [[VCGTZ_I:%.*]] = sext i1 [[TMP0]] to i32
-// COMMONIR: ret i32 [[VCGTZ_I]]
+// UNCONSTRAINED-LABEL: define dso_local i32 @test_vcgtzs_f32(
+// UNCONSTRAINED-SAME: float noundef [[A:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = fcmp ogt float [[A]], 0.000000e+00
+// UNCONSTRAINED-NEXT: [[VCGTZ_I:%.*]] = sext i1 [[TMP0]] to i32
+// UNCONSTRAINED-NEXT: ret i32 [[VCGTZ_I]]
+//
+// CONSTRAINED-LABEL: define dso_local i32 @test_vcgtzs_f32(
+// CONSTRAINED-SAME: float noundef [[A:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f32(float [[A]], float 0.000000e+00, metadata !"ogt", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[VCGTZ_I:%.*]] = sext i1 [[TMP0]] to i32
+// CONSTRAINED-NEXT: ret i32 [[VCGTZ_I]]
+//
uint32_t test_vcgtzs_f32(float32_t a) {
return (uint32_t)vcgtzs_f32(a);
}
-// COMMON-LABEL: test_vcgtzd_f64
-// UNCONSTRAINED: [[TMP0:%.*]] = fcmp ogt double %a, 0.000000e+00
-// CONSTRAINED: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f64(double %a, double 0.000000e+00, metadata !"ogt", metadata !"fpexcept.strict")
-// COMMONIR: [[VCGTZ_I:%.*]] = sext i1 [[TMP0]] to i64
-// COMMONIR: ret i64 [[VCGTZ_I]]
+// UNCONSTRAINED-LABEL: define dso_local i64 @test_vcgtzd_f64(
+// UNCONSTRAINED-SAME: double noundef [[A:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = fcmp ogt double [[A]], 0.000000e+00
+// UNCONSTRAINED-NEXT: [[VCGTZ_I:%.*]] = sext i1 [[TMP0]] to i64
+// UNCONSTRAINED-NEXT: ret i64 [[VCGTZ_I]]
+//
+// CONSTRAINED-LABEL: define dso_local i64 @test_vcgtzd_f64(
+// CONSTRAINED-SAME: double noundef [[A:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f64(double [[A]], double 0.000000e+00, metadata !"ogt", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[VCGTZ_I:%.*]] = sext i1 [[TMP0]] to i64
+// CONSTRAINED-NEXT: ret i64 [[VCGTZ_I]]
+//
uint64_t test_vcgtzd_f64(float64_t a) {
return (uint64_t)vcgtzd_f64(a);
}
-// COMMON-LABEL: test_vcles_f32
-// UNCONSTRAINED: [[TMP0:%.*]] = fcmp ole float %a, %b
-// CONSTRAINED: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f32(float %a, float %b, metadata !"ole", metadata !"fpexcept.strict")
-// COMMONIR: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32
-// COMMONIR: ret i32 [[VCMPD_I]]
+// UNCONSTRAINED-LABEL: define dso_local i32 @test_vcles_f32(
+// UNCONSTRAINED-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = fcmp ole float [[A]], [[B]]
+// UNCONSTRAINED-NEXT: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32
+// UNCONSTRAINED-NEXT: ret i32 [[VCMPD_I]]
+//
+// CONSTRAINED-LABEL: define dso_local i32 @test_vcles_f32(
+// CONSTRAINED-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f32(float [[A]], float [[B]], metadata !"ole", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32
+// CONSTRAINED-NEXT: ret i32 [[VCMPD_I]]
+//
uint32_t test_vcles_f32(float32_t a, float32_t b) {
return (uint32_t)vcles_f32(a, b);
}
-// COMMON-LABEL: test_vcled_f64
-// UNCONSTRAINED: [[TMP0:%.*]] = fcmp ole double %a, %b
-// CONSTRAINED: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f64(double %a, double %b, metadata !"ole", metadata !"fpexcept.strict")
-// COMMONIR: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64
-// COMMONIR: ret i64 [[VCMPD_I]]
+// UNCONSTRAINED-LABEL: define dso_local i64 @test_vcled_f64(
+// UNCONSTRAINED-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = fcmp ole double [[A]], [[B]]
+// UNCONSTRAINED-NEXT: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64
+// UNCONSTRAINED-NEXT: ret i64 [[VCMPD_I]]
+//
+// CONSTRAINED-LABEL: define dso_local i64 @test_vcled_f64(
+// CONSTRAINED-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f64(double [[A]], double [[B]], metadata !"ole", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64
+// CONSTRAINED-NEXT: ret i64 [[VCMPD_I]]
+//
uint64_t test_vcled_f64(float64_t a, float64_t b) {
return (uint64_t)vcled_f64(a, b);
}
-// COMMON-LABEL: test_vclezs_f32
-// UNCONSTRAINED: [[TMP0:%.*]] = fcmp ole float %a, 0.000000e+00
-// CONSTRAINED: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f32(float %a, float 0.000000e+00, metadata !"ole", metadata !"fpexcept.strict")
-// COMMONIR: [[VCLEZ_I:%.*]] = sext i1 [[TMP0]] to i32
-// COMMONIR: ret i32 [[VCLEZ_I]]
+// UNCONSTRAINED-LABEL: define dso_local i32 @test_vclezs_f32(
+// UNCONSTRAINED-SAME: float noundef [[A:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = fcmp ole float [[A]], 0.000000e+00
+// UNCONSTRAINED-NEXT: [[VCLEZ_I:%.*]] = sext i1 [[TMP0]] to i32
+// UNCONSTRAINED-NEXT: ret i32 [[VCLEZ_I]]
+//
+// CONSTRAINED-LABEL: define dso_local i32 @test_vclezs_f32(
+// CONSTRAINED-SAME: float noundef [[A:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f32(float [[A]], float 0.000000e+00, metadata !"ole", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[VCLEZ_I:%.*]] = sext i1 [[TMP0]] to i32
+// CONSTRAINED-NEXT: ret i32 [[VCLEZ_I]]
+//
uint32_t test_vclezs_f32(float32_t a) {
return (uint32_t)vclezs_f32(a);
}
-// COMMON-LABEL: test_vclezd_f64
-// UNCONSTRAINED: [[TMP0:%.*]] = fcmp ole double %a, 0.000000e+00
-// CONSTRAINED: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f64(double %a, double 0.000000e+00, metadata !"ole", metadata !"fpexcept.strict")
-// COMMONIR: [[VCLEZ_I:%.*]] = sext i1 [[TMP0]] to i64
-// COMMONIR: ret i64 [[VCLEZ_I]]
+// UNCONSTRAINED-LABEL: define dso_local i64 @test_vclezd_f64(
+// UNCONSTRAINED-SAME: double noundef [[A:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = fcmp ole double [[A]], 0.000000e+00
+// UNCONSTRAINED-NEXT: [[VCLEZ_I:%.*]] = sext i1 [[TMP0]] to i64
+// UNCONSTRAINED-NEXT: ret i64 [[VCLEZ_I]]
+//
+// CONSTRAINED-LABEL: define dso_local i64 @test_vclezd_f64(
+// CONSTRAINED-SAME: double noundef [[A:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f64(double [[A]], double 0.000000e+00, metadata !"ole", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[VCLEZ_I:%.*]] = sext i1 [[TMP0]] to i64
+// CONSTRAINED-NEXT: ret i64 [[VCLEZ_I]]
+//
uint64_t test_vclezd_f64(float64_t a) {
return (uint64_t)vclezd_f64(a);
}
-// COMMON-LABEL: test_vclts_f32
-// UNCONSTRAINED: [[TMP0:%.*]] = fcmp olt float %a, %b
-// CONSTRAINED: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f32(float %a, float %b, metadata !"olt", metadata !"fpexcept.strict")
-// COMMONIR: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32
-// COMMONIR: ret i32 [[VCMPD_I]]
+// UNCONSTRAINED-LABEL: define dso_local i32 @test_vclts_f32(
+// UNCONSTRAINED-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = fcmp olt float [[A]], [[B]]
+// UNCONSTRAINED-NEXT: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32
+// UNCONSTRAINED-NEXT: ret i32 [[VCMPD_I]]
+//
+// CONSTRAINED-LABEL: define dso_local i32 @test_vclts_f32(
+// CONSTRAINED-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f32(float [[A]], float [[B]], metadata !"olt", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32
+// CONSTRAINED-NEXT: ret i32 [[VCMPD_I]]
+//
uint32_t test_vclts_f32(float32_t a, float32_t b) {
return (uint32_t)vclts_f32(a, b);
}
-// COMMON-LABEL: test_vcltd_f64
-// UNCONSTRAINED: [[TMP0:%.*]] = fcmp olt double %a, %b
-// CONSTRAINED: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f64(double %a, double %b, metadata !"olt", metadata !"fpexcept.strict")
-// COMMONIR: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64
-// COMMONIR: ret i64 [[VCMPD_I]]
+// UNCONSTRAINED-LABEL: define dso_local i64 @test_vcltd_f64(
+// UNCONSTRAINED-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = fcmp olt double [[A]], [[B]]
+// UNCONSTRAINED-NEXT: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64
+// UNCONSTRAINED-NEXT: ret i64 [[VCMPD_I]]
+//
+// CONSTRAINED-LABEL: define dso_local i64 @test_vcltd_f64(
+// CONSTRAINED-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f64(double [[A]], double [[B]], metadata !"olt", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64
+// CONSTRAINED-NEXT: ret i64 [[VCMPD_I]]
+//
uint64_t test_vcltd_f64(float64_t a, float64_t b) {
return (uint64_t)vcltd_f64(a, b);
}
-// COMMON-LABEL: test_vcltzs_f32
-// UNCONSTRAINED: [[TMP0:%.*]] = fcmp olt float %a, 0.000000e+00
-// CONSTRAINED: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f32(float %a, float 0.000000e+00, metadata !"olt", metadata !"fpexcept.strict")
-// COMMONIR: [[VCLTZ_I:%.*]] = sext i1 [[TMP0]] to i32
-// COMMONIR: ret i32 [[VCLTZ_I]]
+// UNCONSTRAINED-LABEL: define dso_local i32 @test_vcltzs_f32(
+// UNCONSTRAINED-SAME: float noundef [[A:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = fcmp olt float [[A]], 0.000000e+00
+// UNCONSTRAINED-NEXT: [[VCLTZ_I:%.*]] = sext i1 [[TMP0]] to i32
+// UNCONSTRAINED-NEXT: ret i32 [[VCLTZ_I]]
+//
+// CONSTRAINED-LABEL: define dso_local i32 @test_vcltzs_f32(
+// CONSTRAINED-SAME: float noundef [[A:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f32(float [[A]], float 0.000000e+00, metadata !"olt", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[VCLTZ_I:%.*]] = sext i1 [[TMP0]] to i32
+// CONSTRAINED-NEXT: ret i32 [[VCLTZ_I]]
+//
uint32_t test_vcltzs_f32(float32_t a) {
return (uint32_t)vcltzs_f32(a);
}
-// COMMON-LABEL: test_vcltzd_f64
-// UNCONSTRAINED: [[TMP0:%.*]] = fcmp olt double %a, 0.000000e+00
-// CONSTRAINED: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f64(double %a, double 0.000000e+00, metadata !"olt", metadata !"fpexcept.strict")
-// COMMONIR: [[VCLTZ_I:%.*]] = sext i1 [[TMP0]] to i64
-// COMMONIR: ret i64 [[VCLTZ_I]]
+// UNCONSTRAINED-LABEL: define dso_local i64 @test_vcltzd_f64(
+// UNCONSTRAINED-SAME: double noundef [[A:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = fcmp olt double [[A]], 0.000000e+00
+// UNCONSTRAINED-NEXT: [[VCLTZ_I:%.*]] = sext i1 [[TMP0]] to i64
+// UNCONSTRAINED-NEXT: ret i64 [[VCLTZ_I]]
+//
+// CONSTRAINED-LABEL: define dso_local i64 @test_vcltzd_f64(
+// CONSTRAINED-SAME: double noundef [[A:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f64(double [[A]], double 0.000000e+00, metadata !"olt", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[VCLTZ_I:%.*]] = sext i1 [[TMP0]] to i64
+// CONSTRAINED-NEXT: ret i64 [[VCLTZ_I]]
+//
uint64_t test_vcltzd_f64(float64_t a) {
return (uint64_t)vcltzd_f64(a);
}
-// COMMON-LABEL: test_vadd_f64
-// UNCONSTRAINED: [[ADD_I:%.*]] = fadd <1 x double> %a, %b
-// CONSTRAINED: [[ADD_I:%.*]] = call <1 x double> @llvm.experimental.constrained.fadd.v1f64(<1 x double> %a, <1 x double> %b, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// COMMONIR: ret <1 x double> [[ADD_I]]
+// UNCONSTRAINED-LABEL: define dso_local <1 x double> @test_vadd_f64(
+// UNCONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[ADD_I:%.*]] = fadd <1 x double> [[A]], [[B]]
+// UNCONSTRAINED-NEXT: ret <1 x double> [[ADD_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <1 x double> @test_vadd_f64(
+// CONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[ADD_I:%.*]] = call <1 x double> @llvm.experimental.constrained.fadd.v1f64(<1 x double> [[A]], <1 x double> [[B]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: ret <1 x double> [[ADD_I]]
+//
float64x1_t test_vadd_f64(float64x1_t a, float64x1_t b) {
return vadd_f64(a, b);
}
-// COMMON-LABEL: test_vmul_f64
-// UNCONSTRAINED: [[MUL_I:%.*]] = fmul <1 x double> %a, %b
-// CONSTRAINED: [[MUL_I:%.*]] = call <1 x double> @llvm.experimental.constrained.fmul.v1f64(<1 x double> %a, <1 x double> %b, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// COMMONIR: ret <1 x double> [[MUL_I]]
+// UNCONSTRAINED-LABEL: define dso_local <1 x double> @test_vmul_f64(
+// UNCONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[MUL_I:%.*]] = fmul <1 x double> [[A]], [[B]]
+// UNCONSTRAINED-NEXT: ret <1 x double> [[MUL_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <1 x double> @test_vmul_f64(
+// CONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[MUL_I:%.*]] = call <1 x double> @llvm.experimental.constrained.fmul.v1f64(<1 x double> [[A]], <1 x double> [[B]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: ret <1 x double> [[MUL_I]]
+//
float64x1_t test_vmul_f64(float64x1_t a, float64x1_t b) {
return vmul_f64(a, b);
}
-// COMMON-LABEL: test_vdiv_f64
-// UNCONSTRAINED: [[DIV_I:%.*]] = fdiv <1 x double> %a, %b
-// CONSTRAINED: [[DIV_I:%.*]] = call <1 x double> @llvm.experimental.constrained.fdiv.v1f64(<1 x double> %a, <1 x double> %b, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// COMMONIR: ret <1 x double> [[DIV_I]]
+// UNCONSTRAINED-LABEL: define dso_local <1 x double> @test_vdiv_f64(
+// UNCONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[DIV_I:%.*]] = fdiv <1 x double> [[A]], [[B]]
+// UNCONSTRAINED-NEXT: ret <1 x double> [[DIV_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <1 x double> @test_vdiv_f64(
+// CONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[DIV_I:%.*]] = call <1 x double> @llvm.experimental.constrained.fdiv.v1f64(<1 x double> [[A]], <1 x double> [[B]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: ret <1 x double> [[DIV_I]]
+//
float64x1_t test_vdiv_f64(float64x1_t a, float64x1_t b) {
return vdiv_f64(a, b);
}
-// COMMON-LABEL: test_vmla_f64
-// UNCONSTRAINED: [[MUL_I:%.*]] = fmul <1 x double> %b, %c
-// CONSTRAINED: [[MUL_I:%.*]] = call <1 x double> @llvm.experimental.constrained.fmul.v1f64(<1 x double> %b, <1 x double> %c, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// UNCONSTRAINED: [[ADD_I:%.*]] = fadd <1 x double> %a, [[MUL_I]]
-// CONSTRAINED: [[ADD_I:%.*]] = call <1 x double> @llvm.experimental.constrained.fadd.v1f64(<1 x double> %a, <1 x double> [[MUL_I]], metadata !"round.tonearest", metadata !"fpexcept.strict")
-// COMMONIR: ret <1 x double> [[ADD_I]]
+// UNCONSTRAINED-LABEL: define dso_local <1 x double> @test_vmla_f64(
+// UNCONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <1 x double> noundef [[C:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[MUL_I:%.*]] = fmul <1 x double> [[B]], [[C]]
+// UNCONSTRAINED-NEXT: [[ADD_I:%.*]] = fadd <1 x double> [[A]], [[MUL_I]]
+// UNCONSTRAINED-NEXT: ret <1 x double> [[ADD_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <1 x double> @test_vmla_f64(
+// CONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <1 x double> noundef [[C:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[MUL_I:%.*]] = call <1 x double> @llvm.experimental.constrained.fmul.v1f64(<1 x double> [[B]], <1 x double> [[C]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[ADD_I:%.*]] = call <1 x double> @llvm.experimental.constrained.fadd.v1f64(<1 x double> [[A]], <1 x double> [[MUL_I]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: ret <1 x double> [[ADD_I]]
+//
float64x1_t test_vmla_f64(float64x1_t a, float64x1_t b, float64x1_t c) {
return vmla_f64(a, b, c);
}
-// COMMON-LABEL: test_vmls_f64
-// UNCONSTRAINED: [[MUL_I:%.*]] = fmul <1 x double> %b, %c
-// CONSTRAINED: [[MUL_I:%.*]] = call <1 x double> @llvm.experimental.constrained.fmul.v1f64(<1 x double> %b, <1 x double> %c, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// UNCONSTRAINED: [[SUB_I:%.*]] = fsub <1 x double> %a, [[MUL_I]]
-// CONSTRAINED: [[SUB_I:%.*]] = call <1 x double> @llvm.experimental.constrained.fsub.v1f64(<1 x double> %a, <1 x double> [[MUL_I]], metadata !"round.tonearest", metadata !"fpexcept.strict")
-// COMMONIR: ret <1 x double> [[SUB_I]]
+// UNCONSTRAINED-LABEL: define dso_local <1 x double> @test_vmls_f64(
+// UNCONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <1 x double> noundef [[C:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[MUL_I:%.*]] = fmul <1 x double> [[B]], [[C]]
+// UNCONSTRAINED-NEXT: [[SUB_I:%.*]] = fsub <1 x double> [[A]], [[MUL_I]]
+// UNCONSTRAINED-NEXT: ret <1 x double> [[SUB_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <1 x double> @test_vmls_f64(
+// CONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <1 x double> noundef [[C:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[MUL_I:%.*]] = call <1 x double> @llvm.experimental.constrained.fmul.v1f64(<1 x double> [[B]], <1 x double> [[C]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[SUB_I:%.*]] = call <1 x double> @llvm.experimental.constrained.fsub.v1f64(<1 x double> [[A]], <1 x double> [[MUL_I]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: ret <1 x double> [[SUB_I]]
+//
float64x1_t test_vmls_f64(float64x1_t a, float64x1_t b, float64x1_t c) {
return vmls_f64(a, b, c);
}
-// COMMON-LABEL: test_vfma_f64
-// COMMONIR: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// COMMONIR: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
-// COMMONIR: [[TMP2:%.*]] = bitcast <1 x double> %c to <8 x i8>
-// UNCONSTRAINED: [[TMP3:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> %b, <1 x double> %c, <1 x double> %a)
-// CONSTRAINED: [[TMP3:%.*]] = call <1 x double> @llvm.experimental.constrained.fma.v1f64(<1 x double> %b, <1 x double> %c, <1 x double> %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// COMMONIR: ret <1 x double> [[TMP3]]
+// UNCONSTRAINED-LABEL: define dso_local <1 x double> @test_vfma_f64(
+// UNCONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <1 x double> noundef [[C:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[B]], <1 x double> [[C]], <1 x double> [[A]])
+// UNCONSTRAINED-NEXT: ret <1 x double> [[TMP0]]
+//
+// CONSTRAINED-LABEL: define dso_local <1 x double> @test_vfma_f64(
+// CONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <1 x double> noundef [[C:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call <1 x double> @llvm.experimental.constrained.fma.v1f64(<1 x double> [[B]], <1 x double> [[C]], <1 x double> [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: ret <1 x double> [[TMP0]]
+//
float64x1_t test_vfma_f64(float64x1_t a, float64x1_t b, float64x1_t c) {
return vfma_f64(a, b, c);
}
-// COMMON-LABEL: test_vfms_f64
-// COMMONIR: [[SUB_I:%.*]] = fneg <1 x double> %b
-// COMMONIR: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// COMMONIR: [[TMP1:%.*]] = bitcast <1 x double> [[SUB_I]] to <8 x i8>
-// COMMONIR: [[TMP2:%.*]] = bitcast <1 x double> %c to <8 x i8>
-// UNCONSTRAINED: [[TMP3:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[SUB_I]], <1 x double> %c, <1 x double> %a)
-// CONSTRAINED: [[TMP3:%.*]] = call <1 x double> @llvm.experimental.constrained.fma.v1f64(<1 x double> [[SUB_I]], <1 x double> %c, <1 x double> %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// COMMONIR: ret <1 x double> [[TMP3]]
+// UNCONSTRAINED-LABEL: define dso_local <1 x double> @test_vfms_f64(
+// UNCONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <1 x double> noundef [[C:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[FNEG_I:%.*]] = fneg <1 x double> [[B]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[FNEG_I]], <1 x double> [[C]], <1 x double> [[A]])
+// UNCONSTRAINED-NEXT: ret <1 x double> [[TMP0]]
+//
+// CONSTRAINED-LABEL: define dso_local <1 x double> @test_vfms_f64(
+// CONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <1 x double> noundef [[C:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[FNEG_I:%.*]] = fneg <1 x double> [[B]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call <1 x double> @llvm.experimental.constrained.fma.v1f64(<1 x double> [[FNEG_I]], <1 x double> [[C]], <1 x double> [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: ret <1 x double> [[TMP0]]
+//
float64x1_t test_vfms_f64(float64x1_t a, float64x1_t b, float64x1_t c) {
return vfms_f64(a, b, c);
}
-// COMMON-LABEL: test_vsub_f64
-// UNCONSTRAINED: [[SUB_I:%.*]] = fsub <1 x double> %a, %b
-// CONSTRAINED: [[SUB_I:%.*]] = call <1 x double> @llvm.experimental.constrained.fsub.v1f64(<1 x double> %a, <1 x double> %b, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// COMMONIR: ret <1 x double> [[SUB_I]]
+// UNCONSTRAINED-LABEL: define dso_local <1 x double> @test_vsub_f64(
+// UNCONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[SUB_I:%.*]] = fsub <1 x double> [[A]], [[B]]
+// UNCONSTRAINED-NEXT: ret <1 x double> [[SUB_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <1 x double> @test_vsub_f64(
+// CONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[SUB_I:%.*]] = call <1 x double> @llvm.experimental.constrained.fsub.v1f64(<1 x double> [[A]], <1 x double> [[B]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: ret <1 x double> [[SUB_I]]
+//
float64x1_t test_vsub_f64(float64x1_t a, float64x1_t b) {
return vsub_f64(a, b);
}
-// COMMON-LABEL: test_vcvt_s64_f64
-// COMMONIR: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// COMMONIR: [[TMP1:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtzs.v1i64.v1f64(<1 x double> %a)
-// COMMONIR: ret <1 x i64> [[TMP1]]
+// UNCONSTRAINED-LABEL: define dso_local <1 x i64> @test_vcvt_s64_f64(
+// UNCONSTRAINED-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[VCVTZ1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtzs.v1i64.v1f64(<1 x double> [[A]])
+// UNCONSTRAINED-NEXT: ret <1 x i64> [[VCVTZ1_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <1 x i64> @test_vcvt_s64_f64(
+// CONSTRAINED-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[VCVTZ1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtzs.v1i64.v1f64(<1 x double> [[A]]) #[[ATTR3]]
+// CONSTRAINED-NEXT: ret <1 x i64> [[VCVTZ1_I]]
+//
int64x1_t test_vcvt_s64_f64(float64x1_t a) {
return vcvt_s64_f64(a);
}
-// COMMON-LABEL: test_vcvt_u64_f64
-// COMMONIR: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// COMMONIR: [[TMP1:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtzu.v1i64.v1f64(<1 x double> %a)
-// COMMONIR: ret <1 x i64> [[TMP1]]
+// UNCONSTRAINED-LABEL: define dso_local <1 x i64> @test_vcvt_u64_f64(
+// UNCONSTRAINED-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[VCVTZ1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtzu.v1i64.v1f64(<1 x double> [[A]])
+// UNCONSTRAINED-NEXT: ret <1 x i64> [[VCVTZ1_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <1 x i64> @test_vcvt_u64_f64(
+// CONSTRAINED-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[VCVTZ1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtzu.v1i64.v1f64(<1 x double> [[A]]) #[[ATTR3]]
+// CONSTRAINED-NEXT: ret <1 x i64> [[VCVTZ1_I]]
+//
uint64x1_t test_vcvt_u64_f64(float64x1_t a) {
return vcvt_u64_f64(a);
}
-// COMMON-LABEL: test_vcvt_f64_s64
-// COMMONIR: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// UNCONSTRAINED: [[VCVT_I:%.*]] = sitofp <1 x i64> %a to <1 x double>
-// CONSTRAINED: [[VCVT_I:%.*]] = call <1 x double> @llvm.experimental.constrained.sitofp.v1f64.v1i64(<1 x i64> %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// COMMONIR: ret <1 x double> [[VCVT_I]]
+// UNCONSTRAINED-LABEL: define dso_local <1 x double> @test_vcvt_f64_s64(
+// UNCONSTRAINED-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[VCVT_I:%.*]] = sitofp <1 x i64> [[A]] to <1 x double>
+// UNCONSTRAINED-NEXT: ret <1 x double> [[VCVT_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <1 x double> @test_vcvt_f64_s64(
+// CONSTRAINED-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[VCVT_I:%.*]] = call <1 x double> @llvm.experimental.constrained.sitofp.v1f64.v1i64(<1 x i64> [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: ret <1 x double> [[VCVT_I]]
+//
float64x1_t test_vcvt_f64_s64(int64x1_t a) {
return vcvt_f64_s64(a);
}
-// COMMON-LABEL: test_vcvt_f64_u64
-// COMMONIR: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// UNCONSTRAINED: [[VCVT_I:%.*]] = uitofp <1 x i64> %a to <1 x double>
-// CONSTRAINED: [[VCVT_I:%.*]] = call <1 x double> @llvm.experimental.constrained.uitofp.v1f64.v1i64(<1 x i64> %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// COMMONIR: ret <1 x double> [[VCVT_I]]
+// UNCONSTRAINED-LABEL: define dso_local <1 x double> @test_vcvt_f64_u64(
+// UNCONSTRAINED-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[VCVT_I:%.*]] = uitofp <1 x i64> [[A]] to <1 x double>
+// UNCONSTRAINED-NEXT: ret <1 x double> [[VCVT_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <1 x double> @test_vcvt_f64_u64(
+// CONSTRAINED-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[VCVT_I:%.*]] = call <1 x double> @llvm.experimental.constrained.uitofp.v1f64.v1i64(<1 x i64> [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: ret <1 x double> [[VCVT_I]]
+//
float64x1_t test_vcvt_f64_u64(uint64x1_t a) {
return vcvt_f64_u64(a);
}
-// COMMON-LABEL: test_vrnda_f64
-// COMMONIR: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// UNCONSTRAINED: [[VRNDA1_I:%.*]] = call <1 x double> @llvm.round.v1f64(<1 x double> %a)
-// CONSTRAINED: [[VRNDA1_I:%.*]] = call <1 x double> @llvm.experimental.constrained.round.v1f64(<1 x double> %a, metadata !"fpexcept.strict")
-// COMMONIR: ret <1 x double> [[VRNDA1_I]]
+// UNCONSTRAINED-LABEL: define dso_local <1 x double> @test_vrnda_f64(
+// UNCONSTRAINED-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[VRNDA1_I:%.*]] = call <1 x double> @llvm.round.v1f64(<1 x double> [[A]])
+// UNCONSTRAINED-NEXT: ret <1 x double> [[VRNDA1_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <1 x double> @test_vrnda_f64(
+// CONSTRAINED-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[VRNDA1_I:%.*]] = call <1 x double> @llvm.experimental.constrained.round.v1f64(<1 x double> [[A]], metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: ret <1 x double> [[VRNDA1_I]]
+//
float64x1_t test_vrnda_f64(float64x1_t a) {
return vrnda_f64(a);
}
-// COMMON-LABEL: test_vrndp_f64
-// COMMONIR: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// UNCONSTRAINED: [[VRNDP1_I:%.*]] = call <1 x double> @llvm.ceil.v1f64(<1 x double> %a)
-// CONSTRAINED: [[VRNDP1_I:%.*]] = call <1 x double> @llvm.experimental.constrained.ceil.v1f64(<1 x double> %a, metadata !"fpexcept.strict")
-// COMMONIR: ret <1 x double> [[VRNDP1_I]]
+// UNCONSTRAINED-LABEL: define dso_local <1 x double> @test_vrndp_f64(
+// UNCONSTRAINED-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[VRNDP1_I:%.*]] = call <1 x double> @llvm.ceil.v1f64(<1 x double> [[A]])
+// UNCONSTRAINED-NEXT: ret <1 x double> [[VRNDP1_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <1 x double> @test_vrndp_f64(
+// CONSTRAINED-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[VRNDP1_I:%.*]] = call <1 x double> @llvm.experimental.constrained.ceil.v1f64(<1 x double> [[A]], metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: ret <1 x double> [[VRNDP1_I]]
+//
float64x1_t test_vrndp_f64(float64x1_t a) {
return vrndp_f64(a);
}
-// COMMON-LABEL: test_vrndm_f64
-// COMMONIR: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// UNCONSTRAINED: [[VRNDM1_I:%.*]] = call <1 x double> @llvm.floor.v1f64(<1 x double> %a)
-// CONSTRAINED: [[VRNDM1_I:%.*]] = call <1 x double> @llvm.experimental.constrained.floor.v1f64(<1 x double> %a, metadata !"fpexcept.strict")
-// COMMONIR: ret <1 x double> [[VRNDM1_I]]
+// UNCONSTRAINED-LABEL: define dso_local <1 x double> @test_vrndm_f64(
+// UNCONSTRAINED-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[VRNDM1_I:%.*]] = call <1 x double> @llvm.floor.v1f64(<1 x double> [[A]])
+// UNCONSTRAINED-NEXT: ret <1 x double> [[VRNDM1_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <1 x double> @test_vrndm_f64(
+// CONSTRAINED-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[VRNDM1_I:%.*]] = call <1 x double> @llvm.experimental.constrained.floor.v1f64(<1 x double> [[A]], metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: ret <1 x double> [[VRNDM1_I]]
+//
float64x1_t test_vrndm_f64(float64x1_t a) {
return vrndm_f64(a);
}
-// COMMON-LABEL: test_vrndx_f64
-// COMMONIR: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// UNCONSTRAINED: [[VRNDX1_I:%.*]] = call <1 x double> @llvm.rint.v1f64(<1 x double> %a)
-// CONSTRAINED: [[VRNDX1_I:%.*]] = call <1 x double> @llvm.experimental.constrained.rint.v1f64(<1 x double> %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// COMMONIR: ret <1 x double> [[VRNDX1_I]]
+// UNCONSTRAINED-LABEL: define dso_local <1 x double> @test_vrndx_f64(
+// UNCONSTRAINED-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[VRNDX1_I:%.*]] = call <1 x double> @llvm.rint.v1f64(<1 x double> [[A]])
+// UNCONSTRAINED-NEXT: ret <1 x double> [[VRNDX1_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <1 x double> @test_vrndx_f64(
+// CONSTRAINED-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[VRNDX1_I:%.*]] = call <1 x double> @llvm.experimental.constrained.rint.v1f64(<1 x double> [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: ret <1 x double> [[VRNDX1_I]]
+//
float64x1_t test_vrndx_f64(float64x1_t a) {
return vrndx_f64(a);
}
-// COMMON-LABEL: test_vrnd_f64
-// COMMONIR: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// UNCONSTRAINED: [[VRNDZ1_I:%.*]] = call <1 x double> @llvm.trunc.v1f64(<1 x double> %a)
-// CONSTRAINED: [[VRNDZ1_I:%.*]] = call <1 x double> @llvm.experimental.constrained.trunc.v1f64(<1 x double> %a, metadata !"fpexcept.strict")
-// COMMONIR: ret <1 x double> [[VRNDZ1_I]]
+// UNCONSTRAINED-LABEL: define dso_local <1 x double> @test_vrnd_f64(
+// UNCONSTRAINED-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[VRNDZ1_I:%.*]] = call <1 x double> @llvm.trunc.v1f64(<1 x double> [[A]])
+// UNCONSTRAINED-NEXT: ret <1 x double> [[VRNDZ1_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <1 x double> @test_vrnd_f64(
+// CONSTRAINED-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[VRNDZ1_I:%.*]] = call <1 x double> @llvm.experimental.constrained.trunc.v1f64(<1 x double> [[A]], metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: ret <1 x double> [[VRNDZ1_I]]
+//
float64x1_t test_vrnd_f64(float64x1_t a) {
return vrnd_f64(a);
}
-// COMMON-LABEL: test_vrndi_f64
-// COMMONIR: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// UNCONSTRAINED: [[VRNDI1_I:%.*]] = call <1 x double> @llvm.nearbyint.v1f64(<1 x double> %a)
-// CONSTRAINED: [[VRNDI1_I:%.*]] = call <1 x double> @llvm.experimental.constrained.nearbyint.v1f64(<1 x double> %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// COMMONIR: ret <1 x double> [[VRNDI1_I]]
+// UNCONSTRAINED-LABEL: define dso_local <1 x double> @test_vrndi_f64(
+// UNCONSTRAINED-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[VRNDI_V1_I:%.*]] = call <1 x double> @llvm.nearbyint.v1f64(<1 x double> [[A]])
+// UNCONSTRAINED-NEXT: ret <1 x double> [[VRNDI_V1_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <1 x double> @test_vrndi_f64(
+// CONSTRAINED-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[VRNDI_V1_I:%.*]] = call <1 x double> @llvm.experimental.constrained.nearbyint.v1f64(<1 x double> [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: ret <1 x double> [[VRNDI_V1_I]]
+//
float64x1_t test_vrndi_f64(float64x1_t a) {
return vrndi_f64(a);
}
-// COMMON-LABEL: test_vsqrt_f64
-// COMMONIR: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// UNCONSTRAINED: [[VSQRT_I:%.*]] = call <1 x double> @llvm.sqrt.v1f64(<1 x double> %a)
-// CONSTRAINED: [[VSQRT_I:%.*]] = call <1 x double> @llvm.experimental.constrained.sqrt.v1f64(<1 x double> %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// COMMONIR: ret <1 x double> [[VSQRT_I]]
+// UNCONSTRAINED-LABEL: define dso_local <1 x double> @test_vsqrt_f64(
+// UNCONSTRAINED-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[VSQRT_I:%.*]] = call <1 x double> @llvm.sqrt.v1f64(<1 x double> [[A]])
+// UNCONSTRAINED-NEXT: ret <1 x double> [[VSQRT_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <1 x double> @test_vsqrt_f64(
+// CONSTRAINED-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[VSQRT_I:%.*]] = call <1 x double> @llvm.experimental.constrained.sqrt.v1f64(<1 x double> [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: ret <1 x double> [[VSQRT_I]]
+//
float64x1_t test_vsqrt_f64(float64x1_t a) {
return vsqrt_f64(a);
}
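A note on the two prefixes used throughout the file above: the UNCONSTRAINED checks come from the default floating-point model, while the CONSTRAINED checks come from the strict model, where clang must preserve rounding-mode and exception semantics and therefore emits @llvm.experimental.constrained.* calls (carrying explicit !"round.tonearest" / !"fpexcept.strict" metadata) in place of plain fsub/sitofp/@llvm.sqrt. A minimal sketch of source that exercises both shapes, assuming the strict run uses -ffp-exception-behavior=strict (the file's actual RUN lines fall outside this hunk):

#include <arm_neon.h>

// Built once with the default FP model (UNCONSTRAINED shape:
// fsub + @llvm.sqrt.v1f64) and once with -ffp-exception-behavior=strict
// (CONSTRAINED shape: @llvm.experimental.constrained.fsub.v1f64 +
// @llvm.experimental.constrained.sqrt.v1f64).
float64x1_t sub_then_sqrt(float64x1_t a, float64x1_t b) {
  return vsqrt_f64(vsub_f64(a, b));
}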
diff --git a/clang/test/CodeGen/AArch64/neon-intrinsics.c b/clang/test/CodeGen/AArch64/neon-intrinsics.c
index 271ae056308d2f..3cfa0ac3aa6b1c 100644
--- a/clang/test/CodeGen/AArch64/neon-intrinsics.c
+++ b/clang/test/CodeGen/AArch64/neon-intrinsics.c
@@ -1,17418 +1,22471 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
// RUN: -disable-O0-optnone \
// RUN: -flax-vector-conversions=none -emit-llvm -o - %s \
-// RUN: | opt -S -passes=mem2reg \
+// RUN: | opt -S -passes=mem2reg,instcombine \
// RUN: | FileCheck %s
// REQUIRES: aarch64-registered-target || arm-registered-target
#include <arm_neon.h>
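The only change to this file's RUN pipeline is the addition of instcombine after mem2reg. Its visible effect on the regenerated checks is that no-op vector bitcasts inserted by the frontend are folded away before FileCheck sees the IR (compare the removed test_vfma_f32 checks further down, which carried three bitcast-to-<8 x i8> lines, with their replacements); casts that change the observed result type, such as the return-type cast in test_vmla_s16, survive. A small illustration, assuming the same pipeline as the RUN lines above:

#include <arm_neon.h>

// With -passes=mem2reg alone, each float vector argument round-trips
// through an <8 x i8> bitcast in the IR; adding instcombine folds those
// no-op casts away, so the checks match @llvm.fma.v2f32 directly.
float32x2_t fused(float32x2_t acc, float32x2_t x, float32x2_t y) {
  return vfma_f32(acc, x, y);
}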
-// CHECK-LABEL: @test_vadd_s8(
-// CHECK: [[ADD_I:%.*]] = add <8 x i8> %v1, %v2
-// CHECK: ret <8 x i8> [[ADD_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vadd_s8(
+// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i8> [[V1]], [[V2]]
+// CHECK-NEXT: ret <8 x i8> [[ADD_I]]
+//
int8x8_t test_vadd_s8(int8x8_t v1, int8x8_t v2) {
return vadd_s8(v1, v2);
}
-// CHECK-LABEL: @test_vadd_s16(
-// CHECK: [[ADD_I:%.*]] = add <4 x i16> %v1, %v2
-// CHECK: ret <4 x i16> [[ADD_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vadd_s16(
+// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i16> [[V1]], [[V2]]
+// CHECK-NEXT: ret <4 x i16> [[ADD_I]]
+//
int16x4_t test_vadd_s16(int16x4_t v1, int16x4_t v2) {
return vadd_s16(v1, v2);
}
-// CHECK-LABEL: @test_vadd_s32(
-// CHECK: [[ADD_I:%.*]] = add <2 x i32> %v1, %v2
-// CHECK: ret <2 x i32> [[ADD_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vadd_s32(
+// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i32> [[V1]], [[V2]]
+// CHECK-NEXT: ret <2 x i32> [[ADD_I]]
+//
int32x2_t test_vadd_s32(int32x2_t v1, int32x2_t v2) {
return vadd_s32(v1, v2);
}
-// CHECK-LABEL: @test_vadd_s64(
-// CHECK: [[ADD_I:%.*]] = add <1 x i64> %v1, %v2
-// CHECK: ret <1 x i64> [[ADD_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vadd_s64(
+// CHECK-SAME: <1 x i64> noundef [[V1:%.*]], <1 x i64> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <1 x i64> [[V1]], [[V2]]
+// CHECK-NEXT: ret <1 x i64> [[ADD_I]]
+//
int64x1_t test_vadd_s64(int64x1_t v1, int64x1_t v2) {
return vadd_s64(v1, v2);
}
-// CHECK-LABEL: @test_vadd_f32(
-// CHECK: [[ADD_I:%.*]] = fadd <2 x float> %v1, %v2
-// CHECK: ret <2 x float> [[ADD_I]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vadd_f32(
+// CHECK-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = fadd <2 x float> [[V1]], [[V2]]
+// CHECK-NEXT: ret <2 x float> [[ADD_I]]
+//
float32x2_t test_vadd_f32(float32x2_t v1, float32x2_t v2) {
return vadd_f32(v1, v2);
}
-// CHECK-LABEL: @test_vadd_u8(
-// CHECK: [[ADD_I:%.*]] = add <8 x i8> %v1, %v2
-// CHECK: ret <8 x i8> [[ADD_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vadd_u8(
+// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i8> [[V1]], [[V2]]
+// CHECK-NEXT: ret <8 x i8> [[ADD_I]]
+//
uint8x8_t test_vadd_u8(uint8x8_t v1, uint8x8_t v2) {
return vadd_u8(v1, v2);
}
-// CHECK-LABEL: @test_vadd_u16(
-// CHECK: [[ADD_I:%.*]] = add <4 x i16> %v1, %v2
-// CHECK: ret <4 x i16> [[ADD_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vadd_u16(
+// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i16> [[V1]], [[V2]]
+// CHECK-NEXT: ret <4 x i16> [[ADD_I]]
+//
uint16x4_t test_vadd_u16(uint16x4_t v1, uint16x4_t v2) {
return vadd_u16(v1, v2);
}
-// CHECK-LABEL: @test_vadd_u32(
-// CHECK: [[ADD_I:%.*]] = add <2 x i32> %v1, %v2
-// CHECK: ret <2 x i32> [[ADD_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vadd_u32(
+// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i32> [[V1]], [[V2]]
+// CHECK-NEXT: ret <2 x i32> [[ADD_I]]
+//
uint32x2_t test_vadd_u32(uint32x2_t v1, uint32x2_t v2) {
return vadd_u32(v1, v2);
}
-// CHECK-LABEL: @test_vadd_u64(
-// CHECK: [[ADD_I:%.*]] = add <1 x i64> %v1, %v2
-// CHECK: ret <1 x i64> [[ADD_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vadd_u64(
+// CHECK-SAME: <1 x i64> noundef [[V1:%.*]], <1 x i64> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <1 x i64> [[V1]], [[V2]]
+// CHECK-NEXT: ret <1 x i64> [[ADD_I]]
+//
uint64x1_t test_vadd_u64(uint64x1_t v1, uint64x1_t v2) {
return vadd_u64(v1, v2);
}
-// CHECK-LABEL: @test_vaddq_s8(
-// CHECK: [[ADD_I:%.*]] = add <16 x i8> %v1, %v2
-// CHECK: ret <16 x i8> [[ADD_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vaddq_s8(
+// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <16 x i8> [[V1]], [[V2]]
+// CHECK-NEXT: ret <16 x i8> [[ADD_I]]
+//
int8x16_t test_vaddq_s8(int8x16_t v1, int8x16_t v2) {
return vaddq_s8(v1, v2);
}
-// CHECK-LABEL: @test_vaddq_s16(
-// CHECK: [[ADD_I:%.*]] = add <8 x i16> %v1, %v2
-// CHECK: ret <8 x i16> [[ADD_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vaddq_s16(
+// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[V1]], [[V2]]
+// CHECK-NEXT: ret <8 x i16> [[ADD_I]]
+//
int16x8_t test_vaddq_s16(int16x8_t v1, int16x8_t v2) {
return vaddq_s16(v1, v2);
}
-// CHECK-LABEL: @test_vaddq_s32(
-// CHECK: [[ADD_I:%.*]] = add <4 x i32> %v1, %v2
-// CHECK: ret <4 x i32> [[ADD_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vaddq_s32(
+// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[V1]], [[V2]]
+// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
+//
int32x4_t test_vaddq_s32(int32x4_t v1, int32x4_t v2) {
return vaddq_s32(v1, v2);
}
-// CHECK-LABEL: @test_vaddq_s64(
-// CHECK: [[ADD_I:%.*]] = add <2 x i64> %v1, %v2
-// CHECK: ret <2 x i64> [[ADD_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vaddq_s64(
+// CHECK-SAME: <2 x i64> noundef [[V1:%.*]], <2 x i64> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[V1]], [[V2]]
+// CHECK-NEXT: ret <2 x i64> [[ADD_I]]
+//
int64x2_t test_vaddq_s64(int64x2_t v1, int64x2_t v2) {
return vaddq_s64(v1, v2);
}
-// CHECK-LABEL: @test_vaddq_f32(
-// CHECK: [[ADD_I:%.*]] = fadd <4 x float> %v1, %v2
-// CHECK: ret <4 x float> [[ADD_I]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vaddq_f32(
+// CHECK-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = fadd <4 x float> [[V1]], [[V2]]
+// CHECK-NEXT: ret <4 x float> [[ADD_I]]
+//
float32x4_t test_vaddq_f32(float32x4_t v1, float32x4_t v2) {
return vaddq_f32(v1, v2);
}
-// CHECK-LABEL: @test_vaddq_f64(
-// CHECK: [[ADD_I:%.*]] = fadd <2 x double> %v1, %v2
-// CHECK: ret <2 x double> [[ADD_I]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vaddq_f64(
+// CHECK-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = fadd <2 x double> [[V1]], [[V2]]
+// CHECK-NEXT: ret <2 x double> [[ADD_I]]
+//
float64x2_t test_vaddq_f64(float64x2_t v1, float64x2_t v2) {
return vaddq_f64(v1, v2);
}
-// CHECK-LABEL: @test_vaddq_u8(
-// CHECK: [[ADD_I:%.*]] = add <16 x i8> %v1, %v2
-// CHECK: ret <16 x i8> [[ADD_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vaddq_u8(
+// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <16 x i8> [[V1]], [[V2]]
+// CHECK-NEXT: ret <16 x i8> [[ADD_I]]
+//
uint8x16_t test_vaddq_u8(uint8x16_t v1, uint8x16_t v2) {
return vaddq_u8(v1, v2);
}
-// CHECK-LABEL: @test_vaddq_u16(
-// CHECK: [[ADD_I:%.*]] = add <8 x i16> %v1, %v2
-// CHECK: ret <8 x i16> [[ADD_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vaddq_u16(
+// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[V1]], [[V2]]
+// CHECK-NEXT: ret <8 x i16> [[ADD_I]]
+//
uint16x8_t test_vaddq_u16(uint16x8_t v1, uint16x8_t v2) {
return vaddq_u16(v1, v2);
}
-// CHECK-LABEL: @test_vaddq_u32(
-// CHECK: [[ADD_I:%.*]] = add <4 x i32> %v1, %v2
-// CHECK: ret <4 x i32> [[ADD_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vaddq_u32(
+// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[V1]], [[V2]]
+// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
+//
uint32x4_t test_vaddq_u32(uint32x4_t v1, uint32x4_t v2) {
return vaddq_u32(v1, v2);
}
-// CHECK-LABEL: @test_vaddq_u64(
-// CHECK: [[ADD_I:%.*]] = add <2 x i64> %v1, %v2
-// CHECK: ret <2 x i64> [[ADD_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vaddq_u64(
+// CHECK-SAME: <2 x i64> noundef [[V1:%.*]], <2 x i64> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[V1]], [[V2]]
+// CHECK-NEXT: ret <2 x i64> [[ADD_I]]
+//
uint64x2_t test_vaddq_u64(uint64x2_t v1, uint64x2_t v2) {
return vaddq_u64(v1, v2);
}
-// CHECK-LABEL: @test_vsub_s8(
-// CHECK: [[SUB_I:%.*]] = sub <8 x i8> %v1, %v2
-// CHECK: ret <8 x i8> [[SUB_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vsub_s8(
+// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i8> [[V1]], [[V2]]
+// CHECK-NEXT: ret <8 x i8> [[SUB_I]]
+//
int8x8_t test_vsub_s8(int8x8_t v1, int8x8_t v2) {
return vsub_s8(v1, v2);
}
-// CHECK-LABEL: @test_vsub_s16(
-// CHECK: [[SUB_I:%.*]] = sub <4 x i16> %v1, %v2
-// CHECK: ret <4 x i16> [[SUB_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vsub_s16(
+// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i16> [[V1]], [[V2]]
+// CHECK-NEXT: ret <4 x i16> [[SUB_I]]
+//
int16x4_t test_vsub_s16(int16x4_t v1, int16x4_t v2) {
return vsub_s16(v1, v2);
}
-// CHECK-LABEL: @test_vsub_s32(
-// CHECK: [[SUB_I:%.*]] = sub <2 x i32> %v1, %v2
-// CHECK: ret <2 x i32> [[SUB_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vsub_s32(
+// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i32> [[V1]], [[V2]]
+// CHECK-NEXT: ret <2 x i32> [[SUB_I]]
+//
int32x2_t test_vsub_s32(int32x2_t v1, int32x2_t v2) {
return vsub_s32(v1, v2);
}
-// CHECK-LABEL: @test_vsub_s64(
-// CHECK: [[SUB_I:%.*]] = sub <1 x i64> %v1, %v2
-// CHECK: ret <1 x i64> [[SUB_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vsub_s64(
+// CHECK-SAME: <1 x i64> noundef [[V1:%.*]], <1 x i64> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <1 x i64> [[V1]], [[V2]]
+// CHECK-NEXT: ret <1 x i64> [[SUB_I]]
+//
int64x1_t test_vsub_s64(int64x1_t v1, int64x1_t v2) {
return vsub_s64(v1, v2);
}
-// CHECK-LABEL: @test_vsub_f32(
-// CHECK: [[SUB_I:%.*]] = fsub <2 x float> %v1, %v2
-// CHECK: ret <2 x float> [[SUB_I]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vsub_f32(
+// CHECK-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = fsub <2 x float> [[V1]], [[V2]]
+// CHECK-NEXT: ret <2 x float> [[SUB_I]]
+//
float32x2_t test_vsub_f32(float32x2_t v1, float32x2_t v2) {
return vsub_f32(v1, v2);
}
-// CHECK-LABEL: @test_vsub_u8(
-// CHECK: [[SUB_I:%.*]] = sub <8 x i8> %v1, %v2
-// CHECK: ret <8 x i8> [[SUB_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vsub_u8(
+// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i8> [[V1]], [[V2]]
+// CHECK-NEXT: ret <8 x i8> [[SUB_I]]
+//
uint8x8_t test_vsub_u8(uint8x8_t v1, uint8x8_t v2) {
return vsub_u8(v1, v2);
}
-// CHECK-LABEL: @test_vsub_u16(
-// CHECK: [[SUB_I:%.*]] = sub <4 x i16> %v1, %v2
-// CHECK: ret <4 x i16> [[SUB_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vsub_u16(
+// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i16> [[V1]], [[V2]]
+// CHECK-NEXT: ret <4 x i16> [[SUB_I]]
+//
uint16x4_t test_vsub_u16(uint16x4_t v1, uint16x4_t v2) {
return vsub_u16(v1, v2);
}
-// CHECK-LABEL: @test_vsub_u32(
-// CHECK: [[SUB_I:%.*]] = sub <2 x i32> %v1, %v2
-// CHECK: ret <2 x i32> [[SUB_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vsub_u32(
+// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i32> [[V1]], [[V2]]
+// CHECK-NEXT: ret <2 x i32> [[SUB_I]]
+//
uint32x2_t test_vsub_u32(uint32x2_t v1, uint32x2_t v2) {
return vsub_u32(v1, v2);
}
-// CHECK-LABEL: @test_vsub_u64(
-// CHECK: [[SUB_I:%.*]] = sub <1 x i64> %v1, %v2
-// CHECK: ret <1 x i64> [[SUB_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vsub_u64(
+// CHECK-SAME: <1 x i64> noundef [[V1:%.*]], <1 x i64> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <1 x i64> [[V1]], [[V2]]
+// CHECK-NEXT: ret <1 x i64> [[SUB_I]]
+//
uint64x1_t test_vsub_u64(uint64x1_t v1, uint64x1_t v2) {
return vsub_u64(v1, v2);
}
-// CHECK-LABEL: @test_vsubq_s8(
-// CHECK: [[SUB_I:%.*]] = sub <16 x i8> %v1, %v2
-// CHECK: ret <16 x i8> [[SUB_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vsubq_s8(
+// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <16 x i8> [[V1]], [[V2]]
+// CHECK-NEXT: ret <16 x i8> [[SUB_I]]
+//
int8x16_t test_vsubq_s8(int8x16_t v1, int8x16_t v2) {
return vsubq_s8(v1, v2);
}
-// CHECK-LABEL: @test_vsubq_s16(
-// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %v1, %v2
-// CHECK: ret <8 x i16> [[SUB_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vsubq_s16(
+// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> [[V1]], [[V2]]
+// CHECK-NEXT: ret <8 x i16> [[SUB_I]]
+//
int16x8_t test_vsubq_s16(int16x8_t v1, int16x8_t v2) {
return vsubq_s16(v1, v2);
}
-// CHECK-LABEL: @test_vsubq_s32(
-// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %v1, %v2
-// CHECK: ret <4 x i32> [[SUB_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vsubq_s32(
+// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[V1]], [[V2]]
+// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
+//
int32x4_t test_vsubq_s32(int32x4_t v1, int32x4_t v2) {
return vsubq_s32(v1, v2);
}
-// CHECK-LABEL: @test_vsubq_s64(
-// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %v1, %v2
-// CHECK: ret <2 x i64> [[SUB_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vsubq_s64(
+// CHECK-SAME: <2 x i64> noundef [[V1:%.*]], <2 x i64> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[V1]], [[V2]]
+// CHECK-NEXT: ret <2 x i64> [[SUB_I]]
+//
int64x2_t test_vsubq_s64(int64x2_t v1, int64x2_t v2) {
return vsubq_s64(v1, v2);
}
-// CHECK-LABEL: @test_vsubq_f32(
-// CHECK: [[SUB_I:%.*]] = fsub <4 x float> %v1, %v2
-// CHECK: ret <4 x float> [[SUB_I]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vsubq_f32(
+// CHECK-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = fsub <4 x float> [[V1]], [[V2]]
+// CHECK-NEXT: ret <4 x float> [[SUB_I]]
+//
float32x4_t test_vsubq_f32(float32x4_t v1, float32x4_t v2) {
return vsubq_f32(v1, v2);
}
-// CHECK-LABEL: @test_vsubq_f64(
-// CHECK: [[SUB_I:%.*]] = fsub <2 x double> %v1, %v2
-// CHECK: ret <2 x double> [[SUB_I]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vsubq_f64(
+// CHECK-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = fsub <2 x double> [[V1]], [[V2]]
+// CHECK-NEXT: ret <2 x double> [[SUB_I]]
+//
float64x2_t test_vsubq_f64(float64x2_t v1, float64x2_t v2) {
return vsubq_f64(v1, v2);
}
-// CHECK-LABEL: @test_vsubq_u8(
-// CHECK: [[SUB_I:%.*]] = sub <16 x i8> %v1, %v2
-// CHECK: ret <16 x i8> [[SUB_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vsubq_u8(
+// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <16 x i8> [[V1]], [[V2]]
+// CHECK-NEXT: ret <16 x i8> [[SUB_I]]
+//
uint8x16_t test_vsubq_u8(uint8x16_t v1, uint8x16_t v2) {
return vsubq_u8(v1, v2);
}
-// CHECK-LABEL: @test_vsubq_u16(
-// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %v1, %v2
-// CHECK: ret <8 x i16> [[SUB_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vsubq_u16(
+// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> [[V1]], [[V2]]
+// CHECK-NEXT: ret <8 x i16> [[SUB_I]]
+//
uint16x8_t test_vsubq_u16(uint16x8_t v1, uint16x8_t v2) {
return vsubq_u16(v1, v2);
}
-// CHECK-LABEL: @test_vsubq_u32(
-// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %v1, %v2
-// CHECK: ret <4 x i32> [[SUB_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vsubq_u32(
+// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[V1]], [[V2]]
+// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
+//
uint32x4_t test_vsubq_u32(uint32x4_t v1, uint32x4_t v2) {
return vsubq_u32(v1, v2);
}
-// CHECK-LABEL: @test_vsubq_u64(
-// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %v1, %v2
-// CHECK: ret <2 x i64> [[SUB_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vsubq_u64(
+// CHECK-SAME: <2 x i64> noundef [[V1:%.*]], <2 x i64> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[V1]], [[V2]]
+// CHECK-NEXT: ret <2 x i64> [[SUB_I]]
+//
uint64x2_t test_vsubq_u64(uint64x2_t v1, uint64x2_t v2) {
return vsubq_u64(v1, v2);
}
-// CHECK-LABEL: @test_vmul_s8(
-// CHECK: [[MUL_I:%.*]] = mul <8 x i8> %v1, %v2
-// CHECK: ret <8 x i8> [[MUL_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vmul_s8(
+// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i8> [[V1]], [[V2]]
+// CHECK-NEXT: ret <8 x i8> [[MUL_I]]
+//
int8x8_t test_vmul_s8(int8x8_t v1, int8x8_t v2) {
return vmul_s8(v1, v2);
}
-// CHECK-LABEL: @test_vmul_s16(
-// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %v1, %v2
-// CHECK: ret <4 x i16> [[MUL_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vmul_s16(
+// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i16> [[V1]], [[V2]]
+// CHECK-NEXT: ret <4 x i16> [[MUL_I]]
+//
int16x4_t test_vmul_s16(int16x4_t v1, int16x4_t v2) {
return vmul_s16(v1, v2);
}
-// CHECK-LABEL: @test_vmul_s32(
-// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %v1, %v2
-// CHECK: ret <2 x i32> [[MUL_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vmul_s32(
+// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <2 x i32> [[V1]], [[V2]]
+// CHECK-NEXT: ret <2 x i32> [[MUL_I]]
+//
int32x2_t test_vmul_s32(int32x2_t v1, int32x2_t v2) {
return vmul_s32(v1, v2);
}
-// CHECK-LABEL: @test_vmul_f32(
-// CHECK: [[MUL_I:%.*]] = fmul <2 x float> %v1, %v2
-// CHECK: ret <2 x float> [[MUL_I]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vmul_f32(
+// CHECK-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = fmul <2 x float> [[V1]], [[V2]]
+// CHECK-NEXT: ret <2 x float> [[MUL_I]]
+//
float32x2_t test_vmul_f32(float32x2_t v1, float32x2_t v2) {
return vmul_f32(v1, v2);
}
-// CHECK-LABEL: @test_vmul_u8(
-// CHECK: [[MUL_I:%.*]] = mul <8 x i8> %v1, %v2
-// CHECK: ret <8 x i8> [[MUL_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vmul_u8(
+// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i8> [[V1]], [[V2]]
+// CHECK-NEXT: ret <8 x i8> [[MUL_I]]
+//
uint8x8_t test_vmul_u8(uint8x8_t v1, uint8x8_t v2) {
return vmul_u8(v1, v2);
}
-// CHECK-LABEL: @test_vmul_u16(
-// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %v1, %v2
-// CHECK: ret <4 x i16> [[MUL_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vmul_u16(
+// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i16> [[V1]], [[V2]]
+// CHECK-NEXT: ret <4 x i16> [[MUL_I]]
+//
uint16x4_t test_vmul_u16(uint16x4_t v1, uint16x4_t v2) {
return vmul_u16(v1, v2);
}
-// CHECK-LABEL: @test_vmul_u32(
-// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %v1, %v2
-// CHECK: ret <2 x i32> [[MUL_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vmul_u32(
+// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <2 x i32> [[V1]], [[V2]]
+// CHECK-NEXT: ret <2 x i32> [[MUL_I]]
+//
uint32x2_t test_vmul_u32(uint32x2_t v1, uint32x2_t v2) {
return vmul_u32(v1, v2);
}
-// CHECK-LABEL: @test_vmulq_s8(
-// CHECK: [[MUL_I:%.*]] = mul <16 x i8> %v1, %v2
-// CHECK: ret <16 x i8> [[MUL_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vmulq_s8(
+// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <16 x i8> [[V1]], [[V2]]
+// CHECK-NEXT: ret <16 x i8> [[MUL_I]]
+//
int8x16_t test_vmulq_s8(int8x16_t v1, int8x16_t v2) {
return vmulq_s8(v1, v2);
}
-// CHECK-LABEL: @test_vmulq_s16(
-// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %v1, %v2
-// CHECK: ret <8 x i16> [[MUL_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vmulq_s16(
+// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i16> [[V1]], [[V2]]
+// CHECK-NEXT: ret <8 x i16> [[MUL_I]]
+//
int16x8_t test_vmulq_s16(int16x8_t v1, int16x8_t v2) {
return vmulq_s16(v1, v2);
}
-// CHECK-LABEL: @test_vmulq_s32(
-// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %v1, %v2
-// CHECK: ret <4 x i32> [[MUL_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vmulq_s32(
+// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i32> [[V1]], [[V2]]
+// CHECK-NEXT: ret <4 x i32> [[MUL_I]]
+//
int32x4_t test_vmulq_s32(int32x4_t v1, int32x4_t v2) {
return vmulq_s32(v1, v2);
}
-// CHECK-LABEL: @test_vmulq_u8(
-// CHECK: [[MUL_I:%.*]] = mul <16 x i8> %v1, %v2
-// CHECK: ret <16 x i8> [[MUL_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vmulq_u8(
+// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <16 x i8> [[V1]], [[V2]]
+// CHECK-NEXT: ret <16 x i8> [[MUL_I]]
+//
uint8x16_t test_vmulq_u8(uint8x16_t v1, uint8x16_t v2) {
return vmulq_u8(v1, v2);
}
-// CHECK-LABEL: @test_vmulq_u16(
-// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %v1, %v2
-// CHECK: ret <8 x i16> [[MUL_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vmulq_u16(
+// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i16> [[V1]], [[V2]]
+// CHECK-NEXT: ret <8 x i16> [[MUL_I]]
+//
uint16x8_t test_vmulq_u16(uint16x8_t v1, uint16x8_t v2) {
return vmulq_u16(v1, v2);
}
-// CHECK-LABEL: @test_vmulq_u32(
-// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %v1, %v2
-// CHECK: ret <4 x i32> [[MUL_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vmulq_u32(
+// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i32> [[V1]], [[V2]]
+// CHECK-NEXT: ret <4 x i32> [[MUL_I]]
+//
uint32x4_t test_vmulq_u32(uint32x4_t v1, uint32x4_t v2) {
return vmulq_u32(v1, v2);
}
-// CHECK-LABEL: @test_vmulq_f32(
-// CHECK: [[MUL_I:%.*]] = fmul <4 x float> %v1, %v2
-// CHECK: ret <4 x float> [[MUL_I]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vmulq_f32(
+// CHECK-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = fmul <4 x float> [[V1]], [[V2]]
+// CHECK-NEXT: ret <4 x float> [[MUL_I]]
+//
float32x4_t test_vmulq_f32(float32x4_t v1, float32x4_t v2) {
return vmulq_f32(v1, v2);
}
-// CHECK-LABEL: @test_vmulq_f64(
-// CHECK: [[MUL_I:%.*]] = fmul <2 x double> %v1, %v2
-// CHECK: ret <2 x double> [[MUL_I]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vmulq_f64(
+// CHECK-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = fmul <2 x double> [[V1]], [[V2]]
+// CHECK-NEXT: ret <2 x double> [[MUL_I]]
+//
float64x2_t test_vmulq_f64(float64x2_t v1, float64x2_t v2) {
return vmulq_f64(v1, v2);
}
-// CHECK-LABEL: @test_vmul_p8(
-// CHECK: [[VMUL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.pmul.v8i8(<8 x i8> %v1, <8 x i8> %v2)
-// CHECK: ret <8 x i8> [[VMUL_V_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vmul_p8(
+// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMUL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.pmul.v8i8(<8 x i8> [[V1]], <8 x i8> [[V2]])
+// CHECK-NEXT: ret <8 x i8> [[VMUL_V_I]]
+//
poly8x8_t test_vmul_p8(poly8x8_t v1, poly8x8_t v2) {
return vmul_p8(v1, v2);
}
-// CHECK-LABEL: @test_vmulq_p8(
-// CHECK: [[VMULQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.pmul.v16i8(<16 x i8> %v1, <16 x i8> %v2)
-// CHECK: ret <16 x i8> [[VMULQ_V_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vmulq_p8(
+// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMULQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.pmul.v16i8(<16 x i8> [[V1]], <16 x i8> [[V2]])
+// CHECK-NEXT: ret <16 x i8> [[VMULQ_V_I]]
+//
poly8x16_t test_vmulq_p8(poly8x16_t v1, poly8x16_t v2) {
return vmulq_p8(v1, v2);
}
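Unlike the integer and floating-point multiplies above, the polynomial multiply has no plain IR instruction, so it remains a target intrinsic (@llvm.aarch64.neon.pmul) even after instcombine. For reference, the per-lane semantics are a carry-less (GF(2)) multiply truncated to 8 bits; a scalar model of one lane (poly8_mul_lane is illustrative, not part of the test):

#include <stdint.h>

// Carry-less multiply: partial products are combined with XOR instead
// of addition, and vmul_p8/vmulq_p8 keep only the low 8 bits per lane.
static uint8_t poly8_mul_lane(uint8_t a, uint8_t b) {
  uint16_t acc = 0;
  for (int i = 0; i < 8; ++i)
    if (b & (1u << i))
      acc ^= (uint16_t)a << i;
  return (uint8_t)acc;
}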
-// CHECK-LABEL: @test_vmla_s8(
-// CHECK: [[MUL_I:%.*]] = mul <8 x i8> %v2, %v3
-// CHECK: [[ADD_I:%.*]] = add <8 x i8> %v1, [[MUL_I]]
-// CHECK: ret <8 x i8> [[ADD_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vmla_s8(
+// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]], <8 x i8> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i8> [[V2]], [[V3]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i8> [[V1]], [[MUL_I]]
+// CHECK-NEXT: ret <8 x i8> [[ADD_I]]
+//
int8x8_t test_vmla_s8(int8x8_t v1, int8x8_t v2, int8x8_t v3) {
return vmla_s8(v1, v2, v3);
}
-// CHECK-LABEL: @test_vmla_s16(
-// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %v2, %v3
-// CHECK: [[ADD_I:%.*]] = add <4 x i16> %v1, [[MUL_I]]
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[ADD_I]] to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vmla_s16(
+// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]], <4 x i16> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i16> [[V2]], [[V3]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i16> [[V1]], [[MUL_I]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[ADD_I]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
int8x8_t test_vmla_s16(int16x4_t v1, int16x4_t v2, int16x4_t v3) {
return (int8x8_t)vmla_s16(v1, v2, v3);
}
-// CHECK-LABEL: @test_vmla_s32(
-// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %v2, %v3
-// CHECK: [[ADD_I:%.*]] = add <2 x i32> %v1, [[MUL_I]]
-// CHECK: ret <2 x i32> [[ADD_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vmla_s32(
+// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]], <2 x i32> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <2 x i32> [[V2]], [[V3]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i32> [[V1]], [[MUL_I]]
+// CHECK-NEXT: ret <2 x i32> [[ADD_I]]
+//
int32x2_t test_vmla_s32(int32x2_t v1, int32x2_t v2, int32x2_t v3) {
return vmla_s32(v1, v2, v3);
}
-// CHECK-LABEL: @test_vmla_f32(
-// CHECK: [[MUL_I:%.*]] = fmul <2 x float> %v2, %v3
-// CHECK: [[ADD_I:%.*]] = fadd <2 x float> %v1, [[MUL_I]]
-// CHECK: ret <2 x float> [[ADD_I]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vmla_f32(
+// CHECK-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]], <2 x float> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = fmul <2 x float> [[V2]], [[V3]]
+// CHECK-NEXT: [[ADD_I:%.*]] = fadd <2 x float> [[V1]], [[MUL_I]]
+// CHECK-NEXT: ret <2 x float> [[ADD_I]]
+//
float32x2_t test_vmla_f32(float32x2_t v1, float32x2_t v2, float32x2_t v3) {
return vmla_f32(v1, v2, v3);
}
-// CHECK-LABEL: @test_vmla_u8(
-// CHECK: [[MUL_I:%.*]] = mul <8 x i8> %v2, %v3
-// CHECK: [[ADD_I:%.*]] = add <8 x i8> %v1, [[MUL_I]]
-// CHECK: ret <8 x i8> [[ADD_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vmla_u8(
+// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]], <8 x i8> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i8> [[V2]], [[V3]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i8> [[V1]], [[MUL_I]]
+// CHECK-NEXT: ret <8 x i8> [[ADD_I]]
+//
uint8x8_t test_vmla_u8(uint8x8_t v1, uint8x8_t v2, uint8x8_t v3) {
return vmla_u8(v1, v2, v3);
}
-// CHECK-LABEL: @test_vmla_u16(
-// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %v2, %v3
-// CHECK: [[ADD_I:%.*]] = add <4 x i16> %v1, [[MUL_I]]
-// CHECK: ret <4 x i16> [[ADD_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vmla_u16(
+// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]], <4 x i16> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i16> [[V2]], [[V3]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i16> [[V1]], [[MUL_I]]
+// CHECK-NEXT: ret <4 x i16> [[ADD_I]]
+//
uint16x4_t test_vmla_u16(uint16x4_t v1, uint16x4_t v2, uint16x4_t v3) {
return vmla_u16(v1, v2, v3);
}
-// CHECK-LABEL: @test_vmla_u32(
-// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %v2, %v3
-// CHECK: [[ADD_I:%.*]] = add <2 x i32> %v1, [[MUL_I]]
-// CHECK: ret <2 x i32> [[ADD_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vmla_u32(
+// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]], <2 x i32> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <2 x i32> [[V2]], [[V3]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i32> [[V1]], [[MUL_I]]
+// CHECK-NEXT: ret <2 x i32> [[ADD_I]]
+//
uint32x2_t test_vmla_u32(uint32x2_t v1, uint32x2_t v2, uint32x2_t v3) {
return vmla_u32(v1, v2, v3);
}
-// CHECK-LABEL: @test_vmlaq_s8(
-// CHECK: [[MUL_I:%.*]] = mul <16 x i8> %v2, %v3
-// CHECK: [[ADD_I:%.*]] = add <16 x i8> %v1, [[MUL_I]]
-// CHECK: ret <16 x i8> [[ADD_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vmlaq_s8(
+// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]], <16 x i8> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <16 x i8> [[V2]], [[V3]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <16 x i8> [[V1]], [[MUL_I]]
+// CHECK-NEXT: ret <16 x i8> [[ADD_I]]
+//
int8x16_t test_vmlaq_s8(int8x16_t v1, int8x16_t v2, int8x16_t v3) {
return vmlaq_s8(v1, v2, v3);
}
-// CHECK-LABEL: @test_vmlaq_s16(
-// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %v2, %v3
-// CHECK: [[ADD_I:%.*]] = add <8 x i16> %v1, [[MUL_I]]
-// CHECK: ret <8 x i16> [[ADD_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vmlaq_s16(
+// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]], <8 x i16> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i16> [[V2]], [[V3]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[V1]], [[MUL_I]]
+// CHECK-NEXT: ret <8 x i16> [[ADD_I]]
+//
int16x8_t test_vmlaq_s16(int16x8_t v1, int16x8_t v2, int16x8_t v3) {
return vmlaq_s16(v1, v2, v3);
}
-// CHECK-LABEL: @test_vmlaq_s32(
-// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %v2, %v3
-// CHECK: [[ADD_I:%.*]] = add <4 x i32> %v1, [[MUL_I]]
-// CHECK: ret <4 x i32> [[ADD_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vmlaq_s32(
+// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]], <4 x i32> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i32> [[V2]], [[V3]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[V1]], [[MUL_I]]
+// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
+//
int32x4_t test_vmlaq_s32(int32x4_t v1, int32x4_t v2, int32x4_t v3) {
return vmlaq_s32(v1, v2, v3);
}
-// CHECK-LABEL: @test_vmlaq_f32(
-// CHECK: [[MUL_I:%.*]] = fmul <4 x float> %v2, %v3
-// CHECK: [[ADD_I:%.*]] = fadd <4 x float> %v1, [[MUL_I]]
-// CHECK: ret <4 x float> [[ADD_I]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vmlaq_f32(
+// CHECK-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]], <4 x float> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = fmul <4 x float> [[V2]], [[V3]]
+// CHECK-NEXT: [[ADD_I:%.*]] = fadd <4 x float> [[V1]], [[MUL_I]]
+// CHECK-NEXT: ret <4 x float> [[ADD_I]]
+//
float32x4_t test_vmlaq_f32(float32x4_t v1, float32x4_t v2, float32x4_t v3) {
return vmlaq_f32(v1, v2, v3);
}
-// CHECK-LABEL: @test_vmlaq_u8(
-// CHECK: [[MUL_I:%.*]] = mul <16 x i8> %v2, %v3
-// CHECK: [[ADD_I:%.*]] = add <16 x i8> %v1, [[MUL_I]]
-// CHECK: ret <16 x i8> [[ADD_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vmlaq_u8(
+// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]], <16 x i8> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <16 x i8> [[V2]], [[V3]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <16 x i8> [[V1]], [[MUL_I]]
+// CHECK-NEXT: ret <16 x i8> [[ADD_I]]
+//
uint8x16_t test_vmlaq_u8(uint8x16_t v1, uint8x16_t v2, uint8x16_t v3) {
return vmlaq_u8(v1, v2, v3);
}
-// CHECK-LABEL: @test_vmlaq_u16(
-// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %v2, %v3
-// CHECK: [[ADD_I:%.*]] = add <8 x i16> %v1, [[MUL_I]]
-// CHECK: ret <8 x i16> [[ADD_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vmlaq_u16(
+// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]], <8 x i16> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i16> [[V2]], [[V3]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[V1]], [[MUL_I]]
+// CHECK-NEXT: ret <8 x i16> [[ADD_I]]
+//
uint16x8_t test_vmlaq_u16(uint16x8_t v1, uint16x8_t v2, uint16x8_t v3) {
return vmlaq_u16(v1, v2, v3);
}
-// CHECK-LABEL: @test_vmlaq_u32(
-// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %v2, %v3
-// CHECK: [[ADD_I:%.*]] = add <4 x i32> %v1, [[MUL_I]]
-// CHECK: ret <4 x i32> [[ADD_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vmlaq_u32(
+// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]], <4 x i32> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i32> [[V2]], [[V3]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[V1]], [[MUL_I]]
+// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
+//
uint32x4_t test_vmlaq_u32(uint32x4_t v1, uint32x4_t v2, uint32x4_t v3) {
return vmlaq_u32(v1, v2, v3);
}
-// CHECK-LABEL: @test_vmlaq_f64(
-// CHECK: [[MUL_I:%.*]] = fmul <2 x double> %v2, %v3
-// CHECK: [[ADD_I:%.*]] = fadd <2 x double> %v1, [[MUL_I]]
-// CHECK: ret <2 x double> [[ADD_I]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vmlaq_f64(
+// CHECK-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]], <2 x double> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = fmul <2 x double> [[V2]], [[V3]]
+// CHECK-NEXT: [[ADD_I:%.*]] = fadd <2 x double> [[V1]], [[MUL_I]]
+// CHECK-NEXT: ret <2 x double> [[ADD_I]]
+//
float64x2_t test_vmlaq_f64(float64x2_t v1, float64x2_t v2, float64x2_t v3) {
return vmlaq_f64(v1, v2, v3);
}
-// CHECK-LABEL: @test_vmls_s8(
-// CHECK: [[MUL_I:%.*]] = mul <8 x i8> %v2, %v3
-// CHECK: [[SUB_I:%.*]] = sub <8 x i8> %v1, [[MUL_I]]
-// CHECK: ret <8 x i8> [[SUB_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vmls_s8(
+// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]], <8 x i8> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i8> [[V2]], [[V3]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i8> [[V1]], [[MUL_I]]
+// CHECK-NEXT: ret <8 x i8> [[SUB_I]]
+//
int8x8_t test_vmls_s8(int8x8_t v1, int8x8_t v2, int8x8_t v3) {
return vmls_s8(v1, v2, v3);
}
-// CHECK-LABEL: @test_vmls_s16(
-// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %v2, %v3
-// CHECK: [[SUB_I:%.*]] = sub <4 x i16> %v1, [[MUL_I]]
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SUB_I]] to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vmls_s16(
+// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]], <4 x i16> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i16> [[V2]], [[V3]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i16> [[V1]], [[MUL_I]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SUB_I]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
int8x8_t test_vmls_s16(int16x4_t v1, int16x4_t v2, int16x4_t v3) {
return (int8x8_t)vmls_s16(v1, v2, v3);
}
-// CHECK-LABEL: @test_vmls_s32(
-// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %v2, %v3
-// CHECK: [[SUB_I:%.*]] = sub <2 x i32> %v1, [[MUL_I]]
-// CHECK: ret <2 x i32> [[SUB_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vmls_s32(
+// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]], <2 x i32> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <2 x i32> [[V2]], [[V3]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i32> [[V1]], [[MUL_I]]
+// CHECK-NEXT: ret <2 x i32> [[SUB_I]]
+//
int32x2_t test_vmls_s32(int32x2_t v1, int32x2_t v2, int32x2_t v3) {
return vmls_s32(v1, v2, v3);
}
-// CHECK-LABEL: @test_vmls_f32(
-// CHECK: [[MUL_I:%.*]] = fmul <2 x float> %v2, %v3
-// CHECK: [[SUB_I:%.*]] = fsub <2 x float> %v1, [[MUL_I]]
-// CHECK: ret <2 x float> [[SUB_I]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vmls_f32(
+// CHECK-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]], <2 x float> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = fmul <2 x float> [[V2]], [[V3]]
+// CHECK-NEXT: [[SUB_I:%.*]] = fsub <2 x float> [[V1]], [[MUL_I]]
+// CHECK-NEXT: ret <2 x float> [[SUB_I]]
+//
float32x2_t test_vmls_f32(float32x2_t v1, float32x2_t v2, float32x2_t v3) {
return vmls_f32(v1, v2, v3);
}
-// CHECK-LABEL: @test_vmls_u8(
-// CHECK: [[MUL_I:%.*]] = mul <8 x i8> %v2, %v3
-// CHECK: [[SUB_I:%.*]] = sub <8 x i8> %v1, [[MUL_I]]
-// CHECK: ret <8 x i8> [[SUB_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vmls_u8(
+// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]], <8 x i8> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i8> [[V2]], [[V3]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i8> [[V1]], [[MUL_I]]
+// CHECK-NEXT: ret <8 x i8> [[SUB_I]]
+//
uint8x8_t test_vmls_u8(uint8x8_t v1, uint8x8_t v2, uint8x8_t v3) {
return vmls_u8(v1, v2, v3);
}
-// CHECK-LABEL: @test_vmls_u16(
-// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %v2, %v3
-// CHECK: [[SUB_I:%.*]] = sub <4 x i16> %v1, [[MUL_I]]
-// CHECK: ret <4 x i16> [[SUB_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vmls_u16(
+// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]], <4 x i16> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i16> [[V2]], [[V3]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i16> [[V1]], [[MUL_I]]
+// CHECK-NEXT: ret <4 x i16> [[SUB_I]]
+//
uint16x4_t test_vmls_u16(uint16x4_t v1, uint16x4_t v2, uint16x4_t v3) {
return vmls_u16(v1, v2, v3);
}
-// CHECK-LABEL: @test_vmls_u32(
-// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %v2, %v3
-// CHECK: [[SUB_I:%.*]] = sub <2 x i32> %v1, [[MUL_I]]
-// CHECK: ret <2 x i32> [[SUB_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vmls_u32(
+// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]], <2 x i32> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <2 x i32> [[V2]], [[V3]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i32> [[V1]], [[MUL_I]]
+// CHECK-NEXT: ret <2 x i32> [[SUB_I]]
+//
uint32x2_t test_vmls_u32(uint32x2_t v1, uint32x2_t v2, uint32x2_t v3) {
return vmls_u32(v1, v2, v3);
}
-// CHECK-LABEL: @test_vmlsq_s8(
-// CHECK: [[MUL_I:%.*]] = mul <16 x i8> %v2, %v3
-// CHECK: [[SUB_I:%.*]] = sub <16 x i8> %v1, [[MUL_I]]
-// CHECK: ret <16 x i8> [[SUB_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vmlsq_s8(
+// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]], <16 x i8> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <16 x i8> [[V2]], [[V3]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <16 x i8> [[V1]], [[MUL_I]]
+// CHECK-NEXT: ret <16 x i8> [[SUB_I]]
+//
int8x16_t test_vmlsq_s8(int8x16_t v1, int8x16_t v2, int8x16_t v3) {
return vmlsq_s8(v1, v2, v3);
}
-// CHECK-LABEL: @test_vmlsq_s16(
-// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %v2, %v3
-// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %v1, [[MUL_I]]
-// CHECK: ret <8 x i16> [[SUB_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vmlsq_s16(
+// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]], <8 x i16> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i16> [[V2]], [[V3]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> [[V1]], [[MUL_I]]
+// CHECK-NEXT: ret <8 x i16> [[SUB_I]]
+//
int16x8_t test_vmlsq_s16(int16x8_t v1, int16x8_t v2, int16x8_t v3) {
return vmlsq_s16(v1, v2, v3);
}
-// CHECK-LABEL: @test_vmlsq_s32(
-// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %v2, %v3
-// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %v1, [[MUL_I]]
-// CHECK: ret <4 x i32> [[SUB_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vmlsq_s32(
+// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]], <4 x i32> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i32> [[V2]], [[V3]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[V1]], [[MUL_I]]
+// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
+//
int32x4_t test_vmlsq_s32(int32x4_t v1, int32x4_t v2, int32x4_t v3) {
return vmlsq_s32(v1, v2, v3);
}
-// CHECK-LABEL: @test_vmlsq_f32(
-// CHECK: [[MUL_I:%.*]] = fmul <4 x float> %v2, %v3
-// CHECK: [[SUB_I:%.*]] = fsub <4 x float> %v1, [[MUL_I]]
-// CHECK: ret <4 x float> [[SUB_I]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vmlsq_f32(
+// CHECK-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]], <4 x float> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = fmul <4 x float> [[V2]], [[V3]]
+// CHECK-NEXT: [[SUB_I:%.*]] = fsub <4 x float> [[V1]], [[MUL_I]]
+// CHECK-NEXT: ret <4 x float> [[SUB_I]]
+//
float32x4_t test_vmlsq_f32(float32x4_t v1, float32x4_t v2, float32x4_t v3) {
return vmlsq_f32(v1, v2, v3);
}
-// CHECK-LABEL: @test_vmlsq_u8(
-// CHECK: [[MUL_I:%.*]] = mul <16 x i8> %v2, %v3
-// CHECK: [[SUB_I:%.*]] = sub <16 x i8> %v1, [[MUL_I]]
-// CHECK: ret <16 x i8> [[SUB_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vmlsq_u8(
+// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]], <16 x i8> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <16 x i8> [[V2]], [[V3]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <16 x i8> [[V1]], [[MUL_I]]
+// CHECK-NEXT: ret <16 x i8> [[SUB_I]]
+//
uint8x16_t test_vmlsq_u8(uint8x16_t v1, uint8x16_t v2, uint8x16_t v3) {
return vmlsq_u8(v1, v2, v3);
}
-// CHECK-LABEL: @test_vmlsq_u16(
-// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %v2, %v3
-// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %v1, [[MUL_I]]
-// CHECK: ret <8 x i16> [[SUB_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vmlsq_u16(
+// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]], <8 x i16> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i16> [[V2]], [[V3]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> [[V1]], [[MUL_I]]
+// CHECK-NEXT: ret <8 x i16> [[SUB_I]]
+//
uint16x8_t test_vmlsq_u16(uint16x8_t v1, uint16x8_t v2, uint16x8_t v3) {
return vmlsq_u16(v1, v2, v3);
}
-// CHECK-LABEL: @test_vmlsq_u32(
-// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %v2, %v3
-// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %v1, [[MUL_I]]
-// CHECK: ret <4 x i32> [[SUB_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vmlsq_u32(
+// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]], <4 x i32> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i32> [[V2]], [[V3]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[V1]], [[MUL_I]]
+// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
+//
uint32x4_t test_vmlsq_u32(uint32x4_t v1, uint32x4_t v2, uint32x4_t v3) {
return vmlsq_u32(v1, v2, v3);
}
-// CHECK-LABEL: @test_vmlsq_f64(
-// CHECK: [[MUL_I:%.*]] = fmul <2 x double> %v2, %v3
-// CHECK: [[SUB_I:%.*]] = fsub <2 x double> %v1, [[MUL_I]]
-// CHECK: ret <2 x double> [[SUB_I]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vmlsq_f64(
+// CHECK-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]], <2 x double> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = fmul <2 x double> [[V2]], [[V3]]
+// CHECK-NEXT: [[SUB_I:%.*]] = fsub <2 x double> [[V1]], [[MUL_I]]
+// CHECK-NEXT: ret <2 x double> [[SUB_I]]
+//
float64x2_t test_vmlsq_f64(float64x2_t v1, float64x2_t v2, float64x2_t v3) {
return vmlsq_f64(v1, v2, v3);
}
-// CHECK-LABEL: @test_vfma_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %v1 to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %v2 to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %v3 to <8 x i8>
-// CHECK: [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> %v2, <2 x float> %v3, <2 x float> %v1)
-// CHECK: ret <2 x float> [[TMP3]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vfma_f32(
+// CHECK-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]], <2 x float> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[V2]], <2 x float> [[V3]], <2 x float> [[V1]])
+// CHECK-NEXT: ret <2 x float> [[TMP0]]
+//
float32x2_t test_vfma_f32(float32x2_t v1, float32x2_t v2, float32x2_t v3) {
return vfma_f32(v1, v2, v3);
}
-// CHECK-LABEL: @test_vfmaq_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %v1 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %v2 to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %v3 to <16 x i8>
-// CHECK: [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> %v2, <4 x float> %v3, <4 x float> %v1)
-// CHECK: ret <4 x float> [[TMP3]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vfmaq_f32(
+// CHECK-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]], <4 x float> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[V2]], <4 x float> [[V3]], <4 x float> [[V1]])
+// CHECK-NEXT: ret <4 x float> [[TMP0]]
+//
float32x4_t test_vfmaq_f32(float32x4_t v1, float32x4_t v2, float32x4_t v3) {
return vfmaq_f32(v1, v2, v3);
}
-// CHECK-LABEL: @test_vfmaq_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %v1 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %v2 to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x double> %v3 to <16 x i8>
-// CHECK: [[TMP3:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %v2, <2 x double> %v3, <2 x double> %v1)
-// CHECK: ret <2 x double> [[TMP3]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vfmaq_f64(
+// CHECK-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]], <2 x double> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[V2]], <2 x double> [[V3]], <2 x double> [[V1]])
+// CHECK-NEXT: ret <2 x double> [[TMP0]]
+//
float64x2_t test_vfmaq_f64(float64x2_t v1, float64x2_t v2, float64x2_t v3) {
return vfmaq_f64(v1, v2, v3);
}
-// CHECK-LABEL: @test_vfms_f32(
-// CHECK: [[SUB_I:%.*]] = fneg <2 x float> %v2
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %v1 to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x float> [[SUB_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %v3 to <8 x i8>
-// CHECK: [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[SUB_I]], <2 x float> %v3, <2 x float> %v1)
-// CHECK: ret <2 x float> [[TMP3]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vfms_f32(
+// CHECK-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]], <2 x float> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[FNEG_I:%.*]] = fneg <2 x float> [[V2]]
+// CHECK-NEXT: [[TMP0:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FNEG_I]], <2 x float> [[V3]], <2 x float> [[V1]])
+// CHECK-NEXT: ret <2 x float> [[TMP0]]
+//
float32x2_t test_vfms_f32(float32x2_t v1, float32x2_t v2, float32x2_t v3) {
return vfms_f32(v1, v2, v3);
}
-// CHECK-LABEL: @test_vfmsq_f32(
-// CHECK: [[SUB_I:%.*]] = fneg <4 x float> %v2
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %v1 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x float> [[SUB_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %v3 to <16 x i8>
-// CHECK: [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[SUB_I]], <4 x float> %v3, <4 x float> %v1)
-// CHECK: ret <4 x float> [[TMP3]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vfmsq_f32(
+// CHECK-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]], <4 x float> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[FNEG_I:%.*]] = fneg <4 x float> [[V2]]
+// CHECK-NEXT: [[TMP0:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FNEG_I]], <4 x float> [[V3]], <4 x float> [[V1]])
+// CHECK-NEXT: ret <4 x float> [[TMP0]]
+//
float32x4_t test_vfmsq_f32(float32x4_t v1, float32x4_t v2, float32x4_t v3) {
return vfmsq_f32(v1, v2, v3);
}
-// CHECK-LABEL: @test_vfmsq_f64(
-// CHECK: [[SUB_I:%.*]] = fneg <2 x double> %v2
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %v1 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x double> [[SUB_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x double> %v3 to <16 x i8>
-// CHECK: [[TMP3:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[SUB_I]], <2 x double> %v3, <2 x double> %v1)
-// CHECK: ret <2 x double> [[TMP3]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vfmsq_f64(
+// CHECK-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]], <2 x double> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[FNEG_I:%.*]] = fneg <2 x double> [[V2]]
+// CHECK-NEXT: [[TMP0:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[FNEG_I]], <2 x double> [[V3]], <2 x double> [[V1]])
+// CHECK-NEXT: ret <2 x double> [[TMP0]]
+//
float64x2_t test_vfmsq_f64(float64x2_t v1, float64x2_t v2, float64x2_t v3) {
return vfmsq_f64(v1, v2, v3);
}
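The vfma/vfms checks above all funnel into the llvm.fma intrinsic: vfma(a, b, c) becomes fma(b, c, a), and vfms first negates the multiplicand, which is why the regenerated checks capture an explicit fneg feeding the fma call. A minimal one-lane model in C (helper names are ours, for illustration only):

#include <math.h>

// vfma_f32, one lane: a + b*c with a single rounding, i.e. fma(b, c, a).
static inline float fma_lane(float a, float b, float c) {
  return fmaf(b, c, a);
}

// vfms_f32, one lane: a - b*c, implemented as fma(-b, c, a); this mirrors
// the FNEG_I + llvm.fma.* pair in the CHECK lines above.
static inline float fms_lane(float a, float b, float c) {
  return fmaf(-b, c, a);
}
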
-// CHECK-LABEL: @test_vdivq_f64(
-// CHECK: [[DIV_I:%.*]] = fdiv <2 x double> %v1, %v2
-// CHECK: ret <2 x double> [[DIV_I]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vdivq_f64(
+// CHECK-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[DIV_I:%.*]] = fdiv <2 x double> [[V1]], [[V2]]
+// CHECK-NEXT: ret <2 x double> [[DIV_I]]
+//
float64x2_t test_vdivq_f64(float64x2_t v1, float64x2_t v2) {
return vdivq_f64(v1, v2);
}
-// CHECK-LABEL: @test_vdivq_f32(
-// CHECK: [[DIV_I:%.*]] = fdiv <4 x float> %v1, %v2
-// CHECK: ret <4 x float> [[DIV_I]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vdivq_f32(
+// CHECK-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[DIV_I:%.*]] = fdiv <4 x float> [[V1]], [[V2]]
+// CHECK-NEXT: ret <4 x float> [[DIV_I]]
+//
float32x4_t test_vdivq_f32(float32x4_t v1, float32x4_t v2) {
return vdivq_f32(v1, v2);
}
-// CHECK-LABEL: @test_vdiv_f32(
-// CHECK: [[DIV_I:%.*]] = fdiv <2 x float> %v1, %v2
-// CHECK: ret <2 x float> [[DIV_I]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vdiv_f32(
+// CHECK-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[DIV_I:%.*]] = fdiv <2 x float> [[V1]], [[V2]]
+// CHECK-NEXT: ret <2 x float> [[DIV_I]]
+//
float32x2_t test_vdiv_f32(float32x2_t v1, float32x2_t v2) {
return vdiv_f32(v1, v2);
}
-// CHECK-LABEL: @test_vaba_s8(
-// CHECK: [[VABD_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %v2, <8 x i8> %v3)
-// CHECK: [[ADD_I:%.*]] = add <8 x i8> %v1, [[VABD_I_I]]
-// CHECK: ret <8 x i8> [[ADD_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vaba_s8(
+// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]], <8 x i8> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> [[V2]], <8 x i8> [[V3]])
+// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i8> [[V1]], [[VABD_I]]
+// CHECK-NEXT: ret <8 x i8> [[ADD_I]]
+//
int8x8_t test_vaba_s8(int8x8_t v1, int8x8_t v2, int8x8_t v3) {
return vaba_s8(v1, v2, v3);
}
-// CHECK-LABEL: @test_vaba_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v3 to <8 x i8>
-// CHECK: [[VABD2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %v2, <4 x i16> %v3)
-// CHECK: [[ADD_I:%.*]] = add <4 x i16> %v1, [[VABD2_I_I]]
-// CHECK: ret <4 x i16> [[ADD_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vaba_s16(
+// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]], <4 x i16> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> [[V2]], <4 x i16> [[V3]])
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i16> [[V1]], [[VABD2_I]]
+// CHECK-NEXT: ret <4 x i16> [[ADD_I]]
+//
int16x4_t test_vaba_s16(int16x4_t v1, int16x4_t v2, int16x4_t v3) {
return vaba_s16(v1, v2, v3);
}
-// CHECK-LABEL: @test_vaba_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %v2 to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %v3 to <8 x i8>
-// CHECK: [[VABD2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %v2, <2 x i32> %v3)
-// CHECK: [[ADD_I:%.*]] = add <2 x i32> %v1, [[VABD2_I_I]]
-// CHECK: ret <2 x i32> [[ADD_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vaba_s32(
+// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]], <2 x i32> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> [[V2]], <2 x i32> [[V3]])
+// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i32> [[V1]], [[VABD2_I]]
+// CHECK-NEXT: ret <2 x i32> [[ADD_I]]
+//
int32x2_t test_vaba_s32(int32x2_t v1, int32x2_t v2, int32x2_t v3) {
return vaba_s32(v1, v2, v3);
}
-// CHECK-LABEL: @test_vaba_u8(
-// CHECK: [[VABD_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %v2, <8 x i8> %v3)
-// CHECK: [[ADD_I:%.*]] = add <8 x i8> %v1, [[VABD_I_I]]
-// CHECK: ret <8 x i8> [[ADD_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vaba_u8(
+// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]], <8 x i8> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> [[V2]], <8 x i8> [[V3]])
+// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i8> [[V1]], [[VABD_I]]
+// CHECK-NEXT: ret <8 x i8> [[ADD_I]]
+//
uint8x8_t test_vaba_u8(uint8x8_t v1, uint8x8_t v2, uint8x8_t v3) {
return vaba_u8(v1, v2, v3);
}
-// CHECK-LABEL: @test_vaba_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v3 to <8 x i8>
-// CHECK: [[VABD2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %v2, <4 x i16> %v3)
-// CHECK: [[ADD_I:%.*]] = add <4 x i16> %v1, [[VABD2_I_I]]
-// CHECK: ret <4 x i16> [[ADD_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vaba_u16(
+// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]], <4 x i16> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> [[V2]], <4 x i16> [[V3]])
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i16> [[V1]], [[VABD2_I]]
+// CHECK-NEXT: ret <4 x i16> [[ADD_I]]
+//
uint16x4_t test_vaba_u16(uint16x4_t v1, uint16x4_t v2, uint16x4_t v3) {
return vaba_u16(v1, v2, v3);
}
-// CHECK-LABEL: @test_vaba_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %v2 to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %v3 to <8 x i8>
-// CHECK: [[VABD2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %v2, <2 x i32> %v3)
-// CHECK: [[ADD_I:%.*]] = add <2 x i32> %v1, [[VABD2_I_I]]
-// CHECK: ret <2 x i32> [[ADD_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vaba_u32(
+// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]], <2 x i32> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> [[V2]], <2 x i32> [[V3]])
+// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i32> [[V1]], [[VABD2_I]]
+// CHECK-NEXT: ret <2 x i32> [[ADD_I]]
+//
uint32x2_t test_vaba_u32(uint32x2_t v1, uint32x2_t v2, uint32x2_t v3) {
return vaba_u32(v1, v2, v3);
}
-// CHECK-LABEL: @test_vabaq_s8(
-// CHECK: [[VABD_I_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> %v2, <16 x i8> %v3)
-// CHECK: [[ADD_I:%.*]] = add <16 x i8> %v1, [[VABD_I_I]]
-// CHECK: ret <16 x i8> [[ADD_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vabaq_s8(
+// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]], <16 x i8> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> [[V2]], <16 x i8> [[V3]])
+// CHECK-NEXT: [[ADD_I:%.*]] = add <16 x i8> [[V1]], [[VABD_I]]
+// CHECK-NEXT: ret <16 x i8> [[ADD_I]]
+//
int8x16_t test_vabaq_s8(int8x16_t v1, int8x16_t v2, int8x16_t v3) {
return vabaq_s8(v1, v2, v3);
}
-// CHECK-LABEL: @test_vabaq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v3 to <16 x i8>
-// CHECK: [[VABD2_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> %v2, <8 x i16> %v3)
-// CHECK: [[ADD_I:%.*]] = add <8 x i16> %v1, [[VABD2_I_I]]
-// CHECK: ret <8 x i16> [[ADD_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vabaq_s16(
+// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]], <8 x i16> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> [[V2]], <8 x i16> [[V3]])
+// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[V1]], [[VABD2_I]]
+// CHECK-NEXT: ret <8 x i16> [[ADD_I]]
+//
int16x8_t test_vabaq_s16(int16x8_t v1, int16x8_t v2, int16x8_t v3) {
return vabaq_s16(v1, v2, v3);
}
-// CHECK-LABEL: @test_vabaq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %v2 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %v3 to <16 x i8>
-// CHECK: [[VABD2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> %v2, <4 x i32> %v3)
-// CHECK: [[ADD_I:%.*]] = add <4 x i32> %v1, [[VABD2_I_I]]
-// CHECK: ret <4 x i32> [[ADD_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vabaq_s32(
+// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]], <4 x i32> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> [[V2]], <4 x i32> [[V3]])
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[V1]], [[VABD2_I]]
+// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
+//
int32x4_t test_vabaq_s32(int32x4_t v1, int32x4_t v2, int32x4_t v3) {
return vabaq_s32(v1, v2, v3);
}
-// CHECK-LABEL: @test_vabaq_u8(
-// CHECK: [[VABD_I_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> %v2, <16 x i8> %v3)
-// CHECK: [[ADD_I:%.*]] = add <16 x i8> %v1, [[VABD_I_I]]
-// CHECK: ret <16 x i8> [[ADD_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vabaq_u8(
+// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]], <16 x i8> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> [[V2]], <16 x i8> [[V3]])
+// CHECK-NEXT: [[ADD_I:%.*]] = add <16 x i8> [[V1]], [[VABD_I]]
+// CHECK-NEXT: ret <16 x i8> [[ADD_I]]
+//
uint8x16_t test_vabaq_u8(uint8x16_t v1, uint8x16_t v2, uint8x16_t v3) {
return vabaq_u8(v1, v2, v3);
}
-// CHECK-LABEL: @test_vabaq_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v3 to <16 x i8>
-// CHECK: [[VABD2_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> %v2, <8 x i16> %v3)
-// CHECK: [[ADD_I:%.*]] = add <8 x i16> %v1, [[VABD2_I_I]]
-// CHECK: ret <8 x i16> [[ADD_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vabaq_u16(
+// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]], <8 x i16> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> [[V2]], <8 x i16> [[V3]])
+// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[V1]], [[VABD2_I]]
+// CHECK-NEXT: ret <8 x i16> [[ADD_I]]
+//
uint16x8_t test_vabaq_u16(uint16x8_t v1, uint16x8_t v2, uint16x8_t v3) {
return vabaq_u16(v1, v2, v3);
}
-// CHECK-LABEL: @test_vabaq_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %v2 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %v3 to <16 x i8>
-// CHECK: [[VABD2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> %v2, <4 x i32> %v3)
-// CHECK: [[ADD_I:%.*]] = add <4 x i32> %v1, [[VABD2_I_I]]
-// CHECK: ret <4 x i32> [[ADD_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vabaq_u32(
+// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]], <4 x i32> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> [[V2]], <4 x i32> [[V3]])
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[V1]], [[VABD2_I]]
+// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
+//
uint32x4_t test_vabaq_u32(uint32x4_t v1, uint32x4_t v2, uint32x4_t v3) {
return vabaq_u32(v1, v2, v3);
}
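Every vaba/vabaq variant above lowers to the same two-instruction shape: a signed or unsigned absolute-difference intrinsic (sabd/uabd) followed by a plain vector add. A one-lane sketch of the signed i8 case (the helper is hypothetical; the difference is widened so the subtraction cannot overflow, and the final add wraps modulo 2^8 like the vector add does):

#include <stdint.h>

// One lane of vaba_s8: acc + |x - y|.
static inline int8_t aba_s8_lane(int8_t acc, int8_t x, int8_t y) {
  int32_t d = (int32_t)x - (int32_t)y;      // widen: no signed overflow
  return (int8_t)(acc + (d < 0 ? -d : d));  // wrapping add, as in the IR
}
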
-// CHECK-LABEL: @test_vabd_s8(
-// CHECK: [[VABD_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %v1, <8 x i8> %v2)
-// CHECK: ret <8 x i8> [[VABD_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vabd_s8(
+// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> [[V1]], <8 x i8> [[V2]])
+// CHECK-NEXT: ret <8 x i8> [[VABD_I]]
+//
int8x8_t test_vabd_s8(int8x8_t v1, int8x8_t v2) {
return vabd_s8(v1, v2);
}
-// CHECK-LABEL: @test_vabd_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
-// CHECK: [[VABD2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %v1, <4 x i16> %v2)
-// CHECK: ret <4 x i16> [[VABD2_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vabd_s16(
+// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> [[V1]], <4 x i16> [[V2]])
+// CHECK-NEXT: ret <4 x i16> [[VABD2_I]]
+//
int16x4_t test_vabd_s16(int16x4_t v1, int16x4_t v2) {
return vabd_s16(v1, v2);
}
-// CHECK-LABEL: @test_vabd_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8>
-// CHECK: [[VABD2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %v1, <2 x i32> %v2)
-// CHECK: ret <2 x i32> [[VABD2_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vabd_s32(
+// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> [[V1]], <2 x i32> [[V2]])
+// CHECK-NEXT: ret <2 x i32> [[VABD2_I]]
+//
int32x2_t test_vabd_s32(int32x2_t v1, int32x2_t v2) {
return vabd_s32(v1, v2);
}
-// CHECK-LABEL: @test_vabd_u8(
-// CHECK: [[VABD_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %v1, <8 x i8> %v2)
-// CHECK: ret <8 x i8> [[VABD_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vabd_u8(
+// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> [[V1]], <8 x i8> [[V2]])
+// CHECK-NEXT: ret <8 x i8> [[VABD_I]]
+//
uint8x8_t test_vabd_u8(uint8x8_t v1, uint8x8_t v2) {
return vabd_u8(v1, v2);
}
-// CHECK-LABEL: @test_vabd_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
-// CHECK: [[VABD2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %v1, <4 x i16> %v2)
-// CHECK: ret <4 x i16> [[VABD2_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vabd_u16(
+// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> [[V1]], <4 x i16> [[V2]])
+// CHECK-NEXT: ret <4 x i16> [[VABD2_I]]
+//
uint16x4_t test_vabd_u16(uint16x4_t v1, uint16x4_t v2) {
return vabd_u16(v1, v2);
}
-// CHECK-LABEL: @test_vabd_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8>
-// CHECK: [[VABD2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %v1, <2 x i32> %v2)
-// CHECK: ret <2 x i32> [[VABD2_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vabd_u32(
+// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> [[V1]], <2 x i32> [[V2]])
+// CHECK-NEXT: ret <2 x i32> [[VABD2_I]]
+//
uint32x2_t test_vabd_u32(uint32x2_t v1, uint32x2_t v2) {
return vabd_u32(v1, v2);
}
-// CHECK-LABEL: @test_vabd_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %v1 to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %v2 to <8 x i8>
-// CHECK: [[VABD2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fabd.v2f32(<2 x float> %v1, <2 x float> %v2)
-// CHECK: ret <2 x float> [[VABD2_I]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vabd_f32(
+// CHECK-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fabd.v2f32(<2 x float> [[V1]], <2 x float> [[V2]])
+// CHECK-NEXT: ret <2 x float> [[VABD2_I]]
+//
float32x2_t test_vabd_f32(float32x2_t v1, float32x2_t v2) {
return vabd_f32(v1, v2);
}
-// CHECK-LABEL: @test_vabdq_s8(
-// CHECK: [[VABD_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> %v1, <16 x i8> %v2)
-// CHECK: ret <16 x i8> [[VABD_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vabdq_s8(
+// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> [[V1]], <16 x i8> [[V2]])
+// CHECK-NEXT: ret <16 x i8> [[VABD_I]]
+//
int8x16_t test_vabdq_s8(int8x16_t v1, int8x16_t v2) {
return vabdq_s8(v1, v2);
}
-// CHECK-LABEL: @test_vabdq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
-// CHECK: [[VABD2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> %v1, <8 x i16> %v2)
-// CHECK: ret <8 x i16> [[VABD2_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vabdq_s16(
+// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> [[V1]], <8 x i16> [[V2]])
+// CHECK-NEXT: ret <8 x i16> [[VABD2_I]]
+//
int16x8_t test_vabdq_s16(int16x8_t v1, int16x8_t v2) {
return vabdq_s16(v1, v2);
}
-// CHECK-LABEL: @test_vabdq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8>
-// CHECK: [[VABD2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> %v1, <4 x i32> %v2)
-// CHECK: ret <4 x i32> [[VABD2_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vabdq_s32(
+// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> [[V1]], <4 x i32> [[V2]])
+// CHECK-NEXT: ret <4 x i32> [[VABD2_I]]
+//
int32x4_t test_vabdq_s32(int32x4_t v1, int32x4_t v2) {
return vabdq_s32(v1, v2);
}
-// CHECK-LABEL: @test_vabdq_u8(
-// CHECK: [[VABD_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> %v1, <16 x i8> %v2)
-// CHECK: ret <16 x i8> [[VABD_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vabdq_u8(
+// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> [[V1]], <16 x i8> [[V2]])
+// CHECK-NEXT: ret <16 x i8> [[VABD_I]]
+//
uint8x16_t test_vabdq_u8(uint8x16_t v1, uint8x16_t v2) {
return vabdq_u8(v1, v2);
}
-// CHECK-LABEL: @test_vabdq_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
-// CHECK: [[VABD2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> %v1, <8 x i16> %v2)
-// CHECK: ret <8 x i16> [[VABD2_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vabdq_u16(
+// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> [[V1]], <8 x i16> [[V2]])
+// CHECK-NEXT: ret <8 x i16> [[VABD2_I]]
+//
uint16x8_t test_vabdq_u16(uint16x8_t v1, uint16x8_t v2) {
return vabdq_u16(v1, v2);
}
-// CHECK-LABEL: @test_vabdq_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8>
-// CHECK: [[VABD2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> %v1, <4 x i32> %v2)
-// CHECK: ret <4 x i32> [[VABD2_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vabdq_u32(
+// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> [[V1]], <4 x i32> [[V2]])
+// CHECK-NEXT: ret <4 x i32> [[VABD2_I]]
+//
uint32x4_t test_vabdq_u32(uint32x4_t v1, uint32x4_t v2) {
return vabdq_u32(v1, v2);
}
-// CHECK-LABEL: @test_vabdq_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %v1 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %v2 to <16 x i8>
-// CHECK: [[VABD2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fabd.v4f32(<4 x float> %v1, <4 x float> %v2)
-// CHECK: ret <4 x float> [[VABD2_I]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vabdq_f32(
+// CHECK-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fabd.v4f32(<4 x float> [[V1]], <4 x float> [[V2]])
+// CHECK-NEXT: ret <4 x float> [[VABD2_I]]
+//
float32x4_t test_vabdq_f32(float32x4_t v1, float32x4_t v2) {
return vabdq_f32(v1, v2);
}
-// CHECK-LABEL: @test_vabdq_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %v1 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %v2 to <16 x i8>
-// CHECK: [[VABD2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fabd.v2f64(<2 x double> %v1, <2 x double> %v2)
-// CHECK: ret <2 x double> [[VABD2_I]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vabdq_f64(
+// CHECK-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fabd.v2f64(<2 x double> [[V1]], <2 x double> [[V2]])
+// CHECK-NEXT: ret <2 x double> [[VABD2_I]]
+//
float64x2_t test_vabdq_f64(float64x2_t v1, float64x2_t v2) {
return vabdq_f64(v1, v2);
}
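For the float variants, vabd lowers to the dedicated fabd intrinsic rather than an fsub/fabs pair, but per lane it is still just the absolute difference. A plain-C sketch (it ignores the fused single-rounding detail of the hardware instruction):

#include <math.h>

// One lane of vabd_f32: |a - b|.
static inline float fabd_lane(float a, float b) {
  return fabsf(a - b);
}
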
-// CHECK-LABEL: @test_vbsl_s8(
-// CHECK: [[VBSL_I:%.*]] = and <8 x i8> %v1, %v2
-// CHECK: [[TMP0:%.*]] = xor <8 x i8> %v1, splat (i8 -1)
-// CHECK: [[VBSL1_I:%.*]] = and <8 x i8> [[TMP0]], %v3
-// CHECK: [[VBSL2_I:%.*]] = or <8 x i8> [[VBSL_I]], [[VBSL1_I]]
-// CHECK: ret <8 x i8> [[VBSL2_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vbsl_s8(
+// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]], <8 x i8> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VBSL_I:%.*]] = and <8 x i8> [[V1]], [[V2]]
+// CHECK-NEXT: [[TMP0:%.*]] = xor <8 x i8> [[V1]], splat (i8 -1)
+// CHECK-NEXT: [[VBSL1_I:%.*]] = and <8 x i8> [[V3]], [[TMP0]]
+// CHECK-NEXT: [[VBSL2_I:%.*]] = or disjoint <8 x i8> [[VBSL_I]], [[VBSL1_I]]
+// CHECK-NEXT: ret <8 x i8> [[VBSL2_I]]
+//
int8x8_t test_vbsl_s8(uint8x8_t v1, int8x8_t v2, int8x8_t v3) {
return vbsl_s8(v1, v2, v3);
}
-// CHECK-LABEL: @test_vbsl_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %v3 to <8 x i8>
-// CHECK: [[VBSL3_I:%.*]] = and <4 x i16> %v1, %v2
-// CHECK: [[TMP3:%.*]] = xor <4 x i16> %v1, splat (i16 -1)
-// CHECK: [[VBSL4_I:%.*]] = and <4 x i16> [[TMP3]], %v3
-// CHECK: [[VBSL5_I:%.*]] = or <4 x i16> [[VBSL3_I]], [[VBSL4_I]]
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[VBSL5_I]] to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP4]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vbsl_s16(
+// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]], <4 x i16> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VBSL3_I:%.*]] = and <4 x i16> [[V1]], [[V2]]
+// CHECK-NEXT: [[TMP0:%.*]] = xor <4 x i16> [[V1]], splat (i16 -1)
+// CHECK-NEXT: [[VBSL4_I:%.*]] = and <4 x i16> [[V3]], [[TMP0]]
+// CHECK-NEXT: [[VBSL5_I:%.*]] = or disjoint <4 x i16> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VBSL5_I]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP1]]
+//
int8x8_t test_vbsl_s16(uint16x4_t v1, int16x4_t v2, int16x4_t v3) {
return (int8x8_t)vbsl_s16(v1, v2, v3);
}
-// CHECK-LABEL: @test_vbsl_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %v3 to <8 x i8>
-// CHECK: [[VBSL3_I:%.*]] = and <2 x i32> %v1, %v2
-// CHECK: [[TMP3:%.*]] = xor <2 x i32> %v1, splat (i32 -1)
-// CHECK: [[VBSL4_I:%.*]] = and <2 x i32> [[TMP3]], %v3
-// CHECK: [[VBSL5_I:%.*]] = or <2 x i32> [[VBSL3_I]], [[VBSL4_I]]
-// CHECK: ret <2 x i32> [[VBSL5_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vbsl_s32(
+// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]], <2 x i32> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VBSL3_I:%.*]] = and <2 x i32> [[V1]], [[V2]]
+// CHECK-NEXT: [[TMP0:%.*]] = xor <2 x i32> [[V1]], splat (i32 -1)
+// CHECK-NEXT: [[VBSL4_I:%.*]] = and <2 x i32> [[V3]], [[TMP0]]
+// CHECK-NEXT: [[VBSL5_I:%.*]] = or disjoint <2 x i32> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK-NEXT: ret <2 x i32> [[VBSL5_I]]
+//
int32x2_t test_vbsl_s32(uint32x2_t v1, int32x2_t v2, int32x2_t v3) {
return vbsl_s32(v1, v2, v3);
}
-// CHECK-LABEL: @test_vbsl_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %v1 to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %v2 to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <1 x i64> %v3 to <8 x i8>
-// CHECK: [[VBSL3_I:%.*]] = and <1 x i64> %v1, %v2
-// CHECK: [[TMP3:%.*]] = xor <1 x i64> %v1, splat (i64 -1)
-// CHECK: [[VBSL4_I:%.*]] = and <1 x i64> [[TMP3]], %v3
-// CHECK: [[VBSL5_I:%.*]] = or <1 x i64> [[VBSL3_I]], [[VBSL4_I]]
-// CHECK: ret <1 x i64> [[VBSL5_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vbsl_s64(
+// CHECK-SAME: <1 x i64> noundef [[V1:%.*]], <1 x i64> noundef [[V2:%.*]], <1 x i64> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VBSL3_I:%.*]] = and <1 x i64> [[V1]], [[V2]]
+// CHECK-NEXT: [[TMP0:%.*]] = xor <1 x i64> [[V1]], splat (i64 -1)
+// CHECK-NEXT: [[VBSL4_I:%.*]] = and <1 x i64> [[V3]], [[TMP0]]
+// CHECK-NEXT: [[VBSL5_I:%.*]] = or disjoint <1 x i64> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK-NEXT: ret <1 x i64> [[VBSL5_I]]
+//
int64x1_t test_vbsl_s64(uint64x1_t v1, int64x1_t v2, int64x1_t v3) {
return vbsl_s64(v1, v2, v3);
}
-// CHECK-LABEL: @test_vbsl_u8(
-// CHECK: [[VBSL_I:%.*]] = and <8 x i8> %v1, %v2
-// CHECK: [[TMP0:%.*]] = xor <8 x i8> %v1, splat (i8 -1)
-// CHECK: [[VBSL1_I:%.*]] = and <8 x i8> [[TMP0]], %v3
-// CHECK: [[VBSL2_I:%.*]] = or <8 x i8> [[VBSL_I]], [[VBSL1_I]]
-// CHECK: ret <8 x i8> [[VBSL2_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vbsl_u8(
+// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]], <8 x i8> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VBSL_I:%.*]] = and <8 x i8> [[V1]], [[V2]]
+// CHECK-NEXT: [[TMP0:%.*]] = xor <8 x i8> [[V1]], splat (i8 -1)
+// CHECK-NEXT: [[VBSL1_I:%.*]] = and <8 x i8> [[V3]], [[TMP0]]
+// CHECK-NEXT: [[VBSL2_I:%.*]] = or disjoint <8 x i8> [[VBSL_I]], [[VBSL1_I]]
+// CHECK-NEXT: ret <8 x i8> [[VBSL2_I]]
+//
uint8x8_t test_vbsl_u8(uint8x8_t v1, uint8x8_t v2, uint8x8_t v3) {
return vbsl_u8(v1, v2, v3);
}
-// CHECK-LABEL: @test_vbsl_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %v3 to <8 x i8>
-// CHECK: [[VBSL3_I:%.*]] = and <4 x i16> %v1, %v2
-// CHECK: [[TMP3:%.*]] = xor <4 x i16> %v1, splat (i16 -1)
-// CHECK: [[VBSL4_I:%.*]] = and <4 x i16> [[TMP3]], %v3
-// CHECK: [[VBSL5_I:%.*]] = or <4 x i16> [[VBSL3_I]], [[VBSL4_I]]
-// CHECK: ret <4 x i16> [[VBSL5_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vbsl_u16(
+// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]], <4 x i16> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VBSL3_I:%.*]] = and <4 x i16> [[V1]], [[V2]]
+// CHECK-NEXT: [[TMP0:%.*]] = xor <4 x i16> [[V1]], splat (i16 -1)
+// CHECK-NEXT: [[VBSL4_I:%.*]] = and <4 x i16> [[V3]], [[TMP0]]
+// CHECK-NEXT: [[VBSL5_I:%.*]] = or disjoint <4 x i16> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK-NEXT: ret <4 x i16> [[VBSL5_I]]
+//
uint16x4_t test_vbsl_u16(uint16x4_t v1, uint16x4_t v2, uint16x4_t v3) {
return vbsl_u16(v1, v2, v3);
}
-// CHECK-LABEL: @test_vbsl_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %v3 to <8 x i8>
-// CHECK: [[VBSL3_I:%.*]] = and <2 x i32> %v1, %v2
-// CHECK: [[TMP3:%.*]] = xor <2 x i32> %v1, splat (i32 -1)
-// CHECK: [[VBSL4_I:%.*]] = and <2 x i32> [[TMP3]], %v3
-// CHECK: [[VBSL5_I:%.*]] = or <2 x i32> [[VBSL3_I]], [[VBSL4_I]]
-// CHECK: ret <2 x i32> [[VBSL5_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vbsl_u32(
+// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]], <2 x i32> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VBSL3_I:%.*]] = and <2 x i32> [[V1]], [[V2]]
+// CHECK-NEXT: [[TMP0:%.*]] = xor <2 x i32> [[V1]], splat (i32 -1)
+// CHECK-NEXT: [[VBSL4_I:%.*]] = and <2 x i32> [[V3]], [[TMP0]]
+// CHECK-NEXT: [[VBSL5_I:%.*]] = or disjoint <2 x i32> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK-NEXT: ret <2 x i32> [[VBSL5_I]]
+//
uint32x2_t test_vbsl_u32(uint32x2_t v1, uint32x2_t v2, uint32x2_t v3) {
return vbsl_u32(v1, v2, v3);
}
-// CHECK-LABEL: @test_vbsl_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %v1 to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %v2 to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <1 x i64> %v3 to <8 x i8>
-// CHECK: [[VBSL3_I:%.*]] = and <1 x i64> %v1, %v2
-// CHECK: [[TMP3:%.*]] = xor <1 x i64> %v1, splat (i64 -1)
-// CHECK: [[VBSL4_I:%.*]] = and <1 x i64> [[TMP3]], %v3
-// CHECK: [[VBSL5_I:%.*]] = or <1 x i64> [[VBSL3_I]], [[VBSL4_I]]
-// CHECK: ret <1 x i64> [[VBSL5_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vbsl_u64(
+// CHECK-SAME: <1 x i64> noundef [[V1:%.*]], <1 x i64> noundef [[V2:%.*]], <1 x i64> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VBSL3_I:%.*]] = and <1 x i64> [[V1]], [[V2]]
+// CHECK-NEXT: [[TMP0:%.*]] = xor <1 x i64> [[V1]], splat (i64 -1)
+// CHECK-NEXT: [[VBSL4_I:%.*]] = and <1 x i64> [[V3]], [[TMP0]]
+// CHECK-NEXT: [[VBSL5_I:%.*]] = or disjoint <1 x i64> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK-NEXT: ret <1 x i64> [[VBSL5_I]]
+//
uint64x1_t test_vbsl_u64(uint64x1_t v1, uint64x1_t v2, uint64x1_t v3) {
return vbsl_u64(v1, v2, v3);
}
-// CHECK-LABEL: @test_vbsl_f32(
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %v1 to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %v2 to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <2 x float> %v3 to <8 x i8>
-// CHECK: [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
-// CHECK: [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
-// CHECK: [[VBSL3_I:%.*]] = and <2 x i32> %v1, [[VBSL1_I]]
-// CHECK: [[TMP4:%.*]] = xor <2 x i32> %v1, splat (i32 -1)
-// CHECK: [[VBSL4_I:%.*]] = and <2 x i32> [[TMP4]], [[VBSL2_I]]
-// CHECK: [[VBSL5_I:%.*]] = or <2 x i32> [[VBSL3_I]], [[VBSL4_I]]
-// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[VBSL5_I]] to <2 x float>
-// CHECK: ret <2 x float> [[TMP5]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vbsl_f32(
+// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]], <2 x float> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VBSL1_I:%.*]] = bitcast <2 x float> [[V2]] to <2 x i32>
+// CHECK-NEXT: [[VBSL2_I:%.*]] = bitcast <2 x float> [[V3]] to <2 x i32>
+// CHECK-NEXT: [[VBSL3_I:%.*]] = and <2 x i32> [[V1]], [[VBSL1_I]]
+// CHECK-NEXT: [[TMP0:%.*]] = xor <2 x i32> [[V1]], splat (i32 -1)
+// CHECK-NEXT: [[VBSL4_I:%.*]] = and <2 x i32> [[TMP0]], [[VBSL2_I]]
+// CHECK-NEXT: [[VBSL5_I:%.*]] = or disjoint <2 x i32> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VBSL5_I]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[TMP1]]
+//
float32x2_t test_vbsl_f32(uint32x2_t v1, float32x2_t v2, float32x2_t v3) {
return vbsl_f32(v1, v2, v3);
}
-// CHECK-LABEL: @test_vbsl_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %v1 to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %v2 to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <1 x double> %v3 to <8 x i8>
-// CHECK: [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x i64>
-// CHECK: [[VBSL3_I:%.*]] = and <1 x i64> %v1, [[VBSL1_I]]
-// CHECK: [[TMP3:%.*]] = xor <1 x i64> %v1, splat (i64 -1)
-// CHECK: [[VBSL4_I:%.*]] = and <1 x i64> [[TMP3]], [[VBSL2_I]]
-// CHECK: [[VBSL5_I:%.*]] = or <1 x i64> [[VBSL3_I]], [[VBSL4_I]]
-// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[VBSL5_I]] to <1 x double>
-// CHECK: ret <1 x double> [[TMP4]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vbsl_f64(
+// CHECK-SAME: <1 x i64> noundef [[V1:%.*]], <1 x double> noundef [[V2:%.*]], <1 x double> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VBSL1_I:%.*]] = bitcast <1 x double> [[V2]] to <1 x i64>
+// CHECK-NEXT: [[VBSL2_I:%.*]] = bitcast <1 x double> [[V3]] to <1 x i64>
+// CHECK-NEXT: [[VBSL3_I:%.*]] = and <1 x i64> [[V1]], [[VBSL1_I]]
+// CHECK-NEXT: [[TMP0:%.*]] = xor <1 x i64> [[V1]], splat (i64 -1)
+// CHECK-NEXT: [[VBSL4_I:%.*]] = and <1 x i64> [[TMP0]], [[VBSL2_I]]
+// CHECK-NEXT: [[VBSL5_I:%.*]] = or disjoint <1 x i64> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[VBSL5_I]] to <1 x double>
+// CHECK-NEXT: ret <1 x double> [[TMP1]]
+//
float64x1_t test_vbsl_f64(uint64x1_t v1, float64x1_t v2, float64x1_t v3) {
return vbsl_f64(v1, v2, v3);
}
-// CHECK-LABEL: @test_vbsl_p8(
-// CHECK: [[VBSL_I:%.*]] = and <8 x i8> %v1, %v2
-// CHECK: [[TMP0:%.*]] = xor <8 x i8> %v1, splat (i8 -1)
-// CHECK: [[VBSL1_I:%.*]] = and <8 x i8> [[TMP0]], %v3
-// CHECK: [[VBSL2_I:%.*]] = or <8 x i8> [[VBSL_I]], [[VBSL1_I]]
-// CHECK: ret <8 x i8> [[VBSL2_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vbsl_p8(
+// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]], <8 x i8> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VBSL_I:%.*]] = and <8 x i8> [[V1]], [[V2]]
+// CHECK-NEXT: [[TMP0:%.*]] = xor <8 x i8> [[V1]], splat (i8 -1)
+// CHECK-NEXT: [[VBSL1_I:%.*]] = and <8 x i8> [[V3]], [[TMP0]]
+// CHECK-NEXT: [[VBSL2_I:%.*]] = or disjoint <8 x i8> [[VBSL_I]], [[VBSL1_I]]
+// CHECK-NEXT: ret <8 x i8> [[VBSL2_I]]
+//
poly8x8_t test_vbsl_p8(uint8x8_t v1, poly8x8_t v2, poly8x8_t v3) {
return vbsl_p8(v1, v2, v3);
}
-// CHECK-LABEL: @test_vbsl_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %v3 to <8 x i8>
-// CHECK: [[VBSL3_I:%.*]] = and <4 x i16> %v1, %v2
-// CHECK: [[TMP3:%.*]] = xor <4 x i16> %v1, splat (i16 -1)
-// CHECK: [[VBSL4_I:%.*]] = and <4 x i16> [[TMP3]], %v3
-// CHECK: [[VBSL5_I:%.*]] = or <4 x i16> [[VBSL3_I]], [[VBSL4_I]]
-// CHECK: ret <4 x i16> [[VBSL5_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vbsl_p16(
+// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]], <4 x i16> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VBSL3_I:%.*]] = and <4 x i16> [[V1]], [[V2]]
+// CHECK-NEXT: [[TMP0:%.*]] = xor <4 x i16> [[V1]], splat (i16 -1)
+// CHECK-NEXT: [[VBSL4_I:%.*]] = and <4 x i16> [[V3]], [[TMP0]]
+// CHECK-NEXT: [[VBSL5_I:%.*]] = or disjoint <4 x i16> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK-NEXT: ret <4 x i16> [[VBSL5_I]]
+//
poly16x4_t test_vbsl_p16(uint16x4_t v1, poly16x4_t v2, poly16x4_t v3) {
return vbsl_p16(v1, v2, v3);
}
-// CHECK-LABEL: @test_vbslq_s8(
-// CHECK: [[VBSL_I:%.*]] = and <16 x i8> %v1, %v2
-// CHECK: [[TMP0:%.*]] = xor <16 x i8> %v1, splat (i8 -1)
-// CHECK: [[VBSL1_I:%.*]] = and <16 x i8> [[TMP0]], %v3
-// CHECK: [[VBSL2_I:%.*]] = or <16 x i8> [[VBSL_I]], [[VBSL1_I]]
-// CHECK: ret <16 x i8> [[VBSL2_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vbslq_s8(
+// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]], <16 x i8> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VBSL_I:%.*]] = and <16 x i8> [[V1]], [[V2]]
+// CHECK-NEXT: [[TMP0:%.*]] = xor <16 x i8> [[V1]], splat (i8 -1)
+// CHECK-NEXT: [[VBSL1_I:%.*]] = and <16 x i8> [[V3]], [[TMP0]]
+// CHECK-NEXT: [[VBSL2_I:%.*]] = or disjoint <16 x i8> [[VBSL_I]], [[VBSL1_I]]
+// CHECK-NEXT: ret <16 x i8> [[VBSL2_I]]
+//
int8x16_t test_vbslq_s8(uint8x16_t v1, int8x16_t v2, int8x16_t v3) {
return vbslq_s8(v1, v2, v3);
}
-// CHECK-LABEL: @test_vbslq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %v3 to <16 x i8>
-// CHECK: [[VBSL3_I:%.*]] = and <8 x i16> %v1, %v2
-// CHECK: [[TMP3:%.*]] = xor <8 x i16> %v1, splat (i16 -1)
-// CHECK: [[VBSL4_I:%.*]] = and <8 x i16> [[TMP3]], %v3
-// CHECK: [[VBSL5_I:%.*]] = or <8 x i16> [[VBSL3_I]], [[VBSL4_I]]
-// CHECK: ret <8 x i16> [[VBSL5_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vbslq_s16(
+// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]], <8 x i16> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VBSL3_I:%.*]] = and <8 x i16> [[V1]], [[V2]]
+// CHECK-NEXT: [[TMP0:%.*]] = xor <8 x i16> [[V1]], splat (i16 -1)
+// CHECK-NEXT: [[VBSL4_I:%.*]] = and <8 x i16> [[V3]], [[TMP0]]
+// CHECK-NEXT: [[VBSL5_I:%.*]] = or disjoint <8 x i16> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK-NEXT: ret <8 x i16> [[VBSL5_I]]
+//
int16x8_t test_vbslq_s16(uint16x8_t v1, int16x8_t v2, int16x8_t v3) {
return vbslq_s16(v1, v2, v3);
}
-// CHECK-LABEL: @test_vbslq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %v3 to <16 x i8>
-// CHECK: [[VBSL3_I:%.*]] = and <4 x i32> %v1, %v2
-// CHECK: [[TMP3:%.*]] = xor <4 x i32> %v1, splat (i32 -1)
-// CHECK: [[VBSL4_I:%.*]] = and <4 x i32> [[TMP3]], %v3
-// CHECK: [[VBSL5_I:%.*]] = or <4 x i32> [[VBSL3_I]], [[VBSL4_I]]
-// CHECK: ret <4 x i32> [[VBSL5_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vbslq_s32(
+// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]], <4 x i32> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VBSL3_I:%.*]] = and <4 x i32> [[V1]], [[V2]]
+// CHECK-NEXT: [[TMP0:%.*]] = xor <4 x i32> [[V1]], splat (i32 -1)
+// CHECK-NEXT: [[VBSL4_I:%.*]] = and <4 x i32> [[V3]], [[TMP0]]
+// CHECK-NEXT: [[VBSL5_I:%.*]] = or disjoint <4 x i32> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK-NEXT: ret <4 x i32> [[VBSL5_I]]
+//
int32x4_t test_vbslq_s32(uint32x4_t v1, int32x4_t v2, int32x4_t v3) {
return vbslq_s32(v1, v2, v3);
}
-// CHECK-LABEL: @test_vbslq_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %v1 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %v2 to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i64> %v3 to <16 x i8>
-// CHECK: [[VBSL3_I:%.*]] = and <2 x i64> %v1, %v2
-// CHECK: [[TMP3:%.*]] = xor <2 x i64> %v1, splat (i64 -1)
-// CHECK: [[VBSL4_I:%.*]] = and <2 x i64> [[TMP3]], %v3
-// CHECK: [[VBSL5_I:%.*]] = or <2 x i64> [[VBSL3_I]], [[VBSL4_I]]
-// CHECK: ret <2 x i64> [[VBSL5_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vbslq_s64(
+// CHECK-SAME: <2 x i64> noundef [[V1:%.*]], <2 x i64> noundef [[V2:%.*]], <2 x i64> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VBSL3_I:%.*]] = and <2 x i64> [[V1]], [[V2]]
+// CHECK-NEXT: [[TMP0:%.*]] = xor <2 x i64> [[V1]], splat (i64 -1)
+// CHECK-NEXT: [[VBSL4_I:%.*]] = and <2 x i64> [[V3]], [[TMP0]]
+// CHECK-NEXT: [[VBSL5_I:%.*]] = or disjoint <2 x i64> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK-NEXT: ret <2 x i64> [[VBSL5_I]]
+//
int64x2_t test_vbslq_s64(uint64x2_t v1, int64x2_t v2, int64x2_t v3) {
return vbslq_s64(v1, v2, v3);
}
-// CHECK-LABEL: @test_vbslq_u8(
-// CHECK: [[VBSL_I:%.*]] = and <16 x i8> %v1, %v2
-// CHECK: [[TMP0:%.*]] = xor <16 x i8> %v1, splat (i8 -1)
-// CHECK: [[VBSL1_I:%.*]] = and <16 x i8> [[TMP0]], %v3
-// CHECK: [[VBSL2_I:%.*]] = or <16 x i8> [[VBSL_I]], [[VBSL1_I]]
-// CHECK: ret <16 x i8> [[VBSL2_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vbslq_u8(
+// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]], <16 x i8> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VBSL_I:%.*]] = and <16 x i8> [[V1]], [[V2]]
+// CHECK-NEXT: [[TMP0:%.*]] = xor <16 x i8> [[V1]], splat (i8 -1)
+// CHECK-NEXT: [[VBSL1_I:%.*]] = and <16 x i8> [[V3]], [[TMP0]]
+// CHECK-NEXT: [[VBSL2_I:%.*]] = or disjoint <16 x i8> [[VBSL_I]], [[VBSL1_I]]
+// CHECK-NEXT: ret <16 x i8> [[VBSL2_I]]
+//
uint8x16_t test_vbslq_u8(uint8x16_t v1, uint8x16_t v2, uint8x16_t v3) {
return vbslq_u8(v1, v2, v3);
}
-// CHECK-LABEL: @test_vbslq_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %v3 to <16 x i8>
-// CHECK: [[VBSL3_I:%.*]] = and <8 x i16> %v1, %v2
-// CHECK: [[TMP3:%.*]] = xor <8 x i16> %v1, splat (i16 -1)
-// CHECK: [[VBSL4_I:%.*]] = and <8 x i16> [[TMP3]], %v3
-// CHECK: [[VBSL5_I:%.*]] = or <8 x i16> [[VBSL3_I]], [[VBSL4_I]]
-// CHECK: ret <8 x i16> [[VBSL5_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vbslq_u16(
+// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]], <8 x i16> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VBSL3_I:%.*]] = and <8 x i16> [[V1]], [[V2]]
+// CHECK-NEXT: [[TMP0:%.*]] = xor <8 x i16> [[V1]], splat (i16 -1)
+// CHECK-NEXT: [[VBSL4_I:%.*]] = and <8 x i16> [[V3]], [[TMP0]]
+// CHECK-NEXT: [[VBSL5_I:%.*]] = or disjoint <8 x i16> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK-NEXT: ret <8 x i16> [[VBSL5_I]]
+//
uint16x8_t test_vbslq_u16(uint16x8_t v1, uint16x8_t v2, uint16x8_t v3) {
return vbslq_u16(v1, v2, v3);
}
-// CHECK-LABEL: @test_vbslq_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %v3 to <16 x i8>
-// CHECK: [[VBSL3_I:%.*]] = and <4 x i32> %v1, %v2
-// CHECK: [[TMP3:%.*]] = xor <4 x i32> %v1, splat (i32 -1)
-// CHECK: [[VBSL4_I:%.*]] = and <4 x i32> [[TMP3]], %v3
-// CHECK: [[VBSL5_I:%.*]] = or <4 x i32> [[VBSL3_I]], [[VBSL4_I]]
-// CHECK: ret <4 x i32> [[VBSL5_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vbslq_u32(
+// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]], <4 x i32> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VBSL3_I:%.*]] = and <4 x i32> [[V1]], [[V2]]
+// CHECK-NEXT: [[TMP0:%.*]] = xor <4 x i32> [[V1]], splat (i32 -1)
+// CHECK-NEXT: [[VBSL4_I:%.*]] = and <4 x i32> [[V3]], [[TMP0]]
+// CHECK-NEXT: [[VBSL5_I:%.*]] = or disjoint <4 x i32> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK-NEXT: ret <4 x i32> [[VBSL5_I]]
+//
uint32x4_t test_vbslq_u32(uint32x4_t v1, uint32x4_t v2, uint32x4_t v3) {
  return vbslq_u32(v1, v2, v3);
}
-// CHECK-LABEL: @test_vbslq_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %v1 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %v2 to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i64> %v3 to <16 x i8>
-// CHECK: [[VBSL3_I:%.*]] = and <2 x i64> %v1, %v2
-// CHECK: [[TMP3:%.*]] = xor <2 x i64> %v1, splat (i64 -1)
-// CHECK: [[VBSL4_I:%.*]] = and <2 x i64> [[TMP3]], %v3
-// CHECK: [[VBSL5_I:%.*]] = or <2 x i64> [[VBSL3_I]], [[VBSL4_I]]
-// CHECK: ret <2 x i64> [[VBSL5_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vbslq_u64(
+// CHECK-SAME: <2 x i64> noundef [[V1:%.*]], <2 x i64> noundef [[V2:%.*]], <2 x i64> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VBSL3_I:%.*]] = and <2 x i64> [[V1]], [[V2]]
+// CHECK-NEXT: [[TMP0:%.*]] = xor <2 x i64> [[V1]], splat (i64 -1)
+// CHECK-NEXT: [[VBSL4_I:%.*]] = and <2 x i64> [[V3]], [[TMP0]]
+// CHECK-NEXT: [[VBSL5_I:%.*]] = or disjoint <2 x i64> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK-NEXT: ret <2 x i64> [[VBSL5_I]]
+//
uint64x2_t test_vbslq_u64(uint64x2_t v1, uint64x2_t v2, uint64x2_t v3) {
return vbslq_u64(v1, v2, v3);
}
-// CHECK-LABEL: @test_vbslq_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %v2 to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %v3 to <16 x i8>
-// CHECK: [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
-// CHECK: [[VBSL3_I:%.*]] = and <4 x i32> %v1, [[VBSL1_I]]
-// CHECK: [[TMP3:%.*]] = xor <4 x i32> %v1, splat (i32 -1)
-// CHECK: [[VBSL4_I:%.*]] = and <4 x i32> [[TMP3]], [[VBSL2_I]]
-// CHECK: [[VBSL5_I:%.*]] = or <4 x i32> [[VBSL3_I]], [[VBSL4_I]]
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[VBSL5_I]] to <4 x float>
-// CHECK: ret <4 x float> [[TMP4]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vbslq_f32(
+// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]], <4 x float> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VBSL1_I:%.*]] = bitcast <4 x float> [[V2]] to <4 x i32>
+// CHECK-NEXT: [[VBSL2_I:%.*]] = bitcast <4 x float> [[V3]] to <4 x i32>
+// CHECK-NEXT: [[VBSL3_I:%.*]] = and <4 x i32> [[V1]], [[VBSL1_I]]
+// CHECK-NEXT: [[TMP0:%.*]] = xor <4 x i32> [[V1]], splat (i32 -1)
+// CHECK-NEXT: [[VBSL4_I:%.*]] = and <4 x i32> [[TMP0]], [[VBSL2_I]]
+// CHECK-NEXT: [[VBSL5_I:%.*]] = or disjoint <4 x i32> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[VBSL5_I]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[TMP1]]
+//
float32x4_t test_vbslq_f32(uint32x4_t v1, float32x4_t v2, float32x4_t v3) {
return vbslq_f32(v1, v2, v3);
}
-// CHECK-LABEL: @test_vbslq_p8(
-// CHECK: [[VBSL_I:%.*]] = and <16 x i8> %v1, %v2
-// CHECK: [[TMP0:%.*]] = xor <16 x i8> %v1, splat (i8 -1)
-// CHECK: [[VBSL1_I:%.*]] = and <16 x i8> [[TMP0]], %v3
-// CHECK: [[VBSL2_I:%.*]] = or <16 x i8> [[VBSL_I]], [[VBSL1_I]]
-// CHECK: ret <16 x i8> [[VBSL2_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vbslq_p8(
+// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]], <16 x i8> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VBSL_I:%.*]] = and <16 x i8> [[V1]], [[V2]]
+// CHECK-NEXT: [[TMP0:%.*]] = xor <16 x i8> [[V1]], splat (i8 -1)
+// CHECK-NEXT: [[VBSL1_I:%.*]] = and <16 x i8> [[V3]], [[TMP0]]
+// CHECK-NEXT: [[VBSL2_I:%.*]] = or disjoint <16 x i8> [[VBSL_I]], [[VBSL1_I]]
+// CHECK-NEXT: ret <16 x i8> [[VBSL2_I]]
+//
poly8x16_t test_vbslq_p8(uint8x16_t v1, poly8x16_t v2, poly8x16_t v3) {
return vbslq_p8(v1, v2, v3);
}
-// CHECK-LABEL: @test_vbslq_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %v3 to <16 x i8>
-// CHECK: [[VBSL3_I:%.*]] = and <8 x i16> %v1, %v2
-// CHECK: [[TMP3:%.*]] = xor <8 x i16> %v1, splat (i16 -1)
-// CHECK: [[VBSL4_I:%.*]] = and <8 x i16> [[TMP3]], %v3
-// CHECK: [[VBSL5_I:%.*]] = or <8 x i16> [[VBSL3_I]], [[VBSL4_I]]
-// CHECK: ret <8 x i16> [[VBSL5_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vbslq_p16(
+// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]], <8 x i16> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VBSL3_I:%.*]] = and <8 x i16> [[V1]], [[V2]]
+// CHECK-NEXT: [[TMP0:%.*]] = xor <8 x i16> [[V1]], splat (i16 -1)
+// CHECK-NEXT: [[VBSL4_I:%.*]] = and <8 x i16> [[V3]], [[TMP0]]
+// CHECK-NEXT: [[VBSL5_I:%.*]] = or disjoint <8 x i16> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK-NEXT: ret <8 x i16> [[VBSL5_I]]
+//
poly16x8_t test_vbslq_p16(uint16x8_t v1, poly16x8_t v2, poly16x8_t v3) {
return vbslq_p16(v1, v2, v3);
}
-// CHECK-LABEL: @test_vbslq_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %v1 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %v2 to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x double> %v3 to <16 x i8>
-// CHECK: [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64>
-// CHECK: [[VBSL3_I:%.*]] = and <2 x i64> %v1, [[VBSL1_I]]
-// CHECK: [[TMP3:%.*]] = xor <2 x i64> %v1, splat (i64 -1)
-// CHECK: [[VBSL4_I:%.*]] = and <2 x i64> [[TMP3]], [[VBSL2_I]]
-// CHECK: [[VBSL5_I:%.*]] = or <2 x i64> [[VBSL3_I]], [[VBSL4_I]]
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[VBSL5_I]] to <2 x double>
-// CHECK: ret <2 x double> [[TMP4]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vbslq_f64(
+// CHECK-SAME: <2 x i64> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]], <2 x double> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VBSL1_I:%.*]] = bitcast <2 x double> [[V2]] to <2 x i64>
+// CHECK-NEXT: [[VBSL2_I:%.*]] = bitcast <2 x double> [[V3]] to <2 x i64>
+// CHECK-NEXT: [[VBSL3_I:%.*]] = and <2 x i64> [[V1]], [[VBSL1_I]]
+// CHECK-NEXT: [[TMP0:%.*]] = xor <2 x i64> [[V1]], splat (i64 -1)
+// CHECK-NEXT: [[VBSL4_I:%.*]] = and <2 x i64> [[TMP0]], [[VBSL2_I]]
+// CHECK-NEXT: [[VBSL5_I:%.*]] = or disjoint <2 x i64> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[VBSL5_I]] to <2 x double>
+// CHECK-NEXT: ret <2 x double> [[TMP1]]
+//
float64x2_t test_vbslq_f64(uint64x2_t v1, float64x2_t v2, float64x2_t v3) {
return vbslq_f64(v1, v2, v3);
}
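
A reader aid for the vbsl checks above: the intrinsic is a bitwise select, taking each bit from the second operand where the mask bit is set and from the third where it is clear. Because the two AND halves can never share a set bit, the regenerated checks now expect `or disjoint`. A minimal scalar sketch, assuming those standard semantics (`bsl32` is an illustrative name, not part of the patch):

#include <stdint.h>
#include <stdio.h>

/* Scalar model of the vbsl lowering: (mask & a) | (~mask & b).   */
/* The two halves are bitwise disjoint, which is what the         */
/* `or disjoint` flag in the generated IR records.                */
static uint32_t bsl32(uint32_t mask, uint32_t a, uint32_t b) {
  return (mask & a) | (~mask & b);
}

int main(void) {
  /* Top half taken from a, bottom half from b. */
  printf("%08x\n", bsl32(0xFFFF0000u, 0x12345678u, 0x9ABCDEF0u)); /* 1234def0 */
  return 0;
}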
-// CHECK-LABEL: @test_vrecps_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %v1 to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %v2 to <8 x i8>
-// CHECK: [[VRECPS_V2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.frecps.v2f32(<2 x float> %v1, <2 x float> %v2)
-// CHECK: ret <2 x float> [[VRECPS_V2_I]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vrecps_f32(
+// CHECK-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRECPS_V2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.frecps.v2f32(<2 x float> [[V1]], <2 x float> [[V2]])
+// CHECK-NEXT: ret <2 x float> [[VRECPS_V2_I]]
+//
float32x2_t test_vrecps_f32(float32x2_t v1, float32x2_t v2) {
return vrecps_f32(v1, v2);
}
-// CHECK-LABEL: @test_vrecpsq_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %v1 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %v2 to <16 x i8>
-// CHECK: [[VRECPSQ_V2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.frecps.v4f32(<4 x float> %v1, <4 x float> %v2)
-// CHECK: [[VRECPSQ_V3_I:%.*]] = bitcast <4 x float> [[VRECPSQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x float> [[VRECPSQ_V2_I]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vrecpsq_f32(
+// CHECK-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRECPSQ_V2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.frecps.v4f32(<4 x float> [[V1]], <4 x float> [[V2]])
+// CHECK-NEXT: ret <4 x float> [[VRECPSQ_V2_I]]
+//
float32x4_t test_vrecpsq_f32(float32x4_t v1, float32x4_t v2) {
return vrecpsq_f32(v1, v2);
}
-// CHECK-LABEL: @test_vrecpsq_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %v1 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %v2 to <16 x i8>
-// CHECK: [[VRECPSQ_V2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.frecps.v2f64(<2 x double> %v1, <2 x double> %v2)
-// CHECK: [[VRECPSQ_V3_I:%.*]] = bitcast <2 x double> [[VRECPSQ_V2_I]] to <16 x i8>
-// CHECK: ret <2 x double> [[VRECPSQ_V2_I]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vrecpsq_f64(
+// CHECK-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRECPSQ_V2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.frecps.v2f64(<2 x double> [[V1]], <2 x double> [[V2]])
+// CHECK-NEXT: ret <2 x double> [[VRECPSQ_V2_I]]
+//
float64x2_t test_vrecpsq_f64(float64x2_t v1, float64x2_t v2) {
return vrecpsq_f64(v1, v2);
}
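
For context on the frecps checks: FRECPS(a, b) evaluates 2 - a*b, the correction factor of one Newton-Raphson step for a reciprocal. A scalar sketch of how the step is typically used (`frecps_model` is an invented helper, assuming the Arm-documented semantics):

#include <stdio.h>

/* Scalar model of FRECPS: the Newton-Raphson factor 2 - a*b. */
static float frecps_model(float a, float b) { return 2.0f - a * b; }

int main(void) {
  float a = 3.0f;
  float x = 0.3f;               /* crude initial estimate of 1/3 */
  for (int i = 0; i < 3; ++i)
    x *= frecps_model(a, x);    /* x = x * (2 - a*x) */
  printf("%f\n", x);            /* converges toward 0.333333 */
  return 0;
}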
-// CHECK-LABEL: @test_vrsqrts_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %v1 to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %v2 to <8 x i8>
-// CHECK: [[VRSQRTS_V2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.frsqrts.v2f32(<2 x float> %v1, <2 x float> %v2)
-// CHECK: [[VRSQRTS_V3_I:%.*]] = bitcast <2 x float> [[VRSQRTS_V2_I]] to <8 x i8>
-// CHECK: ret <2 x float> [[VRSQRTS_V2_I]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vrsqrts_f32(
+// CHECK-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSQRTS_V2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.frsqrts.v2f32(<2 x float> [[V1]], <2 x float> [[V2]])
+// CHECK-NEXT: ret <2 x float> [[VRSQRTS_V2_I]]
+//
float32x2_t test_vrsqrts_f32(float32x2_t v1, float32x2_t v2) {
return vrsqrts_f32(v1, v2);
}
-// CHECK-LABEL: @test_vrsqrtsq_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %v1 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %v2 to <16 x i8>
-// CHECK: [[VRSQRTSQ_V2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.frsqrts.v4f32(<4 x float> %v1, <4 x float> %v2)
-// CHECK: [[VRSQRTSQ_V3_I:%.*]] = bitcast <4 x float> [[VRSQRTSQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x float> [[VRSQRTSQ_V2_I]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vrsqrtsq_f32(
+// CHECK-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSQRTSQ_V2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.frsqrts.v4f32(<4 x float> [[V1]], <4 x float> [[V2]])
+// CHECK-NEXT: ret <4 x float> [[VRSQRTSQ_V2_I]]
+//
float32x4_t test_vrsqrtsq_f32(float32x4_t v1, float32x4_t v2) {
return vrsqrtsq_f32(v1, v2);
}
-// CHECK-LABEL: @test_vrsqrtsq_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %v1 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %v2 to <16 x i8>
-// CHECK: [[VRSQRTSQ_V2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.frsqrts.v2f64(<2 x double> %v1, <2 x double> %v2)
-// CHECK: [[VRSQRTSQ_V3_I:%.*]] = bitcast <2 x double> [[VRSQRTSQ_V2_I]] to <16 x i8>
-// CHECK: ret <2 x double> [[VRSQRTSQ_V2_I]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vrsqrtsq_f64(
+// CHECK-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSQRTSQ_V2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.frsqrts.v2f64(<2 x double> [[V1]], <2 x double> [[V2]])
+// CHECK-NEXT: ret <2 x double> [[VRSQRTSQ_V2_I]]
+//
float64x2_t test_vrsqrtsq_f64(float64x2_t v1, float64x2_t v2) {
return vrsqrtsq_f64(v1, v2);
}
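
Likewise, FRSQRTS(a, b) evaluates (3 - a*b) / 2, the step factor for refining a reciprocal square-root estimate. A scalar sketch under the same caveat (invented helper name, Arm-documented semantics assumed):

#include <stdio.h>

/* Scalar model of FRSQRTS: the Newton-Raphson factor (3 - a*b) / 2. */
static float frsqrts_model(float a, float b) { return (3.0f - a * b) / 2.0f; }

int main(void) {
  float d = 2.0f;
  float x = 0.7f;                  /* crude initial estimate of 1/sqrt(2) */
  for (int i = 0; i < 3; ++i)
    x *= frsqrts_model(d * x, x);  /* x = x * (3 - d*x*x)/2 */
  printf("%f\n", x);               /* converges toward 0.707107 */
  return 0;
}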
-// CHECK-LABEL: @test_vcage_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %v1 to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %v2 to <8 x i8>
-// CHECK: [[VCAGE_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.facge.v2i32.v2f32(<2 x float> %v1, <2 x float> %v2)
-// CHECK: ret <2 x i32> [[VCAGE_V2_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vcage_f32(
+// CHECK-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCAGE_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.facge.v2i32.v2f32(<2 x float> [[V1]], <2 x float> [[V2]])
+// CHECK-NEXT: ret <2 x i32> [[VCAGE_V2_I]]
+//
uint32x2_t test_vcage_f32(float32x2_t v1, float32x2_t v2) {
return vcage_f32(v1, v2);
}
-// CHECK-LABEL: @test_vcage_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
-// CHECK: [[VCAGE_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.facge.v1i64.v1f64(<1 x double> %a, <1 x double> %b)
-// CHECK: ret <1 x i64> [[VCAGE_V2_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vcage_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCAGE_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.facge.v1i64.v1f64(<1 x double> [[A]], <1 x double> [[B]])
+// CHECK-NEXT: ret <1 x i64> [[VCAGE_V2_I]]
+//
uint64x1_t test_vcage_f64(float64x1_t a, float64x1_t b) {
return vcage_f64(a, b);
}
-// CHECK-LABEL: @test_vcageq_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %v1 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %v2 to <16 x i8>
-// CHECK: [[VCAGEQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.facge.v4i32.v4f32(<4 x float> %v1, <4 x float> %v2)
-// CHECK: ret <4 x i32> [[VCAGEQ_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vcageq_f32(
+// CHECK-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCAGEQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.facge.v4i32.v4f32(<4 x float> [[V1]], <4 x float> [[V2]])
+// CHECK-NEXT: ret <4 x i32> [[VCAGEQ_V2_I]]
+//
uint32x4_t test_vcageq_f32(float32x4_t v1, float32x4_t v2) {
return vcageq_f32(v1, v2);
}
-// CHECK-LABEL: @test_vcageq_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %v1 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %v2 to <16 x i8>
-// CHECK: [[VCAGEQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.facge.v2i64.v2f64(<2 x double> %v1, <2 x double> %v2)
-// CHECK: ret <2 x i64> [[VCAGEQ_V2_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vcageq_f64(
+// CHECK-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCAGEQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.facge.v2i64.v2f64(<2 x double> [[V1]], <2 x double> [[V2]])
+// CHECK-NEXT: ret <2 x i64> [[VCAGEQ_V2_I]]
+//
uint64x2_t test_vcageq_f64(float64x2_t v1, float64x2_t v2) {
return vcageq_f64(v1, v2);
}
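
The vcage/vcageq tests exercise FACGE, an absolute-value compare: a lane becomes all-ones when |a| >= |b| and all-zeros otherwise. A scalar model, assuming those semantics (`facge_model` is illustrative only):

#include <math.h>
#include <stdint.h>
#include <stdio.h>

/* Scalar model of FACGE: all-ones lane iff |a| >= |b|. */
static uint32_t facge_model(float a, float b) {
  return fabsf(a) >= fabsf(b) ? 0xFFFFFFFFu : 0u;
}

int main(void) {
  printf("%08x %08x\n",
         facge_model(-2.0f, 1.5f),   /* |a| >= |b| -> ffffffff */
         facge_model(0.5f, -1.0f));  /* |a| <  |b| -> 00000000 */
  return 0;
}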
-// CHECK-LABEL: @test_vcagt_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %v1 to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %v2 to <8 x i8>
-// CHECK: [[VCAGT_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.facgt.v2i32.v2f32(<2 x float> %v1, <2 x float> %v2)
-// CHECK: ret <2 x i32> [[VCAGT_V2_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vcagt_f32(
+// CHECK-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCAGT_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.facgt.v2i32.v2f32(<2 x float> [[V1]], <2 x float> [[V2]])
+// CHECK-NEXT: ret <2 x i32> [[VCAGT_V2_I]]
+//
uint32x2_t test_vcagt_f32(float32x2_t v1, float32x2_t v2) {
return vcagt_f32(v1, v2);
}
-// CHECK-LABEL: @test_vcagt_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
-// CHECK: [[VCAGT_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.facgt.v1i64.v1f64(<1 x double> %a, <1 x double> %b)
-// CHECK: ret <1 x i64> [[VCAGT_V2_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vcagt_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCAGT_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.facgt.v1i64.v1f64(<1 x double> [[A]], <1 x double> [[B]])
+// CHECK-NEXT: ret <1 x i64> [[VCAGT_V2_I]]
+//
uint64x1_t test_vcagt_f64(float64x1_t a, float64x1_t b) {
return vcagt_f64(a, b);
}
-// CHECK-LABEL: @test_vcagtq_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %v1 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %v2 to <16 x i8>
-// CHECK: [[VCAGTQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.facgt.v4i32.v4f32(<4 x float> %v1, <4 x float> %v2)
-// CHECK: ret <4 x i32> [[VCAGTQ_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vcagtq_f32(
+// CHECK-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCAGTQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.facgt.v4i32.v4f32(<4 x float> [[V1]], <4 x float> [[V2]])
+// CHECK-NEXT: ret <4 x i32> [[VCAGTQ_V2_I]]
+//
uint32x4_t test_vcagtq_f32(float32x4_t v1, float32x4_t v2) {
return vcagtq_f32(v1, v2);
}
-// CHECK-LABEL: @test_vcagtq_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %v1 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %v2 to <16 x i8>
-// CHECK: [[VCAGTQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.facgt.v2i64.v2f64(<2 x double> %v1, <2 x double> %v2)
-// CHECK: ret <2 x i64> [[VCAGTQ_V2_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vcagtq_f64(
+// CHECK-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCAGTQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.facgt.v2i64.v2f64(<2 x double> [[V1]], <2 x double> [[V2]])
+// CHECK-NEXT: ret <2 x i64> [[VCAGTQ_V2_I]]
+//
uint64x2_t test_vcagtq_f64(float64x2_t v1, float64x2_t v2) {
return vcagtq_f64(v1, v2);
}
-// CHECK-LABEL: @test_vcale_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %v1 to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %v2 to <8 x i8>
-// CHECK: [[VCALE_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.facge.v2i32.v2f32(<2 x float> %v2, <2 x float> %v1)
-// CHECK: ret <2 x i32> [[VCALE_V2_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vcale_f32(
+// CHECK-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCALE_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.facge.v2i32.v2f32(<2 x float> [[V2]], <2 x float> [[V1]])
+// CHECK-NEXT: ret <2 x i32> [[VCALE_V2_I]]
+//
uint32x2_t test_vcale_f32(float32x2_t v1, float32x2_t v2) {
return vcale_f32(v1, v2);
// Using registers other than v0, v1 is possible, but would be odd.
}
-// CHECK-LABEL: @test_vcale_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
-// CHECK: [[VCALE_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.facge.v1i64.v1f64(<1 x double> %b, <1 x double> %a)
-// CHECK: ret <1 x i64> [[VCALE_V2_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vcale_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCALE_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.facge.v1i64.v1f64(<1 x double> [[B]], <1 x double> [[A]])
+// CHECK-NEXT: ret <1 x i64> [[VCALE_V2_I]]
+//
uint64x1_t test_vcale_f64(float64x1_t a, float64x1_t b) {
return vcale_f64(a, b);
}
-// CHECK-LABEL: @test_vcaleq_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %v1 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %v2 to <16 x i8>
-// CHECK: [[VCALEQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.facge.v4i32.v4f32(<4 x float> %v2, <4 x float> %v1)
-// CHECK: ret <4 x i32> [[VCALEQ_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vcaleq_f32(
+// CHECK-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCALEQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.facge.v4i32.v4f32(<4 x float> [[V2]], <4 x float> [[V1]])
+// CHECK-NEXT: ret <4 x i32> [[VCALEQ_V2_I]]
+//
uint32x4_t test_vcaleq_f32(float32x4_t v1, float32x4_t v2) {
return vcaleq_f32(v1, v2);
// Using registers other than v0, v1 is possible, but would be odd.
}
-// CHECK-LABEL: @test_vcaleq_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %v1 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %v2 to <16 x i8>
-// CHECK: [[VCALEQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.facge.v2i64.v2f64(<2 x double> %v2, <2 x double> %v1)
-// CHECK: ret <2 x i64> [[VCALEQ_V2_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vcaleq_f64(
+// CHECK-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCALEQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.facge.v2i64.v2f64(<2 x double> [[V2]], <2 x double> [[V1]])
+// CHECK-NEXT: ret <2 x i64> [[VCALEQ_V2_I]]
+//
uint64x2_t test_vcaleq_f64(float64x2_t v1, float64x2_t v2) {
return vcaleq_f64(v1, v2);
// Using registers other than v0, v1 is possible, but would be odd.
}
-// CHECK-LABEL: @test_vcalt_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %v1 to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %v2 to <8 x i8>
-// CHECK: [[VCALT_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.facgt.v2i32.v2f32(<2 x float> %v2, <2 x float> %v1)
-// CHECK: ret <2 x i32> [[VCALT_V2_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vcalt_f32(
+// CHECK-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCALT_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.facgt.v2i32.v2f32(<2 x float> [[V2]], <2 x float> [[V1]])
+// CHECK-NEXT: ret <2 x i32> [[VCALT_V2_I]]
+//
uint32x2_t test_vcalt_f32(float32x2_t v1, float32x2_t v2) {
return vcalt_f32(v1, v2);
// Using registers other than v0, v1 is possible, but would be odd.
}
-// CHECK-LABEL: @test_vcalt_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
-// CHECK: [[VCALT_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.facgt.v1i64.v1f64(<1 x double> %b, <1 x double> %a)
-// CHECK: ret <1 x i64> [[VCALT_V2_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vcalt_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCALT_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.facgt.v1i64.v1f64(<1 x double> [[B]], <1 x double> [[A]])
+// CHECK-NEXT: ret <1 x i64> [[VCALT_V2_I]]
+//
uint64x1_t test_vcalt_f64(float64x1_t a, float64x1_t b) {
return vcalt_f64(a, b);
}
-// CHECK-LABEL: @test_vcaltq_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %v1 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %v2 to <16 x i8>
-// CHECK: [[VCALTQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.facgt.v4i32.v4f32(<4 x float> %v2, <4 x float> %v1)
-// CHECK: ret <4 x i32> [[VCALTQ_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vcaltq_f32(
+// CHECK-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCALTQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.facgt.v4i32.v4f32(<4 x float> [[V2]], <4 x float> [[V1]])
+// CHECK-NEXT: ret <4 x i32> [[VCALTQ_V2_I]]
+//
uint32x4_t test_vcaltq_f32(float32x4_t v1, float32x4_t v2) {
return vcaltq_f32(v1, v2);
// Using registers other than v0, v1 is possible, but would be odd.
}
-// CHECK-LABEL: @test_vcaltq_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %v1 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %v2 to <16 x i8>
-// CHECK: [[VCALTQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.facgt.v2i64.v2f64(<2 x double> %v2, <2 x double> %v1)
-// CHECK: ret <2 x i64> [[VCALTQ_V2_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vcaltq_f64(
+// CHECK-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCALTQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.facgt.v2i64.v2f64(<2 x double> [[V2]], <2 x double> [[V1]])
+// CHECK-NEXT: ret <2 x i64> [[VCALTQ_V2_I]]
+//
uint64x2_t test_vcaltq_f64(float64x2_t v1, float64x2_t v2) {
return vcaltq_f64(v1, v2);
// Using registers other than v0, v1 is possible, but would be odd.
}
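
Note the operand order in the vcale/vcalt checks above: there are no FACLE/FACLT instructions, so the "less" forms are emitted as facge/facgt with the operands swapped, since |a| <= |b| is the same as |b| >= |a|. A scalar sketch of the vcalt case (names invented for illustration):

#include <math.h>
#include <stdint.h>
#include <stdio.h>

/* FACGT models |a| > |b|; vcalt(a, b) is then just facgt(b, a). */
static uint32_t facgt_model(float a, float b) {
  return fabsf(a) > fabsf(b) ? 0xFFFFFFFFu : 0u;
}

static uint32_t vcalt_model(float a, float b) {
  return facgt_model(b, a);   /* swapped operands, as in the CHECK lines */
}

int main(void) {
  printf("%08x\n", vcalt_model(0.5f, -1.0f)); /* |0.5| < |1.0| -> ffffffff */
  return 0;
}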
-// CHECK-LABEL: @test_vtst_s8(
-// CHECK: [[TMP0:%.*]] = and <8 x i8> %v1, %v2
-// CHECK: [[TMP1:%.*]] = icmp ne <8 x i8> [[TMP0]], zeroinitializer
-// CHECK: [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i8>
-// CHECK: ret <8 x i8> [[VTST_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vtst_s8(
+// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = and <8 x i8> [[V1]], [[V2]]
+// CHECK-NEXT: [[TMP1:%.*]] = icmp ne <8 x i8> [[TMP0]], zeroinitializer
+// CHECK-NEXT: [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[VTST_I]]
+//
uint8x8_t test_vtst_s8(int8x8_t v1, int8x8_t v2) {
return vtst_s8(v1, v2);
}
-// CHECK-LABEL: @test_vtst_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
-// CHECK: [[TMP2:%.*]] = and <4 x i16> %v1, %v2
-// CHECK: [[TMP3:%.*]] = icmp ne <4 x i16> [[TMP2]], zeroinitializer
-// CHECK: [[VTST_I:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i16>
-// CHECK: ret <4 x i16> [[VTST_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vtst_s16(
+// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = and <4 x i16> [[V1]], [[V2]]
+// CHECK-NEXT: [[TMP1:%.*]] = icmp ne <4 x i16> [[TMP0]], zeroinitializer
+// CHECK-NEXT: [[VTST_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[VTST_I]]
+//
uint16x4_t test_vtst_s16(int16x4_t v1, int16x4_t v2) {
return vtst_s16(v1, v2);
}
-// CHECK-LABEL: @test_vtst_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8>
-// CHECK: [[TMP2:%.*]] = and <2 x i32> %v1, %v2
-// CHECK: [[TMP3:%.*]] = icmp ne <2 x i32> [[TMP2]], zeroinitializer
-// CHECK: [[VTST_I:%.*]] = sext <2 x i1> [[TMP3]] to <2 x i32>
-// CHECK: ret <2 x i32> [[VTST_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vtst_s32(
+// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = and <2 x i32> [[V1]], [[V2]]
+// CHECK-NEXT: [[TMP1:%.*]] = icmp ne <2 x i32> [[TMP0]], zeroinitializer
+// CHECK-NEXT: [[VTST_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[VTST_I]]
+//
uint32x2_t test_vtst_s32(int32x2_t v1, int32x2_t v2) {
return vtst_s32(v1, v2);
}
-// CHECK-LABEL: @test_vtst_u8(
-// CHECK: [[TMP0:%.*]] = and <8 x i8> %v1, %v2
-// CHECK: [[TMP1:%.*]] = icmp ne <8 x i8> [[TMP0]], zeroinitializer
-// CHECK: [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i8>
-// CHECK: ret <8 x i8> [[VTST_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vtst_u8(
+// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = and <8 x i8> [[V1]], [[V2]]
+// CHECK-NEXT: [[TMP1:%.*]] = icmp ne <8 x i8> [[TMP0]], zeroinitializer
+// CHECK-NEXT: [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[VTST_I]]
+//
uint8x8_t test_vtst_u8(uint8x8_t v1, uint8x8_t v2) {
return vtst_u8(v1, v2);
}
-// CHECK-LABEL: @test_vtst_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
-// CHECK: [[TMP2:%.*]] = and <4 x i16> %v1, %v2
-// CHECK: [[TMP3:%.*]] = icmp ne <4 x i16> [[TMP2]], zeroinitializer
-// CHECK: [[VTST_I:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i16>
-// CHECK: ret <4 x i16> [[VTST_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vtst_u16(
+// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = and <4 x i16> [[V1]], [[V2]]
+// CHECK-NEXT: [[TMP1:%.*]] = icmp ne <4 x i16> [[TMP0]], zeroinitializer
+// CHECK-NEXT: [[VTST_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[VTST_I]]
+//
uint16x4_t test_vtst_u16(uint16x4_t v1, uint16x4_t v2) {
return vtst_u16(v1, v2);
}
-// CHECK-LABEL: @test_vtst_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8>
-// CHECK: [[TMP2:%.*]] = and <2 x i32> %v1, %v2
-// CHECK: [[TMP3:%.*]] = icmp ne <2 x i32> [[TMP2]], zeroinitializer
-// CHECK: [[VTST_I:%.*]] = sext <2 x i1> [[TMP3]] to <2 x i32>
-// CHECK: ret <2 x i32> [[VTST_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vtst_u32(
+// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = and <2 x i32> [[V1]], [[V2]]
+// CHECK-NEXT: [[TMP1:%.*]] = icmp ne <2 x i32> [[TMP0]], zeroinitializer
+// CHECK-NEXT: [[VTST_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[VTST_I]]
+//
uint32x2_t test_vtst_u32(uint32x2_t v1, uint32x2_t v2) {
return vtst_u32(v1, v2);
}
-// CHECK-LABEL: @test_vtstq_s8(
-// CHECK: [[TMP0:%.*]] = and <16 x i8> %v1, %v2
-// CHECK: [[TMP1:%.*]] = icmp ne <16 x i8> [[TMP0]], zeroinitializer
-// CHECK: [[VTST_I:%.*]] = sext <16 x i1> [[TMP1]] to <16 x i8>
-// CHECK: ret <16 x i8> [[VTST_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vtstq_s8(
+// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = and <16 x i8> [[V1]], [[V2]]
+// CHECK-NEXT: [[TMP1:%.*]] = icmp ne <16 x i8> [[TMP0]], zeroinitializer
+// CHECK-NEXT: [[VTST_I:%.*]] = sext <16 x i1> [[TMP1]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[VTST_I]]
+//
uint8x16_t test_vtstq_s8(int8x16_t v1, int8x16_t v2) {
return vtstq_s8(v1, v2);
}
-// CHECK-LABEL: @test_vtstq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
-// CHECK: [[TMP2:%.*]] = and <8 x i16> %v1, %v2
-// CHECK: [[TMP3:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer
-// CHECK: [[VTST_I:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i16>
-// CHECK: ret <8 x i16> [[VTST_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vtstq_s16(
+// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = and <8 x i16> [[V1]], [[V2]]
+// CHECK-NEXT: [[TMP1:%.*]] = icmp ne <8 x i16> [[TMP0]], zeroinitializer
+// CHECK-NEXT: [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[VTST_I]]
+//
uint16x8_t test_vtstq_s16(int16x8_t v1, int16x8_t v2) {
return vtstq_s16(v1, v2);
}
-// CHECK-LABEL: @test_vtstq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8>
-// CHECK: [[TMP2:%.*]] = and <4 x i32> %v1, %v2
-// CHECK: [[TMP3:%.*]] = icmp ne <4 x i32> [[TMP2]], zeroinitializer
-// CHECK: [[VTST_I:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i32>
-// CHECK: ret <4 x i32> [[VTST_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vtstq_s32(
+// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = and <4 x i32> [[V1]], [[V2]]
+// CHECK-NEXT: [[TMP1:%.*]] = icmp ne <4 x i32> [[TMP0]], zeroinitializer
+// CHECK-NEXT: [[VTST_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[VTST_I]]
+//
uint32x4_t test_vtstq_s32(int32x4_t v1, int32x4_t v2) {
return vtstq_s32(v1, v2);
}
-// CHECK-LABEL: @test_vtstq_u8(
-// CHECK: [[TMP0:%.*]] = and <16 x i8> %v1, %v2
-// CHECK: [[TMP1:%.*]] = icmp ne <16 x i8> [[TMP0]], zeroinitializer
-// CHECK: [[VTST_I:%.*]] = sext <16 x i1> [[TMP1]] to <16 x i8>
-// CHECK: ret <16 x i8> [[VTST_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vtstq_u8(
+// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = and <16 x i8> [[V1]], [[V2]]
+// CHECK-NEXT: [[TMP1:%.*]] = icmp ne <16 x i8> [[TMP0]], zeroinitializer
+// CHECK-NEXT: [[VTST_I:%.*]] = sext <16 x i1> [[TMP1]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[VTST_I]]
+//
uint8x16_t test_vtstq_u8(uint8x16_t v1, uint8x16_t v2) {
return vtstq_u8(v1, v2);
}
-// CHECK-LABEL: @test_vtstq_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
-// CHECK: [[TMP2:%.*]] = and <8 x i16> %v1, %v2
-// CHECK: [[TMP3:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer
-// CHECK: [[VTST_I:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i16>
-// CHECK: ret <8 x i16> [[VTST_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vtstq_u16(
+// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = and <8 x i16> [[V1]], [[V2]]
+// CHECK-NEXT: [[TMP1:%.*]] = icmp ne <8 x i16> [[TMP0]], zeroinitializer
+// CHECK-NEXT: [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[VTST_I]]
+//
uint16x8_t test_vtstq_u16(uint16x8_t v1, uint16x8_t v2) {
return vtstq_u16(v1, v2);
}
-// CHECK-LABEL: @test_vtstq_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8>
-// CHECK: [[TMP2:%.*]] = and <4 x i32> %v1, %v2
-// CHECK: [[TMP3:%.*]] = icmp ne <4 x i32> [[TMP2]], zeroinitializer
-// CHECK: [[VTST_I:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i32>
-// CHECK: ret <4 x i32> [[VTST_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vtstq_u32(
+// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = and <4 x i32> [[V1]], [[V2]]
+// CHECK-NEXT: [[TMP1:%.*]] = icmp ne <4 x i32> [[TMP0]], zeroinitializer
+// CHECK-NEXT: [[VTST_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[VTST_I]]
+//
uint32x4_t test_vtstq_u32(uint32x4_t v1, uint32x4_t v2) {
return vtstq_u32(v1, v2);
}
-// CHECK-LABEL: @test_vtstq_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %v1 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %v2 to <16 x i8>
-// CHECK: [[TMP2:%.*]] = and <2 x i64> %v1, %v2
-// CHECK: [[TMP3:%.*]] = icmp ne <2 x i64> [[TMP2]], zeroinitializer
-// CHECK: [[VTST_I:%.*]] = sext <2 x i1> [[TMP3]] to <2 x i64>
-// CHECK: ret <2 x i64> [[VTST_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vtstq_s64(
+// CHECK-SAME: <2 x i64> noundef [[V1:%.*]], <2 x i64> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = and <2 x i64> [[V1]], [[V2]]
+// CHECK-NEXT: [[TMP1:%.*]] = icmp ne <2 x i64> [[TMP0]], zeroinitializer
+// CHECK-NEXT: [[VTST_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[VTST_I]]
+//
uint64x2_t test_vtstq_s64(int64x2_t v1, int64x2_t v2) {
return vtstq_s64(v1, v2);
}
-// CHECK-LABEL: @test_vtstq_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %v1 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %v2 to <16 x i8>
-// CHECK: [[TMP2:%.*]] = and <2 x i64> %v1, %v2
-// CHECK: [[TMP3:%.*]] = icmp ne <2 x i64> [[TMP2]], zeroinitializer
-// CHECK: [[VTST_I:%.*]] = sext <2 x i1> [[TMP3]] to <2 x i64>
-// CHECK: ret <2 x i64> [[VTST_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vtstq_u64(
+// CHECK-SAME: <2 x i64> noundef [[V1:%.*]], <2 x i64> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = and <2 x i64> [[V1]], [[V2]]
+// CHECK-NEXT: [[TMP1:%.*]] = icmp ne <2 x i64> [[TMP0]], zeroinitializer
+// CHECK-NEXT: [[VTST_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[VTST_I]]
+//
uint64x2_t test_vtstq_u64(uint64x2_t v1, uint64x2_t v2) {
return vtstq_u64(v1, v2);
}
-// CHECK-LABEL: @test_vtst_p8(
-// CHECK: [[TMP0:%.*]] = and <8 x i8> %v1, %v2
-// CHECK: [[TMP1:%.*]] = icmp ne <8 x i8> [[TMP0]], zeroinitializer
-// CHECK: [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i8>
-// CHECK: ret <8 x i8> [[VTST_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vtst_p8(
+// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = and <8 x i8> [[V1]], [[V2]]
+// CHECK-NEXT: [[TMP1:%.*]] = icmp ne <8 x i8> [[TMP0]], zeroinitializer
+// CHECK-NEXT: [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[VTST_I]]
+//
uint8x8_t test_vtst_p8(poly8x8_t v1, poly8x8_t v2) {
return vtst_p8(v1, v2);
}
-// CHECK-LABEL: @test_vtst_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
-// CHECK: [[TMP2:%.*]] = and <4 x i16> %v1, %v2
-// CHECK: [[TMP3:%.*]] = icmp ne <4 x i16> [[TMP2]], zeroinitializer
-// CHECK: [[VTST_I:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i16>
-// CHECK: ret <4 x i16> [[VTST_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vtst_p16(
+// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = and <4 x i16> [[V1]], [[V2]]
+// CHECK-NEXT: [[TMP1:%.*]] = icmp ne <4 x i16> [[TMP0]], zeroinitializer
+// CHECK-NEXT: [[VTST_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[VTST_I]]
+//
uint16x4_t test_vtst_p16(poly16x4_t v1, poly16x4_t v2) {
return vtst_p16(v1, v2);
}
-// CHECK-LABEL: @test_vtstq_p8(
-// CHECK: [[TMP0:%.*]] = and <16 x i8> %v1, %v2
-// CHECK: [[TMP1:%.*]] = icmp ne <16 x i8> [[TMP0]], zeroinitializer
-// CHECK: [[VTST_I:%.*]] = sext <16 x i1> [[TMP1]] to <16 x i8>
-// CHECK: ret <16 x i8> [[VTST_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vtstq_p8(
+// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = and <16 x i8> [[V1]], [[V2]]
+// CHECK-NEXT: [[TMP1:%.*]] = icmp ne <16 x i8> [[TMP0]], zeroinitializer
+// CHECK-NEXT: [[VTST_I:%.*]] = sext <16 x i1> [[TMP1]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[VTST_I]]
+//
uint8x16_t test_vtstq_p8(poly8x16_t v1, poly8x16_t v2) {
return vtstq_p8(v1, v2);
}
-// CHECK-LABEL: @test_vtstq_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
-// CHECK: [[TMP2:%.*]] = and <8 x i16> %v1, %v2
-// CHECK: [[TMP3:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer
-// CHECK: [[VTST_I:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i16>
-// CHECK: ret <8 x i16> [[VTST_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vtstq_p16(
+// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = and <8 x i16> [[V1]], [[V2]]
+// CHECK-NEXT: [[TMP1:%.*]] = icmp ne <8 x i16> [[TMP0]], zeroinitializer
+// CHECK-NEXT: [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[VTST_I]]
+//
uint16x8_t test_vtstq_p16(poly16x8_t v1, poly16x8_t v2) {
return vtstq_p16(v1, v2);
}
-// CHECK-LABEL: @test_vtst_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = and <1 x i64> %a, %b
-// CHECK: [[TMP3:%.*]] = icmp ne <1 x i64> [[TMP2]], zeroinitializer
-// CHECK: [[VTST_I:%.*]] = sext <1 x i1> [[TMP3]] to <1 x i64>
-// CHECK: ret <1 x i64> [[VTST_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vtst_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = and <1 x i64> [[A]], [[B]]
+// CHECK-NEXT: [[TMP1:%.*]] = icmp ne <1 x i64> [[TMP0]], zeroinitializer
+// CHECK-NEXT: [[VTST_I:%.*]] = sext <1 x i1> [[TMP1]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[VTST_I]]
+//
uint64x1_t test_vtst_s64(int64x1_t a, int64x1_t b) {
return vtst_s64(a, b);
}
-// CHECK-LABEL: @test_vtst_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = and <1 x i64> %a, %b
-// CHECK: [[TMP3:%.*]] = icmp ne <1 x i64> [[TMP2]], zeroinitializer
-// CHECK: [[VTST_I:%.*]] = sext <1 x i1> [[TMP3]] to <1 x i64>
-// CHECK: ret <1 x i64> [[VTST_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vtst_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = and <1 x i64> [[A]], [[B]]
+// CHECK-NEXT: [[TMP1:%.*]] = icmp ne <1 x i64> [[TMP0]], zeroinitializer
+// CHECK-NEXT: [[VTST_I:%.*]] = sext <1 x i1> [[TMP1]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[VTST_I]]
+//
uint64x1_t test_vtst_u64(uint64x1_t a, uint64x1_t b) {
return vtst_u64(a, b);
}
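
All of the vtst variants follow one lowering: AND the inputs, compare the result against zero, then sign-extend the i1 lanes, so a lane is all-ones exactly when the two inputs share at least one set bit. A scalar sketch of that pattern:

#include <stdint.h>
#include <stdio.h>

/* Scalar model of the vtst lowering: and + icmp ne 0 + sext. */
static uint8_t vtst8_model(uint8_t a, uint8_t b) {
  return (a & b) != 0 ? 0xFFu : 0x00u;   /* sext i1 -> all-ones or zero */
}

int main(void) {
  printf("%02x %02x\n",
         vtst8_model(0x0F, 0x10),   /* no common bit -> 00 */
         vtst8_model(0x0F, 0x11));  /* bit 0 shared  -> ff */
  return 0;
}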
-// CHECK-LABEL: @test_vceq_s8(
-// CHECK: [[CMP_I:%.*]] = icmp eq <8 x i8> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
-// CHECK: ret <8 x i8> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vceq_s8(
+// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <8 x i8> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[SEXT_I]]
+//
uint8x8_t test_vceq_s8(int8x8_t v1, int8x8_t v2) {
return vceq_s8(v1, v2);
}
-// CHECK-LABEL: @test_vceq_s16(
-// CHECK: [[CMP_I:%.*]] = icmp eq <4 x i16> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vceq_s16(
+// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <4 x i16> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[SEXT_I]]
+//
uint16x4_t test_vceq_s16(int16x4_t v1, int16x4_t v2) {
return vceq_s16(v1, v2);
}
-// CHECK-LABEL: @test_vceq_s32(
-// CHECK: [[CMP_I:%.*]] = icmp eq <2 x i32> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vceq_s32(
+// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <2 x i32> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[SEXT_I]]
+//
uint32x2_t test_vceq_s32(int32x2_t v1, int32x2_t v2) {
return vceq_s32(v1, v2);
}
-// CHECK-LABEL: @test_vceq_s64(
-// CHECK: [[CMP_I:%.*]] = icmp eq <1 x i64> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
-// CHECK: ret <1 x i64> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vceq_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <1 x i64> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[SEXT_I]]
+//
uint64x1_t test_vceq_s64(int64x1_t a, int64x1_t b) {
return vceq_s64(a, b);
}
-// CHECK-LABEL: @test_vceq_u64(
-// CHECK: [[CMP_I:%.*]] = icmp eq <1 x i64> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
-// CHECK: ret <1 x i64> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vceq_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <1 x i64> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[SEXT_I]]
+//
uint64x1_t test_vceq_u64(uint64x1_t a, uint64x1_t b) {
return vceq_u64(a, b);
}
-// CHECK-LABEL: @test_vceq_f32(
-// CHECK: [[CMP_I:%.*]] = fcmp oeq <2 x float> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vceq_f32(
+// CHECK-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = fcmp oeq <2 x float> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[SEXT_I]]
+//
uint32x2_t test_vceq_f32(float32x2_t v1, float32x2_t v2) {
return vceq_f32(v1, v2);
}
-// CHECK-LABEL: @test_vceq_f64(
-// CHECK: [[CMP_I:%.*]] = fcmp oeq <1 x double> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
-// CHECK: ret <1 x i64> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vceq_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = fcmp oeq <1 x double> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[SEXT_I]]
+//
uint64x1_t test_vceq_f64(float64x1_t a, float64x1_t b) {
return vceq_f64(a, b);
}
-// CHECK-LABEL: @test_vceq_u8(
-// CHECK: [[CMP_I:%.*]] = icmp eq <8 x i8> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
-// CHECK: ret <8 x i8> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vceq_u8(
+// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <8 x i8> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[SEXT_I]]
+//
uint8x8_t test_vceq_u8(uint8x8_t v1, uint8x8_t v2) {
return vceq_u8(v1, v2);
}
-// CHECK-LABEL: @test_vceq_u16(
-// CHECK: [[CMP_I:%.*]] = icmp eq <4 x i16> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vceq_u16(
+// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <4 x i16> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[SEXT_I]]
+//
uint16x4_t test_vceq_u16(uint16x4_t v1, uint16x4_t v2) {
return vceq_u16(v1, v2);
}
-// CHECK-LABEL: @test_vceq_u32(
-// CHECK: [[CMP_I:%.*]] = icmp eq <2 x i32> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vceq_u32(
+// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <2 x i32> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[SEXT_I]]
+//
uint32x2_t test_vceq_u32(uint32x2_t v1, uint32x2_t v2) {
return vceq_u32(v1, v2);
}
-// CHECK-LABEL: @test_vceq_p8(
-// CHECK: [[CMP_I:%.*]] = icmp eq <8 x i8> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
-// CHECK: ret <8 x i8> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vceq_p8(
+// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <8 x i8> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[SEXT_I]]
+//
uint8x8_t test_vceq_p8(poly8x8_t v1, poly8x8_t v2) {
return vceq_p8(v1, v2);
}
-// CHECK-LABEL: @test_vceqq_s8(
-// CHECK: [[CMP_I:%.*]] = icmp eq <16 x i8> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
-// CHECK: ret <16 x i8> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vceqq_s8(
+// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <16 x i8> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[SEXT_I]]
+//
uint8x16_t test_vceqq_s8(int8x16_t v1, int8x16_t v2) {
return vceqq_s8(v1, v2);
}
-// CHECK-LABEL: @test_vceqq_s16(
-// CHECK: [[CMP_I:%.*]] = icmp eq <8 x i16> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vceqq_s16(
+// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <8 x i16> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[SEXT_I]]
+//
uint16x8_t test_vceqq_s16(int16x8_t v1, int16x8_t v2) {
return vceqq_s16(v1, v2);
}
-// CHECK-LABEL: @test_vceqq_s32(
-// CHECK: [[CMP_I:%.*]] = icmp eq <4 x i32> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vceqq_s32(
+// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <4 x i32> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SEXT_I]]
+//
uint32x4_t test_vceqq_s32(int32x4_t v1, int32x4_t v2) {
return vceqq_s32(v1, v2);
}
-// CHECK-LABEL: @test_vceqq_f32(
-// CHECK: [[CMP_I:%.*]] = fcmp oeq <4 x float> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vceqq_f32(
+// CHECK-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = fcmp oeq <4 x float> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SEXT_I]]
+//
uint32x4_t test_vceqq_f32(float32x4_t v1, float32x4_t v2) {
return vceqq_f32(v1, v2);
}
-// CHECK-LABEL: @test_vceqq_u8(
-// CHECK: [[CMP_I:%.*]] = icmp eq <16 x i8> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
-// CHECK: ret <16 x i8> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vceqq_u8(
+// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <16 x i8> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[SEXT_I]]
+//
uint8x16_t test_vceqq_u8(uint8x16_t v1, uint8x16_t v2) {
return vceqq_u8(v1, v2);
}
-// CHECK-LABEL: @test_vceqq_u16(
-// CHECK: [[CMP_I:%.*]] = icmp eq <8 x i16> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vceqq_u16(
+// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <8 x i16> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[SEXT_I]]
+//
uint16x8_t test_vceqq_u16(uint16x8_t v1, uint16x8_t v2) {
return vceqq_u16(v1, v2);
}
-// CHECK-LABEL: @test_vceqq_u32(
-// CHECK: [[CMP_I:%.*]] = icmp eq <4 x i32> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vceqq_u32(
+// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <4 x i32> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SEXT_I]]
+//
uint32x4_t test_vceqq_u32(uint32x4_t v1, uint32x4_t v2) {
return vceqq_u32(v1, v2);
}
-// CHECK-LABEL: @test_vceqq_p8(
-// CHECK: [[CMP_I:%.*]] = icmp eq <16 x i8> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
-// CHECK: ret <16 x i8> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vceqq_p8(
+// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <16 x i8> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[SEXT_I]]
+//
uint8x16_t test_vceqq_p8(poly8x16_t v1, poly8x16_t v2) {
return vceqq_p8(v1, v2);
}
-// CHECK-LABEL: @test_vceqq_s64(
-// CHECK: [[CMP_I:%.*]] = icmp eq <2 x i64> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
-// CHECK: ret <2 x i64> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vceqq_s64(
+// CHECK-SAME: <2 x i64> noundef [[V1:%.*]], <2 x i64> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <2 x i64> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[SEXT_I]]
+//
uint64x2_t test_vceqq_s64(int64x2_t v1, int64x2_t v2) {
return vceqq_s64(v1, v2);
}
-// CHECK-LABEL: @test_vceqq_u64(
-// CHECK: [[CMP_I:%.*]] = icmp eq <2 x i64> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
-// CHECK: ret <2 x i64> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vceqq_u64(
+// CHECK-SAME: <2 x i64> noundef [[V1:%.*]], <2 x i64> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <2 x i64> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[SEXT_I]]
+//
uint64x2_t test_vceqq_u64(uint64x2_t v1, uint64x2_t v2) {
return vceqq_u64(v1, v2);
}
-// CHECK-LABEL: @test_vceqq_f64(
-// CHECK: [[CMP_I:%.*]] = fcmp oeq <2 x double> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
-// CHECK: ret <2 x i64> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vceqq_f64(
+// CHECK-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = fcmp oeq <2 x double> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[SEXT_I]]
+//
uint64x2_t test_vceqq_f64(float64x2_t v1, float64x2_t v2) {
return vceqq_f64(v1, v2);
}
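
The vceq/vceqq forms all reduce to the same two instructions, an icmp/fcmp eq followed by a sign extension, so each lane yields 0 or -1. A scalar sketch of that widening (illustrative helper name):

#include <stdint.h>
#include <stdio.h>

/* Scalar model of icmp eq + sext: widen a boolean into a lane mask. */
static uint32_t vceq32_model(uint32_t a, uint32_t b) {
  return (uint32_t)-(int32_t)(a == b);   /* 0x00000000 or 0xffffffff */
}

int main(void) {
  printf("%08x %08x\n",
         vceq32_model(7, 7),    /* equal     -> ffffffff */
         vceq32_model(7, 8));   /* not equal -> 00000000 */
  return 0;
}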
-// CHECK-LABEL: @test_vcge_s8(
-// CHECK: [[CMP_I:%.*]] = icmp sge <8 x i8> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
-// CHECK: ret <8 x i8> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vcge_s8(
+// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp sge <8 x i8> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[SEXT_I]]
+//
uint8x8_t test_vcge_s8(int8x8_t v1, int8x8_t v2) {
return vcge_s8(v1, v2);
}
-// CHECK-LABEL: @test_vcge_s16(
-// CHECK: [[CMP_I:%.*]] = icmp sge <4 x i16> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vcge_s16(
+// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp sge <4 x i16> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[SEXT_I]]
+//
uint16x4_t test_vcge_s16(int16x4_t v1, int16x4_t v2) {
return vcge_s16(v1, v2);
}
-// CHECK-LABEL: @test_vcge_s32(
-// CHECK: [[CMP_I:%.*]] = icmp sge <2 x i32> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vcge_s32(
+// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp sge <2 x i32> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[SEXT_I]]
+//
uint32x2_t test_vcge_s32(int32x2_t v1, int32x2_t v2) {
return vcge_s32(v1, v2);
}
-// CHECK-LABEL: @test_vcge_s64(
-// CHECK: [[CMP_I:%.*]] = icmp sge <1 x i64> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
-// CHECK: ret <1 x i64> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vcge_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp sge <1 x i64> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[SEXT_I]]
+//
uint64x1_t test_vcge_s64(int64x1_t a, int64x1_t b) {
return vcge_s64(a, b);
}
-// CHECK-LABEL: @test_vcge_u64(
-// CHECK: [[CMP_I:%.*]] = icmp uge <1 x i64> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
-// CHECK: ret <1 x i64> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vcge_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp uge <1 x i64> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[SEXT_I]]
+//
uint64x1_t test_vcge_u64(uint64x1_t a, uint64x1_t b) {
return vcge_u64(a, b);
}
-// CHECK-LABEL: @test_vcge_f32(
-// CHECK: [[CMP_I:%.*]] = fcmp oge <2 x float> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vcge_f32(
+// CHECK-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = fcmp oge <2 x float> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[SEXT_I]]
+//
uint32x2_t test_vcge_f32(float32x2_t v1, float32x2_t v2) {
return vcge_f32(v1, v2);
}
-// CHECK-LABEL: @test_vcge_f64(
-// CHECK: [[CMP_I:%.*]] = fcmp oge <1 x double> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
-// CHECK: ret <1 x i64> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vcge_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = fcmp oge <1 x double> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[SEXT_I]]
+//
uint64x1_t test_vcge_f64(float64x1_t a, float64x1_t b) {
return vcge_f64(a, b);
}
-// CHECK-LABEL: @test_vcge_u8(
-// CHECK: [[CMP_I:%.*]] = icmp uge <8 x i8> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
-// CHECK: ret <8 x i8> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vcge_u8(
+// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp uge <8 x i8> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[SEXT_I]]
+//
uint8x8_t test_vcge_u8(uint8x8_t v1, uint8x8_t v2) {
return vcge_u8(v1, v2);
}
-// CHECK-LABEL: @test_vcge_u16(
-// CHECK: [[CMP_I:%.*]] = icmp uge <4 x i16> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vcge_u16(
+// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp uge <4 x i16> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[SEXT_I]]
+//
uint16x4_t test_vcge_u16(uint16x4_t v1, uint16x4_t v2) {
return vcge_u16(v1, v2);
}
-// CHECK-LABEL: @test_vcge_u32(
-// CHECK: [[CMP_I:%.*]] = icmp uge <2 x i32> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vcge_u32(
+// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp uge <2 x i32> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[SEXT_I]]
+//
uint32x2_t test_vcge_u32(uint32x2_t v1, uint32x2_t v2) {
return vcge_u32(v1, v2);
}
-// CHECK-LABEL: @test_vcgeq_s8(
-// CHECK: [[CMP_I:%.*]] = icmp sge <16 x i8> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
-// CHECK: ret <16 x i8> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vcgeq_s8(
+// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp sge <16 x i8> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[SEXT_I]]
+//
uint8x16_t test_vcgeq_s8(int8x16_t v1, int8x16_t v2) {
return vcgeq_s8(v1, v2);
}
-// CHECK-LABEL: @test_vcgeq_s16(
-// CHECK: [[CMP_I:%.*]] = icmp sge <8 x i16> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vcgeq_s16(
+// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp sge <8 x i16> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[SEXT_I]]
+//
uint16x8_t test_vcgeq_s16(int16x8_t v1, int16x8_t v2) {
return vcgeq_s16(v1, v2);
}
-// CHECK-LABEL: @test_vcgeq_s32(
-// CHECK: [[CMP_I:%.*]] = icmp sge <4 x i32> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vcgeq_s32(
+// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp sge <4 x i32> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SEXT_I]]
+//
uint32x4_t test_vcgeq_s32(int32x4_t v1, int32x4_t v2) {
return vcgeq_s32(v1, v2);
}
-// CHECK-LABEL: @test_vcgeq_f32(
-// CHECK: [[CMP_I:%.*]] = fcmp oge <4 x float> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vcgeq_f32(
+// CHECK-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = fcmp oge <4 x float> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SEXT_I]]
+//
uint32x4_t test_vcgeq_f32(float32x4_t v1, float32x4_t v2) {
return vcgeq_f32(v1, v2);
}
-// CHECK-LABEL: @test_vcgeq_u8(
-// CHECK: [[CMP_I:%.*]] = icmp uge <16 x i8> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
-// CHECK: ret <16 x i8> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vcgeq_u8(
+// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp uge <16 x i8> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[SEXT_I]]
+//
uint8x16_t test_vcgeq_u8(uint8x16_t v1, uint8x16_t v2) {
return vcgeq_u8(v1, v2);
}
-// CHECK-LABEL: @test_vcgeq_u16(
-// CHECK: [[CMP_I:%.*]] = icmp uge <8 x i16> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vcgeq_u16(
+// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp uge <8 x i16> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[SEXT_I]]
+//
uint16x8_t test_vcgeq_u16(uint16x8_t v1, uint16x8_t v2) {
return vcgeq_u16(v1, v2);
}
-// CHECK-LABEL: @test_vcgeq_u32(
-// CHECK: [[CMP_I:%.*]] = icmp uge <4 x i32> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vcgeq_u32(
+// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp uge <4 x i32> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SEXT_I]]
+//
uint32x4_t test_vcgeq_u32(uint32x4_t v1, uint32x4_t v2) {
return vcgeq_u32(v1, v2);
}
-// CHECK-LABEL: @test_vcgeq_s64(
-// CHECK: [[CMP_I:%.*]] = icmp sge <2 x i64> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
-// CHECK: ret <2 x i64> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vcgeq_s64(
+// CHECK-SAME: <2 x i64> noundef [[V1:%.*]], <2 x i64> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp sge <2 x i64> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[SEXT_I]]
+//
uint64x2_t test_vcgeq_s64(int64x2_t v1, int64x2_t v2) {
return vcgeq_s64(v1, v2);
}
-// CHECK-LABEL: @test_vcgeq_u64(
-// CHECK: [[CMP_I:%.*]] = icmp uge <2 x i64> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
-// CHECK: ret <2 x i64> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vcgeq_u64(
+// CHECK-SAME: <2 x i64> noundef [[V1:%.*]], <2 x i64> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp uge <2 x i64> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[SEXT_I]]
+//
uint64x2_t test_vcgeq_u64(uint64x2_t v1, uint64x2_t v2) {
return vcgeq_u64(v1, v2);
}
-// CHECK-LABEL: @test_vcgeq_f64(
-// CHECK: [[CMP_I:%.*]] = fcmp oge <2 x double> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
-// CHECK: ret <2 x i64> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vcgeq_f64(
+// CHECK-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = fcmp oge <2 x double> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[SEXT_I]]
+//
uint64x2_t test_vcgeq_f64(float64x2_t v1, float64x2_t v2) {
return vcgeq_f64(v1, v2);
}
-// CHECK-LABEL: @test_vcle_s8(
-// CHECK: [[CMP_I:%.*]] = icmp sle <8 x i8> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
-// CHECK: ret <8 x i8> [[SEXT_I]]
// Notes about vcle:
// The LE condition predicate is implemented as GE, so check for reversed operands.
// Using registers other than v0 and v1 is possible, but would be odd.
// (See the illustrative sketch after test_vcle_s8 below.)
+// CHECK-LABEL: define dso_local <8 x i8> @test_vcle_s8(
+// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp sle <8 x i8> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[SEXT_I]]
+//
uint8x8_t test_vcle_s8(int8x8_t v1, int8x8_t v2) {
return vcle_s8(v1, v2);
}
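// Illustrative sketch (ours, not part of the generated test; the helper name
// is hypothetical): the reversed-operand lowering noted above rests on the
// elementwise identity vcle_s8(a, b) == vcge_s8(b, a).
static inline uint8x8_t vcle_s8_via_vcge(int8x8_t a, int8x8_t b) {
  // a <= b holds exactly when b >= a, so a single CMGE with swapped
  // operands implements CMLE; both yield an all-ones/all-zeros lane mask.
  return vcge_s8(b, a);
}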
-// CHECK-LABEL: @test_vcle_s16(
-// CHECK: [[CMP_I:%.*]] = icmp sle <4 x i16> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vcle_s16(
+// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp sle <4 x i16> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[SEXT_I]]
+//
uint16x4_t test_vcle_s16(int16x4_t v1, int16x4_t v2) {
return vcle_s16(v1, v2);
}
-// CHECK-LABEL: @test_vcle_s32(
-// CHECK: [[CMP_I:%.*]] = icmp sle <2 x i32> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vcle_s32(
+// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp sle <2 x i32> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[SEXT_I]]
+//
uint32x2_t test_vcle_s32(int32x2_t v1, int32x2_t v2) {
return vcle_s32(v1, v2);
}
-// CHECK-LABEL: @test_vcle_s64(
-// CHECK: [[CMP_I:%.*]] = icmp sle <1 x i64> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
-// CHECK: ret <1 x i64> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vcle_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp sle <1 x i64> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[SEXT_I]]
+//
uint64x1_t test_vcle_s64(int64x1_t a, int64x1_t b) {
return vcle_s64(a, b);
}
-// CHECK-LABEL: @test_vcle_u64(
-// CHECK: [[CMP_I:%.*]] = icmp ule <1 x i64> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
-// CHECK: ret <1 x i64> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vcle_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp ule <1 x i64> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[SEXT_I]]
+//
uint64x1_t test_vcle_u64(uint64x1_t a, uint64x1_t b) {
return vcle_u64(a, b);
}
-// CHECK-LABEL: @test_vcle_f32(
-// CHECK: [[CMP_I:%.*]] = fcmp ole <2 x float> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vcle_f32(
+// CHECK-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = fcmp ole <2 x float> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[SEXT_I]]
+//
uint32x2_t test_vcle_f32(float32x2_t v1, float32x2_t v2) {
return vcle_f32(v1, v2);
}
-// CHECK-LABEL: @test_vcle_f64(
-// CHECK: [[CMP_I:%.*]] = fcmp ole <1 x double> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
-// CHECK: ret <1 x i64> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vcle_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = fcmp ole <1 x double> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[SEXT_I]]
+//
uint64x1_t test_vcle_f64(float64x1_t a, float64x1_t b) {
return vcle_f64(a, b);
}
-// CHECK-LABEL: @test_vcle_u8(
-// CHECK: [[CMP_I:%.*]] = icmp ule <8 x i8> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
-// CHECK: ret <8 x i8> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vcle_u8(
+// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp ule <8 x i8> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[SEXT_I]]
+//
uint8x8_t test_vcle_u8(uint8x8_t v1, uint8x8_t v2) {
return vcle_u8(v1, v2);
}
-// CHECK-LABEL: @test_vcle_u16(
-// CHECK: [[CMP_I:%.*]] = icmp ule <4 x i16> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vcle_u16(
+// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp ule <4 x i16> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[SEXT_I]]
+//
uint16x4_t test_vcle_u16(uint16x4_t v1, uint16x4_t v2) {
return vcle_u16(v1, v2);
}
-// CHECK-LABEL: @test_vcle_u32(
-// CHECK: [[CMP_I:%.*]] = icmp ule <2 x i32> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vcle_u32(
+// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp ule <2 x i32> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[SEXT_I]]
+//
uint32x2_t test_vcle_u32(uint32x2_t v1, uint32x2_t v2) {
return vcle_u32(v1, v2);
}
-// CHECK-LABEL: @test_vcleq_s8(
-// CHECK: [[CMP_I:%.*]] = icmp sle <16 x i8> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
-// CHECK: ret <16 x i8> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vcleq_s8(
+// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp sle <16 x i8> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[SEXT_I]]
+//
uint8x16_t test_vcleq_s8(int8x16_t v1, int8x16_t v2) {
return vcleq_s8(v1, v2);
}
-// CHECK-LABEL: @test_vcleq_s16(
-// CHECK: [[CMP_I:%.*]] = icmp sle <8 x i16> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vcleq_s16(
+// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp sle <8 x i16> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[SEXT_I]]
+//
uint16x8_t test_vcleq_s16(int16x8_t v1, int16x8_t v2) {
return vcleq_s16(v1, v2);
}
-// CHECK-LABEL: @test_vcleq_s32(
-// CHECK: [[CMP_I:%.*]] = icmp sle <4 x i32> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vcleq_s32(
+// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp sle <4 x i32> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SEXT_I]]
+//
uint32x4_t test_vcleq_s32(int32x4_t v1, int32x4_t v2) {
return vcleq_s32(v1, v2);
}
-// CHECK-LABEL: @test_vcleq_f32(
-// CHECK: [[CMP_I:%.*]] = fcmp ole <4 x float> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vcleq_f32(
+// CHECK-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = fcmp ole <4 x float> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SEXT_I]]
+//
uint32x4_t test_vcleq_f32(float32x4_t v1, float32x4_t v2) {
return vcleq_f32(v1, v2);
}
-// CHECK-LABEL: @test_vcleq_u8(
-// CHECK: [[CMP_I:%.*]] = icmp ule <16 x i8> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
-// CHECK: ret <16 x i8> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vcleq_u8(
+// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp ule <16 x i8> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[SEXT_I]]
+//
uint8x16_t test_vcleq_u8(uint8x16_t v1, uint8x16_t v2) {
return vcleq_u8(v1, v2);
}
-// CHECK-LABEL: @test_vcleq_u16(
-// CHECK: [[CMP_I:%.*]] = icmp ule <8 x i16> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vcleq_u16(
+// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp ule <8 x i16> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[SEXT_I]]
+//
uint16x8_t test_vcleq_u16(uint16x8_t v1, uint16x8_t v2) {
return vcleq_u16(v1, v2);
}
-// CHECK-LABEL: @test_vcleq_u32(
-// CHECK: [[CMP_I:%.*]] = icmp ule <4 x i32> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vcleq_u32(
+// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp ule <4 x i32> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SEXT_I]]
+//
uint32x4_t test_vcleq_u32(uint32x4_t v1, uint32x4_t v2) {
return vcleq_u32(v1, v2);
}
-// CHECK-LABEL: @test_vcleq_s64(
-// CHECK: [[CMP_I:%.*]] = icmp sle <2 x i64> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
-// CHECK: ret <2 x i64> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vcleq_s64(
+// CHECK-SAME: <2 x i64> noundef [[V1:%.*]], <2 x i64> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp sle <2 x i64> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[SEXT_I]]
+//
uint64x2_t test_vcleq_s64(int64x2_t v1, int64x2_t v2) {
return vcleq_s64(v1, v2);
}
-// CHECK-LABEL: @test_vcleq_u64(
-// CHECK: [[CMP_I:%.*]] = icmp ule <2 x i64> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
-// CHECK: ret <2 x i64> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vcleq_u64(
+// CHECK-SAME: <2 x i64> noundef [[V1:%.*]], <2 x i64> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp ule <2 x i64> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[SEXT_I]]
+//
uint64x2_t test_vcleq_u64(uint64x2_t v1, uint64x2_t v2) {
return vcleq_u64(v1, v2);
}
-// CHECK-LABEL: @test_vcleq_f64(
-// CHECK: [[CMP_I:%.*]] = fcmp ole <2 x double> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
-// CHECK: ret <2 x i64> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vcleq_f64(
+// CHECK-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = fcmp ole <2 x double> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[SEXT_I]]
+//
uint64x2_t test_vcleq_f64(float64x2_t v1, float64x2_t v2) {
return vcleq_f64(v1, v2);
}
-// CHECK-LABEL: @test_vcgt_s8(
-// CHECK: [[CMP_I:%.*]] = icmp sgt <8 x i8> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
-// CHECK: ret <8 x i8> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vcgt_s8(
+// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp sgt <8 x i8> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[SEXT_I]]
+//
uint8x8_t test_vcgt_s8(int8x8_t v1, int8x8_t v2) {
return vcgt_s8(v1, v2);
}
-// CHECK-LABEL: @test_vcgt_s16(
-// CHECK: [[CMP_I:%.*]] = icmp sgt <4 x i16> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vcgt_s16(
+// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp sgt <4 x i16> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[SEXT_I]]
+//
uint16x4_t test_vcgt_s16(int16x4_t v1, int16x4_t v2) {
return vcgt_s16(v1, v2);
}
-// CHECK-LABEL: @test_vcgt_s32(
-// CHECK: [[CMP_I:%.*]] = icmp sgt <2 x i32> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vcgt_s32(
+// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp sgt <2 x i32> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[SEXT_I]]
+//
uint32x2_t test_vcgt_s32(int32x2_t v1, int32x2_t v2) {
return vcgt_s32(v1, v2);
}
-// CHECK-LABEL: @test_vcgt_s64(
-// CHECK: [[CMP_I:%.*]] = icmp sgt <1 x i64> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
-// CHECK: ret <1 x i64> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vcgt_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp sgt <1 x i64> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[SEXT_I]]
+//
uint64x1_t test_vcgt_s64(int64x1_t a, int64x1_t b) {
return vcgt_s64(a, b);
}
-// CHECK-LABEL: @test_vcgt_u64(
-// CHECK: [[CMP_I:%.*]] = icmp ugt <1 x i64> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
-// CHECK: ret <1 x i64> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vcgt_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp ugt <1 x i64> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[SEXT_I]]
+//
uint64x1_t test_vcgt_u64(uint64x1_t a, uint64x1_t b) {
return vcgt_u64(a, b);
}
-// CHECK-LABEL: @test_vcgt_f32(
-// CHECK: [[CMP_I:%.*]] = fcmp ogt <2 x float> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vcgt_f32(
+// CHECK-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = fcmp ogt <2 x float> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[SEXT_I]]
+//
uint32x2_t test_vcgt_f32(float32x2_t v1, float32x2_t v2) {
return vcgt_f32(v1, v2);
}
-// CHECK-LABEL: @test_vcgt_f64(
-// CHECK: [[CMP_I:%.*]] = fcmp ogt <1 x double> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
-// CHECK: ret <1 x i64> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vcgt_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = fcmp ogt <1 x double> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[SEXT_I]]
+//
uint64x1_t test_vcgt_f64(float64x1_t a, float64x1_t b) {
return vcgt_f64(a, b);
}
-// CHECK-LABEL: @test_vcgt_u8(
-// CHECK: [[CMP_I:%.*]] = icmp ugt <8 x i8> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
-// CHECK: ret <8 x i8> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vcgt_u8(
+// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp ugt <8 x i8> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[SEXT_I]]
+//
uint8x8_t test_vcgt_u8(uint8x8_t v1, uint8x8_t v2) {
return vcgt_u8(v1, v2);
}
-// CHECK-LABEL: @test_vcgt_u16(
-// CHECK: [[CMP_I:%.*]] = icmp ugt <4 x i16> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vcgt_u16(
+// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp ugt <4 x i16> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[SEXT_I]]
+//
uint16x4_t test_vcgt_u16(uint16x4_t v1, uint16x4_t v2) {
return vcgt_u16(v1, v2);
}
-// CHECK-LABEL: @test_vcgt_u32(
-// CHECK: [[CMP_I:%.*]] = icmp ugt <2 x i32> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vcgt_u32(
+// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp ugt <2 x i32> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[SEXT_I]]
+//
uint32x2_t test_vcgt_u32(uint32x2_t v1, uint32x2_t v2) {
return vcgt_u32(v1, v2);
}
-// CHECK-LABEL: @test_vcgtq_s8(
-// CHECK: [[CMP_I:%.*]] = icmp sgt <16 x i8> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
-// CHECK: ret <16 x i8> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vcgtq_s8(
+// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp sgt <16 x i8> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[SEXT_I]]
+//
uint8x16_t test_vcgtq_s8(int8x16_t v1, int8x16_t v2) {
return vcgtq_s8(v1, v2);
}
-// CHECK-LABEL: @test_vcgtq_s16(
-// CHECK: [[CMP_I:%.*]] = icmp sgt <8 x i16> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vcgtq_s16(
+// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp sgt <8 x i16> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[SEXT_I]]
+//
uint16x8_t test_vcgtq_s16(int16x8_t v1, int16x8_t v2) {
return vcgtq_s16(v1, v2);
}
-// CHECK-LABEL: @test_vcgtq_s32(
-// CHECK: [[CMP_I:%.*]] = icmp sgt <4 x i32> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vcgtq_s32(
+// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp sgt <4 x i32> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SEXT_I]]
+//
uint32x4_t test_vcgtq_s32(int32x4_t v1, int32x4_t v2) {
return vcgtq_s32(v1, v2);
}
-// CHECK-LABEL: @test_vcgtq_f32(
-// CHECK: [[CMP_I:%.*]] = fcmp ogt <4 x float> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vcgtq_f32(
+// CHECK-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = fcmp ogt <4 x float> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SEXT_I]]
+//
uint32x4_t test_vcgtq_f32(float32x4_t v1, float32x4_t v2) {
return vcgtq_f32(v1, v2);
}
-// CHECK-LABEL: @test_vcgtq_u8(
-// CHECK: [[CMP_I:%.*]] = icmp ugt <16 x i8> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
-// CHECK: ret <16 x i8> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vcgtq_u8(
+// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp ugt <16 x i8> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[SEXT_I]]
+//
uint8x16_t test_vcgtq_u8(uint8x16_t v1, uint8x16_t v2) {
return vcgtq_u8(v1, v2);
}
-// CHECK-LABEL: @test_vcgtq_u16(
-// CHECK: [[CMP_I:%.*]] = icmp ugt <8 x i16> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vcgtq_u16(
+// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp ugt <8 x i16> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[SEXT_I]]
+//
uint16x8_t test_vcgtq_u16(uint16x8_t v1, uint16x8_t v2) {
return vcgtq_u16(v1, v2);
}
-// CHECK-LABEL: @test_vcgtq_u32(
-// CHECK: [[CMP_I:%.*]] = icmp ugt <4 x i32> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vcgtq_u32(
+// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp ugt <4 x i32> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SEXT_I]]
+//
uint32x4_t test_vcgtq_u32(uint32x4_t v1, uint32x4_t v2) {
return vcgtq_u32(v1, v2);
}
-// CHECK-LABEL: @test_vcgtq_s64(
-// CHECK: [[CMP_I:%.*]] = icmp sgt <2 x i64> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
-// CHECK: ret <2 x i64> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vcgtq_s64(
+// CHECK-SAME: <2 x i64> noundef [[V1:%.*]], <2 x i64> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp sgt <2 x i64> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[SEXT_I]]
+//
uint64x2_t test_vcgtq_s64(int64x2_t v1, int64x2_t v2) {
return vcgtq_s64(v1, v2);
}
-// CHECK-LABEL: @test_vcgtq_u64(
-// CHECK: [[CMP_I:%.*]] = icmp ugt <2 x i64> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
-// CHECK: ret <2 x i64> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vcgtq_u64(
+// CHECK-SAME: <2 x i64> noundef [[V1:%.*]], <2 x i64> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp ugt <2 x i64> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[SEXT_I]]
+//
uint64x2_t test_vcgtq_u64(uint64x2_t v1, uint64x2_t v2) {
return vcgtq_u64(v1, v2);
}
-// CHECK-LABEL: @test_vcgtq_f64(
-// CHECK: [[CMP_I:%.*]] = fcmp ogt <2 x double> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
-// CHECK: ret <2 x i64> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vcgtq_f64(
+// CHECK-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = fcmp ogt <2 x double> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[SEXT_I]]
+//
uint64x2_t test_vcgtq_f64(float64x2_t v1, float64x2_t v2) {
return vcgtq_f64(v1, v2);
}
-// CHECK-LABEL: @test_vclt_s8(
-// CHECK: [[CMP_I:%.*]] = icmp slt <8 x i8> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
-// CHECK: ret <8 x i8> [[SEXT_I]]
// Notes about vclt:
// The LT condition predicate is implemented as GT, so check for reversed operands.
// Using registers other than v0 and v1 is possible, but would be odd.
// (See the illustrative sketch after test_vclt_s8 below.)
+// CHECK-LABEL: define dso_local <8 x i8> @test_vclt_s8(
+// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp slt <8 x i8> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[SEXT_I]]
+//
uint8x8_t test_vclt_s8(int8x8_t v1, int8x8_t v2) {
return vclt_s8(v1, v2);
}
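// Illustrative sketch (ours, not part of the generated test; the helper name
// is hypothetical): as with vcle, the reversed-operand lowering rests on the
// elementwise identity vclt_s8(a, b) == vcgt_s8(b, a).
static inline uint8x8_t vclt_s8_via_vcgt(int8x8_t a, int8x8_t b) {
  // a < b holds exactly when b > a, so a single CMGT with swapped
  // operands implements CMLT; the result is an all-ones/all-zeros lane mask.
  return vcgt_s8(b, a);
}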
-// CHECK-LABEL: @test_vclt_s16(
-// CHECK: [[CMP_I:%.*]] = icmp slt <4 x i16> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vclt_s16(
+// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp slt <4 x i16> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[SEXT_I]]
+//
uint16x4_t test_vclt_s16(int16x4_t v1, int16x4_t v2) {
return vclt_s16(v1, v2);
}
-// CHECK-LABEL: @test_vclt_s32(
-// CHECK: [[CMP_I:%.*]] = icmp slt <2 x i32> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vclt_s32(
+// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp slt <2 x i32> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[SEXT_I]]
+//
uint32x2_t test_vclt_s32(int32x2_t v1, int32x2_t v2) {
return vclt_s32(v1, v2);
}
-// CHECK-LABEL: @test_vclt_s64(
-// CHECK: [[CMP_I:%.*]] = icmp slt <1 x i64> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
-// CHECK: ret <1 x i64> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vclt_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp slt <1 x i64> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[SEXT_I]]
+//
uint64x1_t test_vclt_s64(int64x1_t a, int64x1_t b) {
return vclt_s64(a, b);
}
-// CHECK-LABEL: @test_vclt_u64(
-// CHECK: [[CMP_I:%.*]] = icmp ult <1 x i64> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
-// CHECK: ret <1 x i64> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vclt_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp ult <1 x i64> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[SEXT_I]]
+//
uint64x1_t test_vclt_u64(uint64x1_t a, uint64x1_t b) {
return vclt_u64(a, b);
}
-// CHECK-LABEL: @test_vclt_f32(
-// CHECK: [[CMP_I:%.*]] = fcmp olt <2 x float> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vclt_f32(
+// CHECK-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = fcmp olt <2 x float> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[SEXT_I]]
+//
uint32x2_t test_vclt_f32(float32x2_t v1, float32x2_t v2) {
return vclt_f32(v1, v2);
}
-// CHECK-LABEL: @test_vclt_f64(
-// CHECK: [[CMP_I:%.*]] = fcmp olt <1 x double> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
-// CHECK: ret <1 x i64> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vclt_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = fcmp olt <1 x double> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[SEXT_I]]
+//
uint64x1_t test_vclt_f64(float64x1_t a, float64x1_t b) {
return vclt_f64(a, b);
}
-// CHECK-LABEL: @test_vclt_u8(
-// CHECK: [[CMP_I:%.*]] = icmp ult <8 x i8> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
-// CHECK: ret <8 x i8> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vclt_u8(
+// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp ult <8 x i8> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[SEXT_I]]
+//
uint8x8_t test_vclt_u8(uint8x8_t v1, uint8x8_t v2) {
return vclt_u8(v1, v2);
}
-// CHECK-LABEL: @test_vclt_u16(
-// CHECK: [[CMP_I:%.*]] = icmp ult <4 x i16> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vclt_u16(
+// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp ult <4 x i16> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[SEXT_I]]
+//
uint16x4_t test_vclt_u16(uint16x4_t v1, uint16x4_t v2) {
return vclt_u16(v1, v2);
}
-// CHECK-LABEL: @test_vclt_u32(
-// CHECK: [[CMP_I:%.*]] = icmp ult <2 x i32> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vclt_u32(
+// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp ult <2 x i32> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[SEXT_I]]
+//
uint32x2_t test_vclt_u32(uint32x2_t v1, uint32x2_t v2) {
return vclt_u32(v1, v2);
}
-// CHECK-LABEL: @test_vcltq_s8(
-// CHECK: [[CMP_I:%.*]] = icmp slt <16 x i8> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
-// CHECK: ret <16 x i8> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vcltq_s8(
+// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp slt <16 x i8> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[SEXT_I]]
+//
uint8x16_t test_vcltq_s8(int8x16_t v1, int8x16_t v2) {
return vcltq_s8(v1, v2);
}
-// CHECK-LABEL: @test_vcltq_s16(
-// CHECK: [[CMP_I:%.*]] = icmp slt <8 x i16> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vcltq_s16(
+// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp slt <8 x i16> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[SEXT_I]]
+//
uint16x8_t test_vcltq_s16(int16x8_t v1, int16x8_t v2) {
return vcltq_s16(v1, v2);
}
-// CHECK-LABEL: @test_vcltq_s32(
-// CHECK: [[CMP_I:%.*]] = icmp slt <4 x i32> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vcltq_s32(
+// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp slt <4 x i32> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SEXT_I]]
+//
uint32x4_t test_vcltq_s32(int32x4_t v1, int32x4_t v2) {
return vcltq_s32(v1, v2);
}
-// CHECK-LABEL: @test_vcltq_f32(
-// CHECK: [[CMP_I:%.*]] = fcmp olt <4 x float> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vcltq_f32(
+// CHECK-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = fcmp olt <4 x float> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SEXT_I]]
+//
uint32x4_t test_vcltq_f32(float32x4_t v1, float32x4_t v2) {
return vcltq_f32(v1, v2);
}
-// CHECK-LABEL: @test_vcltq_u8(
-// CHECK: [[CMP_I:%.*]] = icmp ult <16 x i8> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
-// CHECK: ret <16 x i8> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vcltq_u8(
+// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp ult <16 x i8> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[SEXT_I]]
+//
uint8x16_t test_vcltq_u8(uint8x16_t v1, uint8x16_t v2) {
return vcltq_u8(v1, v2);
}
-// CHECK-LABEL: @test_vcltq_u16(
-// CHECK: [[CMP_I:%.*]] = icmp ult <8 x i16> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vcltq_u16(
+// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp ult <8 x i16> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[SEXT_I]]
+//
uint16x8_t test_vcltq_u16(uint16x8_t v1, uint16x8_t v2) {
return vcltq_u16(v1, v2);
}
-// CHECK-LABEL: @test_vcltq_u32(
-// CHECK: [[CMP_I:%.*]] = icmp ult <4 x i32> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vcltq_u32(
+// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp ult <4 x i32> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SEXT_I]]
+//
uint32x4_t test_vcltq_u32(uint32x4_t v1, uint32x4_t v2) {
return vcltq_u32(v1, v2);
}
-// CHECK-LABEL: @test_vcltq_s64(
-// CHECK: [[CMP_I:%.*]] = icmp slt <2 x i64> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
-// CHECK: ret <2 x i64> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vcltq_s64(
+// CHECK-SAME: <2 x i64> noundef [[V1:%.*]], <2 x i64> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp slt <2 x i64> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[SEXT_I]]
+//
uint64x2_t test_vcltq_s64(int64x2_t v1, int64x2_t v2) {
return vcltq_s64(v1, v2);
}
-// CHECK-LABEL: @test_vcltq_u64(
-// CHECK: [[CMP_I:%.*]] = icmp ult <2 x i64> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
-// CHECK: ret <2 x i64> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vcltq_u64(
+// CHECK-SAME: <2 x i64> noundef [[V1:%.*]], <2 x i64> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp ult <2 x i64> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[SEXT_I]]
+//
uint64x2_t test_vcltq_u64(uint64x2_t v1, uint64x2_t v2) {
return vcltq_u64(v1, v2);
}
-// CHECK-LABEL: @test_vcltq_f64(
-// CHECK: [[CMP_I:%.*]] = fcmp olt <2 x double> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
-// CHECK: ret <2 x i64> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vcltq_f64(
+// CHECK-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = fcmp olt <2 x double> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[SEXT_I]]
+//
uint64x2_t test_vcltq_f64(float64x2_t v1, float64x2_t v2) {
return vcltq_f64(v1, v2);
}
-// CHECK-LABEL: @test_vhadd_s8(
-// CHECK: [[VHADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.shadd.v8i8(<8 x i8> %v1, <8 x i8> %v2)
-// CHECK: ret <8 x i8> [[VHADD_V_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vhadd_s8(
+// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VHADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.shadd.v8i8(<8 x i8> [[V1]], <8 x i8> [[V2]])
+// CHECK-NEXT: ret <8 x i8> [[VHADD_V_I]]
+//
int8x8_t test_vhadd_s8(int8x8_t v1, int8x8_t v2) {
return vhadd_s8(v1, v2);
}
-// CHECK-LABEL: @test_vhadd_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
-// CHECK: [[VHADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.shadd.v4i16(<4 x i16> %v1, <4 x i16> %v2)
-// CHECK: [[VHADD_V3_I:%.*]] = bitcast <4 x i16> [[VHADD_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VHADD_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vhadd_s16(
+// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VHADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.shadd.v4i16(<4 x i16> [[V1]], <4 x i16> [[V2]])
+// CHECK-NEXT: ret <4 x i16> [[VHADD_V2_I]]
+//
int16x4_t test_vhadd_s16(int16x4_t v1, int16x4_t v2) {
return vhadd_s16(v1, v2);
}
-// CHECK-LABEL: @test_vhadd_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8>
-// CHECK: [[VHADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.shadd.v2i32(<2 x i32> %v1, <2 x i32> %v2)
-// CHECK: [[VHADD_V3_I:%.*]] = bitcast <2 x i32> [[VHADD_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VHADD_V2_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vhadd_s32(
+// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VHADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.shadd.v2i32(<2 x i32> [[V1]], <2 x i32> [[V2]])
+// CHECK-NEXT: ret <2 x i32> [[VHADD_V2_I]]
+//
int32x2_t test_vhadd_s32(int32x2_t v1, int32x2_t v2) {
return vhadd_s32(v1, v2);
}
-// CHECK-LABEL: @test_vhadd_u8(
-// CHECK: [[VHADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uhadd.v8i8(<8 x i8> %v1, <8 x i8> %v2)
-// CHECK: ret <8 x i8> [[VHADD_V_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vhadd_u8(
+// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VHADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uhadd.v8i8(<8 x i8> [[V1]], <8 x i8> [[V2]])
+// CHECK-NEXT: ret <8 x i8> [[VHADD_V_I]]
+//
uint8x8_t test_vhadd_u8(uint8x8_t v1, uint8x8_t v2) {
return vhadd_u8(v1, v2);
}
-// CHECK-LABEL: @test_vhadd_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
-// CHECK: [[VHADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uhadd.v4i16(<4 x i16> %v1, <4 x i16> %v2)
-// CHECK: [[VHADD_V3_I:%.*]] = bitcast <4 x i16> [[VHADD_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VHADD_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vhadd_u16(
+// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VHADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uhadd.v4i16(<4 x i16> [[V1]], <4 x i16> [[V2]])
+// CHECK-NEXT: ret <4 x i16> [[VHADD_V2_I]]
+//
uint16x4_t test_vhadd_u16(uint16x4_t v1, uint16x4_t v2) {
return vhadd_u16(v1, v2);
}
-// CHECK-LABEL: @test_vhadd_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8>
-// CHECK: [[VHADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uhadd.v2i32(<2 x i32> %v1, <2 x i32> %v2)
-// CHECK: [[VHADD_V3_I:%.*]] = bitcast <2 x i32> [[VHADD_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VHADD_V2_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vhadd_u32(
+// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VHADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uhadd.v2i32(<2 x i32> [[V1]], <2 x i32> [[V2]])
+// CHECK-NEXT: ret <2 x i32> [[VHADD_V2_I]]
+//
uint32x2_t test_vhadd_u32(uint32x2_t v1, uint32x2_t v2) {
return vhadd_u32(v1, v2);
}
-// CHECK-LABEL: @test_vhaddq_s8(
-// CHECK: [[VHADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.shadd.v16i8(<16 x i8> %v1, <16 x i8> %v2)
-// CHECK: ret <16 x i8> [[VHADDQ_V_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vhaddq_s8(
+// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VHADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.shadd.v16i8(<16 x i8> [[V1]], <16 x i8> [[V2]])
+// CHECK-NEXT: ret <16 x i8> [[VHADDQ_V_I]]
+//
int8x16_t test_vhaddq_s8(int8x16_t v1, int8x16_t v2) {
return vhaddq_s8(v1, v2);
}
-// CHECK-LABEL: @test_vhaddq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
-// CHECK: [[VHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> %v1, <8 x i16> %v2)
-// CHECK: [[VHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VHADDQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VHADDQ_V2_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vhaddq_s16(
+// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> [[V1]], <8 x i16> [[V2]])
+// CHECK-NEXT: ret <8 x i16> [[VHADDQ_V2_I]]
+//
int16x8_t test_vhaddq_s16(int16x8_t v1, int16x8_t v2) {
return vhaddq_s16(v1, v2);
}
-// CHECK-LABEL: @test_vhaddq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8>
-// CHECK: [[VHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.shadd.v4i32(<4 x i32> %v1, <4 x i32> %v2)
-// CHECK: [[VHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VHADDQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VHADDQ_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vhaddq_s32(
+// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.shadd.v4i32(<4 x i32> [[V1]], <4 x i32> [[V2]])
+// CHECK-NEXT: ret <4 x i32> [[VHADDQ_V2_I]]
+//
int32x4_t test_vhaddq_s32(int32x4_t v1, int32x4_t v2) {
return vhaddq_s32(v1, v2);
}
-// CHECK-LABEL: @test_vhaddq_u8(
-// CHECK: [[VHADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uhadd.v16i8(<16 x i8> %v1, <16 x i8> %v2)
-// CHECK: ret <16 x i8> [[VHADDQ_V_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vhaddq_u8(
+// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VHADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uhadd.v16i8(<16 x i8> [[V1]], <16 x i8> [[V2]])
+// CHECK-NEXT: ret <16 x i8> [[VHADDQ_V_I]]
+//
uint8x16_t test_vhaddq_u8(uint8x16_t v1, uint8x16_t v2) {
return vhaddq_u8(v1, v2);
}
-// CHECK-LABEL: @test_vhaddq_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
-// CHECK: [[VHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> %v1, <8 x i16> %v2)
-// CHECK: [[VHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VHADDQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VHADDQ_V2_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vhaddq_u16(
+// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> [[V1]], <8 x i16> [[V2]])
+// CHECK-NEXT: ret <8 x i16> [[VHADDQ_V2_I]]
+//
uint16x8_t test_vhaddq_u16(uint16x8_t v1, uint16x8_t v2) {
return vhaddq_u16(v1, v2);
}
-// CHECK-LABEL: @test_vhaddq_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8>
-// CHECK: [[VHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uhadd.v4i32(<4 x i32> %v1, <4 x i32> %v2)
-// CHECK: [[VHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VHADDQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VHADDQ_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vhaddq_u32(
+// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uhadd.v4i32(<4 x i32> [[V1]], <4 x i32> [[V2]])
+// CHECK-NEXT: ret <4 x i32> [[VHADDQ_V2_I]]
+//
uint32x4_t test_vhaddq_u32(uint32x4_t v1, uint32x4_t v2) {
return vhaddq_u32(v1, v2);
}
-// CHECK-LABEL: @test_vhsub_s8(
-// CHECK: [[VHSUB_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.shsub.v8i8(<8 x i8> %v1, <8 x i8> %v2)
-// CHECK: ret <8 x i8> [[VHSUB_V_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vhsub_s8(
+// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VHSUB_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.shsub.v8i8(<8 x i8> [[V1]], <8 x i8> [[V2]])
+// CHECK-NEXT: ret <8 x i8> [[VHSUB_V_I]]
+//
int8x8_t test_vhsub_s8(int8x8_t v1, int8x8_t v2) {
return vhsub_s8(v1, v2);
}
-// CHECK-LABEL: @test_vhsub_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
-// CHECK: [[VHSUB_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.shsub.v4i16(<4 x i16> %v1, <4 x i16> %v2)
-// CHECK: [[VHSUB_V3_I:%.*]] = bitcast <4 x i16> [[VHSUB_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VHSUB_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vhsub_s16(
+// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VHSUB_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.shsub.v4i16(<4 x i16> [[V1]], <4 x i16> [[V2]])
+// CHECK-NEXT: ret <4 x i16> [[VHSUB_V2_I]]
+//
int16x4_t test_vhsub_s16(int16x4_t v1, int16x4_t v2) {
return vhsub_s16(v1, v2);
}
-// CHECK-LABEL: @test_vhsub_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8>
-// CHECK: [[VHSUB_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.shsub.v2i32(<2 x i32> %v1, <2 x i32> %v2)
-// CHECK: [[VHSUB_V3_I:%.*]] = bitcast <2 x i32> [[VHSUB_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VHSUB_V2_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vhsub_s32(
+// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VHSUB_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.shsub.v2i32(<2 x i32> [[V1]], <2 x i32> [[V2]])
+// CHECK-NEXT: ret <2 x i32> [[VHSUB_V2_I]]
+//
int32x2_t test_vhsub_s32(int32x2_t v1, int32x2_t v2) {
return vhsub_s32(v1, v2);
}
-// CHECK-LABEL: @test_vhsub_u8(
-// CHECK: [[VHSUB_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uhsub.v8i8(<8 x i8> %v1, <8 x i8> %v2)
-// CHECK: ret <8 x i8> [[VHSUB_V_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vhsub_u8(
+// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VHSUB_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uhsub.v8i8(<8 x i8> [[V1]], <8 x i8> [[V2]])
+// CHECK-NEXT: ret <8 x i8> [[VHSUB_V_I]]
+//
uint8x8_t test_vhsub_u8(uint8x8_t v1, uint8x8_t v2) {
return vhsub_u8(v1, v2);
}
-// CHECK-LABEL: @test_vhsub_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
-// CHECK: [[VHSUB_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uhsub.v4i16(<4 x i16> %v1, <4 x i16> %v2)
-// CHECK: [[VHSUB_V3_I:%.*]] = bitcast <4 x i16> [[VHSUB_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VHSUB_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vhsub_u16(
+// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VHSUB_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uhsub.v4i16(<4 x i16> [[V1]], <4 x i16> [[V2]])
+// CHECK-NEXT: ret <4 x i16> [[VHSUB_V2_I]]
+//
uint16x4_t test_vhsub_u16(uint16x4_t v1, uint16x4_t v2) {
return vhsub_u16(v1, v2);
}
-// CHECK-LABEL: @test_vhsub_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8>
-// CHECK: [[VHSUB_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uhsub.v2i32(<2 x i32> %v1, <2 x i32> %v2)
-// CHECK: [[VHSUB_V3_I:%.*]] = bitcast <2 x i32> [[VHSUB_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VHSUB_V2_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vhsub_u32(
+// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VHSUB_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uhsub.v2i32(<2 x i32> [[V1]], <2 x i32> [[V2]])
+// CHECK-NEXT: ret <2 x i32> [[VHSUB_V2_I]]
+//
uint32x2_t test_vhsub_u32(uint32x2_t v1, uint32x2_t v2) {
return vhsub_u32(v1, v2);
}
-// CHECK-LABEL: @test_vhsubq_s8(
-// CHECK: [[VHSUBQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.shsub.v16i8(<16 x i8> %v1, <16 x i8> %v2)
-// CHECK: ret <16 x i8> [[VHSUBQ_V_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vhsubq_s8(
+// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VHSUBQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.shsub.v16i8(<16 x i8> [[V1]], <16 x i8> [[V2]])
+// CHECK-NEXT: ret <16 x i8> [[VHSUBQ_V_I]]
+//
int8x16_t test_vhsubq_s8(int8x16_t v1, int8x16_t v2) {
return vhsubq_s8(v1, v2);
}
-// CHECK-LABEL: @test_vhsubq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
-// CHECK: [[VHSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.shsub.v8i16(<8 x i16> %v1, <8 x i16> %v2)
-// CHECK: [[VHSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VHSUBQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VHSUBQ_V2_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vhsubq_s16(
+// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VHSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.shsub.v8i16(<8 x i16> [[V1]], <8 x i16> [[V2]])
+// CHECK-NEXT: ret <8 x i16> [[VHSUBQ_V2_I]]
+//
int16x8_t test_vhsubq_s16(int16x8_t v1, int16x8_t v2) {
return vhsubq_s16(v1, v2);
}
-// CHECK-LABEL: @test_vhsubq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8>
-// CHECK: [[VHSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.shsub.v4i32(<4 x i32> %v1, <4 x i32> %v2)
-// CHECK: [[VHSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VHSUBQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VHSUBQ_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vhsubq_s32(
+// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VHSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.shsub.v4i32(<4 x i32> [[V1]], <4 x i32> [[V2]])
+// CHECK-NEXT: ret <4 x i32> [[VHSUBQ_V2_I]]
+//
int32x4_t test_vhsubq_s32(int32x4_t v1, int32x4_t v2) {
return vhsubq_s32(v1, v2);
}
-// CHECK-LABEL: @test_vhsubq_u8(
-// CHECK: [[VHSUBQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uhsub.v16i8(<16 x i8> %v1, <16 x i8> %v2)
-// CHECK: ret <16 x i8> [[VHSUBQ_V_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vhsubq_u8(
+// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VHSUBQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uhsub.v16i8(<16 x i8> [[V1]], <16 x i8> [[V2]])
+// CHECK-NEXT: ret <16 x i8> [[VHSUBQ_V_I]]
+//
uint8x16_t test_vhsubq_u8(uint8x16_t v1, uint8x16_t v2) {
return vhsubq_u8(v1, v2);
}
-// CHECK-LABEL: @test_vhsubq_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
-// CHECK: [[VHSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uhsub.v8i16(<8 x i16> %v1, <8 x i16> %v2)
-// CHECK: [[VHSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VHSUBQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VHSUBQ_V2_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vhsubq_u16(
+// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VHSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uhsub.v8i16(<8 x i16> [[V1]], <8 x i16> [[V2]])
+// CHECK-NEXT: ret <8 x i16> [[VHSUBQ_V2_I]]
+//
uint16x8_t test_vhsubq_u16(uint16x8_t v1, uint16x8_t v2) {
return vhsubq_u16(v1, v2);
}
-// CHECK-LABEL: @test_vhsubq_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8>
-// CHECK: [[VHSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uhsub.v4i32(<4 x i32> %v1, <4 x i32> %v2)
-// CHECK: [[VHSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VHSUBQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VHSUBQ_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vhsubq_u32(
+// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VHSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uhsub.v4i32(<4 x i32> [[V1]], <4 x i32> [[V2]])
+// CHECK-NEXT: ret <4 x i32> [[VHSUBQ_V2_I]]
+//
uint32x4_t test_vhsubq_u32(uint32x4_t v1, uint32x4_t v2) {
return vhsubq_u32(v1, v2);
}
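
The shsub/uhsub intrinsics checked above compute the halving difference of each lane, (a - b) >> 1, with no rounding step. A minimal scalar sketch of one signed 16-bit lane follows; the helper name and the widening are illustrative assumptions, not part of the test file, and the sketch assumes the usual arithmetic right shift for signed values:

#include <stdint.h>

/* One lane of vhsub_s16 (illustrative, not from the patch): widen so the
   intermediate difference cannot overflow, then halve with an arithmetic
   shift. There is no rounding term, unlike the vrhadd family below. */
static int16_t hsub_s16_lane(int16_t a, int16_t b) {
  return (int16_t)(((int32_t)a - (int32_t)b) >> 1);
}
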
-// CHECK-LABEL: @test_vrhadd_s8(
-// CHECK: [[VRHADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.srhadd.v8i8(<8 x i8> %v1, <8 x i8> %v2)
-// CHECK: ret <8 x i8> [[VRHADD_V_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vrhadd_s8(
+// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRHADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.srhadd.v8i8(<8 x i8> [[V1]], <8 x i8> [[V2]])
+// CHECK-NEXT: ret <8 x i8> [[VRHADD_V_I]]
+//
int8x8_t test_vrhadd_s8(int8x8_t v1, int8x8_t v2) {
return vrhadd_s8(v1, v2);
}
-// CHECK-LABEL: @test_vrhadd_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
-// CHECK: [[VRHADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.srhadd.v4i16(<4 x i16> %v1, <4 x i16> %v2)
-// CHECK: [[VRHADD_V3_I:%.*]] = bitcast <4 x i16> [[VRHADD_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VRHADD_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vrhadd_s16(
+// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRHADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.srhadd.v4i16(<4 x i16> [[V1]], <4 x i16> [[V2]])
+// CHECK-NEXT: ret <4 x i16> [[VRHADD_V2_I]]
+//
int16x4_t test_vrhadd_s16(int16x4_t v1, int16x4_t v2) {
return vrhadd_s16(v1, v2);
}
-// CHECK-LABEL: @test_vrhadd_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8>
-// CHECK: [[VRHADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.srhadd.v2i32(<2 x i32> %v1, <2 x i32> %v2)
-// CHECK: [[VRHADD_V3_I:%.*]] = bitcast <2 x i32> [[VRHADD_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VRHADD_V2_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vrhadd_s32(
+// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRHADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.srhadd.v2i32(<2 x i32> [[V1]], <2 x i32> [[V2]])
+// CHECK-NEXT: ret <2 x i32> [[VRHADD_V2_I]]
+//
int32x2_t test_vrhadd_s32(int32x2_t v1, int32x2_t v2) {
return vrhadd_s32(v1, v2);
}
-// CHECK-LABEL: @test_vrhadd_u8(
-// CHECK: [[VRHADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.urhadd.v8i8(<8 x i8> %v1, <8 x i8> %v2)
-// CHECK: ret <8 x i8> [[VRHADD_V_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vrhadd_u8(
+// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRHADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.urhadd.v8i8(<8 x i8> [[V1]], <8 x i8> [[V2]])
+// CHECK-NEXT: ret <8 x i8> [[VRHADD_V_I]]
+//
uint8x8_t test_vrhadd_u8(uint8x8_t v1, uint8x8_t v2) {
return vrhadd_u8(v1, v2);
}
-// CHECK-LABEL: @test_vrhadd_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
-// CHECK: [[VRHADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.urhadd.v4i16(<4 x i16> %v1, <4 x i16> %v2)
-// CHECK: [[VRHADD_V3_I:%.*]] = bitcast <4 x i16> [[VRHADD_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VRHADD_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vrhadd_u16(
+// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRHADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.urhadd.v4i16(<4 x i16> [[V1]], <4 x i16> [[V2]])
+// CHECK-NEXT: ret <4 x i16> [[VRHADD_V2_I]]
+//
uint16x4_t test_vrhadd_u16(uint16x4_t v1, uint16x4_t v2) {
return vrhadd_u16(v1, v2);
}
-// CHECK-LABEL: @test_vrhadd_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8>
-// CHECK: [[VRHADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.urhadd.v2i32(<2 x i32> %v1, <2 x i32> %v2)
-// CHECK: [[VRHADD_V3_I:%.*]] = bitcast <2 x i32> [[VRHADD_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VRHADD_V2_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vrhadd_u32(
+// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRHADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.urhadd.v2i32(<2 x i32> [[V1]], <2 x i32> [[V2]])
+// CHECK-NEXT: ret <2 x i32> [[VRHADD_V2_I]]
+//
uint32x2_t test_vrhadd_u32(uint32x2_t v1, uint32x2_t v2) {
return vrhadd_u32(v1, v2);
}
-// CHECK-LABEL: @test_vrhaddq_s8(
-// CHECK: [[VRHADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.srhadd.v16i8(<16 x i8> %v1, <16 x i8> %v2)
-// CHECK: ret <16 x i8> [[VRHADDQ_V_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vrhaddq_s8(
+// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRHADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.srhadd.v16i8(<16 x i8> [[V1]], <16 x i8> [[V2]])
+// CHECK-NEXT: ret <16 x i8> [[VRHADDQ_V_I]]
+//
int8x16_t test_vrhaddq_s8(int8x16_t v1, int8x16_t v2) {
return vrhaddq_s8(v1, v2);
}
-// CHECK-LABEL: @test_vrhaddq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
-// CHECK: [[VRHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16> %v1, <8 x i16> %v2)
-// CHECK: [[VRHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VRHADDQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VRHADDQ_V2_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vrhaddq_s16(
+// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16> [[V1]], <8 x i16> [[V2]])
+// CHECK-NEXT: ret <8 x i16> [[VRHADDQ_V2_I]]
+//
int16x8_t test_vrhaddq_s16(int16x8_t v1, int16x8_t v2) {
return vrhaddq_s16(v1, v2);
}
-// CHECK-LABEL: @test_vrhaddq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8>
-// CHECK: [[VRHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.srhadd.v4i32(<4 x i32> %v1, <4 x i32> %v2)
-// CHECK: [[VRHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VRHADDQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VRHADDQ_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vrhaddq_s32(
+// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.srhadd.v4i32(<4 x i32> [[V1]], <4 x i32> [[V2]])
+// CHECK-NEXT: ret <4 x i32> [[VRHADDQ_V2_I]]
+//
int32x4_t test_vrhaddq_s32(int32x4_t v1, int32x4_t v2) {
return vrhaddq_s32(v1, v2);
}
-// CHECK-LABEL: @test_vrhaddq_u8(
-// CHECK: [[VRHADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.urhadd.v16i8(<16 x i8> %v1, <16 x i8> %v2)
-// CHECK: ret <16 x i8> [[VRHADDQ_V_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vrhaddq_u8(
+// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRHADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.urhadd.v16i8(<16 x i8> [[V1]], <16 x i8> [[V2]])
+// CHECK-NEXT: ret <16 x i8> [[VRHADDQ_V_I]]
+//
uint8x16_t test_vrhaddq_u8(uint8x16_t v1, uint8x16_t v2) {
return vrhaddq_u8(v1, v2);
}
-// CHECK-LABEL: @test_vrhaddq_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
-// CHECK: [[VRHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16> %v1, <8 x i16> %v2)
-// CHECK: [[VRHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VRHADDQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VRHADDQ_V2_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vrhaddq_u16(
+// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16> [[V1]], <8 x i16> [[V2]])
+// CHECK-NEXT: ret <8 x i16> [[VRHADDQ_V2_I]]
+//
uint16x8_t test_vrhaddq_u16(uint16x8_t v1, uint16x8_t v2) {
return vrhaddq_u16(v1, v2);
}
-// CHECK-LABEL: @test_vrhaddq_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8>
-// CHECK: [[VRHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.urhadd.v4i32(<4 x i32> %v1, <4 x i32> %v2)
-// CHECK: [[VRHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VRHADDQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VRHADDQ_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vrhaddq_u32(
+// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.urhadd.v4i32(<4 x i32> [[V1]], <4 x i32> [[V2]])
+// CHECK-NEXT: ret <4 x i32> [[VRHADDQ_V2_I]]
+//
uint32x4_t test_vrhaddq_u32(uint32x4_t v1, uint32x4_t v2) {
return vrhaddq_u32(v1, v2);
}
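
srhadd/urhadd differ from a plain halving add only in the rounding constant: each lane is (a + b + 1) >> 1. A scalar sketch of one signed 16-bit lane (hypothetical helper, same widening and shift assumptions as above):

#include <stdint.h>

/* One lane of vrhadd_s16 (illustrative): the +1 makes the halving round
   to nearest instead of truncating toward negative infinity. */
static int16_t rhadd_s16_lane(int16_t a, int16_t b) {
  return (int16_t)(((int32_t)a + (int32_t)b + 1) >> 1);
}
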
-// CHECK-LABEL: @test_vqadd_s8(
-// CHECK: [[VQADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqadd.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VQADD_V_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vqadd_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqadd.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VQADD_V_I]]
+//
int8x8_t test_vqadd_s8(int8x8_t a, int8x8_t b) {
return vqadd_s8(a, b);
}
-// CHECK-LABEL: @test_vqadd_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VQADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VQADD_V3_I:%.*]] = bitcast <4 x i16> [[VQADD_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VQADD_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vqadd_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: ret <4 x i16> [[VQADD_V2_I]]
+//
int16x4_t test_vqadd_s16(int16x4_t a, int16x4_t b) {
return vqadd_s16(a, b);
}
-// CHECK-LABEL: @test_vqadd_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VQADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VQADD_V3_I:%.*]] = bitcast <2 x i32> [[VQADD_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VQADD_V2_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vqadd_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: ret <2 x i32> [[VQADD_V2_I]]
+//
int32x2_t test_vqadd_s32(int32x2_t a, int32x2_t b) {
return vqadd_s32(a, b);
}
-// CHECK-LABEL: @test_vqadd_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VQADD_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqadd.v1i64(<1 x i64> %a, <1 x i64> %b)
-// CHECK: [[VQADD_V3_I:%.*]] = bitcast <1 x i64> [[VQADD_V2_I]] to <8 x i8>
-// CHECK: ret <1 x i64> [[VQADD_V2_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vqadd_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQADD_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqadd.v1i64(<1 x i64> [[A]], <1 x i64> [[B]])
+// CHECK-NEXT: ret <1 x i64> [[VQADD_V2_I]]
+//
int64x1_t test_vqadd_s64(int64x1_t a, int64x1_t b) {
return vqadd_s64(a, b);
}
-// CHECK-LABEL: @test_vqadd_u8(
-// CHECK: [[VQADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqadd.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VQADD_V_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vqadd_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqadd.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VQADD_V_I]]
+//
uint8x8_t test_vqadd_u8(uint8x8_t a, uint8x8_t b) {
return vqadd_u8(a, b);
}
-// CHECK-LABEL: @test_vqadd_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VQADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqadd.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VQADD_V3_I:%.*]] = bitcast <4 x i16> [[VQADD_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VQADD_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vqadd_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqadd.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: ret <4 x i16> [[VQADD_V2_I]]
+//
uint16x4_t test_vqadd_u16(uint16x4_t a, uint16x4_t b) {
return vqadd_u16(a, b);
}
-// CHECK-LABEL: @test_vqadd_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VQADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqadd.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VQADD_V3_I:%.*]] = bitcast <2 x i32> [[VQADD_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VQADD_V2_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vqadd_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqadd.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: ret <2 x i32> [[VQADD_V2_I]]
+//
uint32x2_t test_vqadd_u32(uint32x2_t a, uint32x2_t b) {
return vqadd_u32(a, b);
}
-// CHECK-LABEL: @test_vqadd_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VQADD_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqadd.v1i64(<1 x i64> %a, <1 x i64> %b)
-// CHECK: [[VQADD_V3_I:%.*]] = bitcast <1 x i64> [[VQADD_V2_I]] to <8 x i8>
-// CHECK: ret <1 x i64> [[VQADD_V2_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vqadd_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQADD_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqadd.v1i64(<1 x i64> [[A]], <1 x i64> [[B]])
+// CHECK-NEXT: ret <1 x i64> [[VQADD_V2_I]]
+//
uint64x1_t test_vqadd_u64(uint64x1_t a, uint64x1_t b) {
return vqadd_u64(a, b);
}
-// CHECK-LABEL: @test_vqaddq_s8(
-// CHECK: [[VQADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqadd.v16i8(<16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <16 x i8> [[VQADDQ_V_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vqaddq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqadd.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <16 x i8> [[VQADDQ_V_I]]
+//
int8x16_t test_vqaddq_s8(int8x16_t a, int8x16_t b) {
return vqaddq_s8(a, b);
}
-// CHECK-LABEL: @test_vqaddq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VQADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: [[VQADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VQADDQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VQADDQ_V2_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vqaddq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: ret <8 x i16> [[VQADDQ_V2_I]]
+//
int16x8_t test_vqaddq_s16(int16x8_t a, int16x8_t b) {
return vqaddq_s16(a, b);
}
-// CHECK-LABEL: @test_vqaddq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VQADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VQADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VQADDQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VQADDQ_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vqaddq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: ret <4 x i32> [[VQADDQ_V2_I]]
+//
int32x4_t test_vqaddq_s32(int32x4_t a, int32x4_t b) {
return vqaddq_s32(a, b);
}
-// CHECK-LABEL: @test_vqaddq_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VQADDQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> %b)
-// CHECK: [[VQADDQ_V3_I:%.*]] = bitcast <2 x i64> [[VQADDQ_V2_I]] to <16 x i8>
-// CHECK: ret <2 x i64> [[VQADDQ_V2_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vqaddq_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQADDQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[B]])
+// CHECK-NEXT: ret <2 x i64> [[VQADDQ_V2_I]]
+//
int64x2_t test_vqaddq_s64(int64x2_t a, int64x2_t b) {
return vqaddq_s64(a, b);
}
-// CHECK-LABEL: @test_vqaddq_u8(
-// CHECK: [[VQADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uqadd.v16i8(<16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <16 x i8> [[VQADDQ_V_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vqaddq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uqadd.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <16 x i8> [[VQADDQ_V_I]]
+//
uint8x16_t test_vqaddq_u8(uint8x16_t a, uint8x16_t b) {
return vqaddq_u8(a, b);
}
-// CHECK-LABEL: @test_vqaddq_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VQADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uqadd.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: [[VQADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VQADDQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VQADDQ_V2_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vqaddq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uqadd.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: ret <8 x i16> [[VQADDQ_V2_I]]
+//
uint16x8_t test_vqaddq_u16(uint16x8_t a, uint16x8_t b) {
return vqaddq_u16(a, b);
}
-// CHECK-LABEL: @test_vqaddq_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VQADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uqadd.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VQADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VQADDQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VQADDQ_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vqaddq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: ret <4 x i32> [[VQADDQ_V2_I]]
+//
uint32x4_t test_vqaddq_u32(uint32x4_t a, uint32x4_t b) {
return vqaddq_u32(a, b);
}
-// CHECK-LABEL: @test_vqaddq_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VQADDQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uqadd.v2i64(<2 x i64> %a, <2 x i64> %b)
-// CHECK: [[VQADDQ_V3_I:%.*]] = bitcast <2 x i64> [[VQADDQ_V2_I]] to <16 x i8>
-// CHECK: ret <2 x i64> [[VQADDQ_V2_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vqaddq_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQADDQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[B]])
+// CHECK-NEXT: ret <2 x i64> [[VQADDQ_V2_I]]
+//
uint64x2_t test_vqaddq_u64(uint64x2_t a, uint64x2_t b) {
return vqaddq_u64(a, b);
}
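
The sqadd/uqadd intrinsics saturate instead of wrapping: a result outside the element range is clamped to the nearest representable value. A scalar sketch of one signed 8-bit lane (helper name and widening are illustrative):

#include <stdint.h>

/* One lane of vqadd_s8 (illustrative): add in a wider type, then clamp
   to [INT8_MIN, INT8_MAX]. */
static int8_t qadd_s8_lane(int8_t a, int8_t b) {
  int16_t sum = (int16_t)a + (int16_t)b;
  if (sum > INT8_MAX) return INT8_MAX;
  if (sum < INT8_MIN) return INT8_MIN;
  return (int8_t)sum;
}

For example, qadd_s8_lane(100, 100) yields 127 rather than the wrapped -56.
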
-// CHECK-LABEL: @test_vqsub_s8(
-// CHECK: [[VQSUB_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqsub.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VQSUB_V_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vqsub_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSUB_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqsub.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VQSUB_V_I]]
+//
int8x8_t test_vqsub_s8(int8x8_t a, int8x8_t b) {
return vqsub_s8(a, b);
}
-// CHECK-LABEL: @test_vqsub_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VQSUB_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VQSUB_V3_I:%.*]] = bitcast <4 x i16> [[VQSUB_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VQSUB_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vqsub_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSUB_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: ret <4 x i16> [[VQSUB_V2_I]]
+//
int16x4_t test_vqsub_s16(int16x4_t a, int16x4_t b) {
return vqsub_s16(a, b);
}
-// CHECK-LABEL: @test_vqsub_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VQSUB_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VQSUB_V3_I:%.*]] = bitcast <2 x i32> [[VQSUB_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VQSUB_V2_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vqsub_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSUB_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: ret <2 x i32> [[VQSUB_V2_I]]
+//
int32x2_t test_vqsub_s32(int32x2_t a, int32x2_t b) {
return vqsub_s32(a, b);
}
-// CHECK-LABEL: @test_vqsub_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VQSUB_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqsub.v1i64(<1 x i64> %a, <1 x i64> %b)
-// CHECK: [[VQSUB_V3_I:%.*]] = bitcast <1 x i64> [[VQSUB_V2_I]] to <8 x i8>
-// CHECK: ret <1 x i64> [[VQSUB_V2_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vqsub_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSUB_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqsub.v1i64(<1 x i64> [[A]], <1 x i64> [[B]])
+// CHECK-NEXT: ret <1 x i64> [[VQSUB_V2_I]]
+//
int64x1_t test_vqsub_s64(int64x1_t a, int64x1_t b) {
return vqsub_s64(a, b);
}
-// CHECK-LABEL: @test_vqsub_u8(
-// CHECK: [[VQSUB_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqsub.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VQSUB_V_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vqsub_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSUB_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqsub.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VQSUB_V_I]]
+//
uint8x8_t test_vqsub_u8(uint8x8_t a, uint8x8_t b) {
return vqsub_u8(a, b);
}
-// CHECK-LABEL: @test_vqsub_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VQSUB_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqsub.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VQSUB_V3_I:%.*]] = bitcast <4 x i16> [[VQSUB_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VQSUB_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vqsub_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSUB_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqsub.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: ret <4 x i16> [[VQSUB_V2_I]]
+//
uint16x4_t test_vqsub_u16(uint16x4_t a, uint16x4_t b) {
return vqsub_u16(a, b);
}
-// CHECK-LABEL: @test_vqsub_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VQSUB_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqsub.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VQSUB_V3_I:%.*]] = bitcast <2 x i32> [[VQSUB_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VQSUB_V2_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vqsub_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSUB_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqsub.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: ret <2 x i32> [[VQSUB_V2_I]]
+//
uint32x2_t test_vqsub_u32(uint32x2_t a, uint32x2_t b) {
return vqsub_u32(a, b);
}
-// CHECK-LABEL: @test_vqsub_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VQSUB_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqsub.v1i64(<1 x i64> %a, <1 x i64> %b)
-// CHECK: [[VQSUB_V3_I:%.*]] = bitcast <1 x i64> [[VQSUB_V2_I]] to <8 x i8>
-// CHECK: ret <1 x i64> [[VQSUB_V2_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vqsub_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSUB_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqsub.v1i64(<1 x i64> [[A]], <1 x i64> [[B]])
+// CHECK-NEXT: ret <1 x i64> [[VQSUB_V2_I]]
+//
uint64x1_t test_vqsub_u64(uint64x1_t a, uint64x1_t b) {
return vqsub_u64(a, b);
}
-// CHECK-LABEL: @test_vqsubq_s8(
-// CHECK: [[VQSUBQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqsub.v16i8(<16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <16 x i8> [[VQSUBQ_V_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vqsubq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSUBQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqsub.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <16 x i8> [[VQSUBQ_V_I]]
+//
int8x16_t test_vqsubq_s8(int8x16_t a, int8x16_t b) {
return vqsubq_s8(a, b);
}
-// CHECK-LABEL: @test_vqsubq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VQSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: [[VQSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSUBQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VQSUBQ_V2_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vqsubq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: ret <8 x i16> [[VQSUBQ_V2_I]]
+//
int16x8_t test_vqsubq_s16(int16x8_t a, int16x8_t b) {
return vqsubq_s16(a, b);
}
-// CHECK-LABEL: @test_vqsubq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VQSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VQSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSUBQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VQSUBQ_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vqsubq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: ret <4 x i32> [[VQSUBQ_V2_I]]
+//
int32x4_t test_vqsubq_s32(int32x4_t a, int32x4_t b) {
return vqsubq_s32(a, b);
}
-// CHECK-LABEL: @test_vqsubq_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VQSUBQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> %b)
-// CHECK: [[VQSUBQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSUBQ_V2_I]] to <16 x i8>
-// CHECK: ret <2 x i64> [[VQSUBQ_V2_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vqsubq_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSUBQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[B]])
+// CHECK-NEXT: ret <2 x i64> [[VQSUBQ_V2_I]]
+//
int64x2_t test_vqsubq_s64(int64x2_t a, int64x2_t b) {
return vqsubq_s64(a, b);
}
-// CHECK-LABEL: @test_vqsubq_u8(
-// CHECK: [[VQSUBQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uqsub.v16i8(<16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <16 x i8> [[VQSUBQ_V_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vqsubq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSUBQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uqsub.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <16 x i8> [[VQSUBQ_V_I]]
+//
uint8x16_t test_vqsubq_u8(uint8x16_t a, uint8x16_t b) {
return vqsubq_u8(a, b);
}
-// CHECK-LABEL: @test_vqsubq_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VQSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uqsub.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: [[VQSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSUBQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VQSUBQ_V2_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vqsubq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uqsub.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: ret <8 x i16> [[VQSUBQ_V2_I]]
+//
uint16x8_t test_vqsubq_u16(uint16x8_t a, uint16x8_t b) {
return vqsubq_u16(a, b);
}
-// CHECK-LABEL: @test_vqsubq_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VQSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uqsub.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VQSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSUBQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VQSUBQ_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vqsubq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: ret <4 x i32> [[VQSUBQ_V2_I]]
+//
uint32x4_t test_vqsubq_u32(uint32x4_t a, uint32x4_t b) {
return vqsubq_u32(a, b);
}
-// CHECK-LABEL: @test_vqsubq_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VQSUBQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uqsub.v2i64(<2 x i64> %a, <2 x i64> %b)
-// CHECK: [[VQSUBQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSUBQ_V2_I]] to <16 x i8>
-// CHECK: ret <2 x i64> [[VQSUBQ_V2_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vqsubq_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSUBQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[B]])
+// CHECK-NEXT: ret <2 x i64> [[VQSUBQ_V2_I]]
+//
uint64x2_t test_vqsubq_u64(uint64x2_t a, uint64x2_t b) {
return vqsubq_u64(a, b);
}
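
sqsub/uqsub are the subtracting counterparts; in particular the unsigned form clamps at zero instead of wrapping around. One unsigned 8-bit lane (hypothetical helper, not part of the test file):

#include <stdint.h>

/* One lane of vqsub_u8 (illustrative): unsigned saturating subtract
   floors at 0 instead of wrapping modulo 256. */
static uint8_t qsub_u8_lane(uint8_t a, uint8_t b) {
  return a > b ? (uint8_t)(a - b) : 0;
}
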
-// CHECK-LABEL: @test_vshl_s8(
-// CHECK: [[VSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sshl.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VSHL_V_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vshl_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sshl.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VSHL_V_I]]
+//
int8x8_t test_vshl_s8(int8x8_t a, int8x8_t b) {
return vshl_s8(a, b);
}
-// CHECK-LABEL: @test_vshl_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sshl.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VSHL_V3_I:%.*]] = bitcast <4 x i16> [[VSHL_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VSHL_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vshl_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sshl.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: ret <4 x i16> [[VSHL_V2_I]]
+//
int16x4_t test_vshl_s16(int16x4_t a, int16x4_t b) {
return vshl_s16(a, b);
}
-// CHECK-LABEL: @test_vshl_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sshl.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VSHL_V3_I:%.*]] = bitcast <2 x i32> [[VSHL_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VSHL_V2_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vshl_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sshl.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: ret <2 x i32> [[VSHL_V2_I]]
+//
int32x2_t test_vshl_s32(int32x2_t a, int32x2_t b) {
return vshl_s32(a, b);
}
-// CHECK-LABEL: @test_vshl_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sshl.v1i64(<1 x i64> %a, <1 x i64> %b)
-// CHECK: [[VSHL_V3_I:%.*]] = bitcast <1 x i64> [[VSHL_V2_I]] to <8 x i8>
-// CHECK: ret <1 x i64> [[VSHL_V2_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vshl_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sshl.v1i64(<1 x i64> [[A]], <1 x i64> [[B]])
+// CHECK-NEXT: ret <1 x i64> [[VSHL_V2_I]]
+//
int64x1_t test_vshl_s64(int64x1_t a, int64x1_t b) {
return vshl_s64(a, b);
}
-// CHECK-LABEL: @test_vshl_u8(
-// CHECK: [[VSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.ushl.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VSHL_V_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vshl_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.ushl.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VSHL_V_I]]
+//
uint8x8_t test_vshl_u8(uint8x8_t a, int8x8_t b) {
return vshl_u8(a, b);
}
-// CHECK-LABEL: @test_vshl_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.ushl.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VSHL_V3_I:%.*]] = bitcast <4 x i16> [[VSHL_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VSHL_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vshl_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.ushl.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: ret <4 x i16> [[VSHL_V2_I]]
+//
uint16x4_t test_vshl_u16(uint16x4_t a, int16x4_t b) {
return vshl_u16(a, b);
}
-// CHECK-LABEL: @test_vshl_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.ushl.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VSHL_V3_I:%.*]] = bitcast <2 x i32> [[VSHL_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VSHL_V2_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vshl_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.ushl.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: ret <2 x i32> [[VSHL_V2_I]]
+//
uint32x2_t test_vshl_u32(uint32x2_t a, int32x2_t b) {
return vshl_u32(a, b);
}
-// CHECK-LABEL: @test_vshl_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.ushl.v1i64(<1 x i64> %a, <1 x i64> %b)
-// CHECK: [[VSHL_V3_I:%.*]] = bitcast <1 x i64> [[VSHL_V2_I]] to <8 x i8>
-// CHECK: ret <1 x i64> [[VSHL_V2_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vshl_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.ushl.v1i64(<1 x i64> [[A]], <1 x i64> [[B]])
+// CHECK-NEXT: ret <1 x i64> [[VSHL_V2_I]]
+//
uint64x1_t test_vshl_u64(uint64x1_t a, int64x1_t b) {
return vshl_u64(a, b);
}
-// CHECK-LABEL: @test_vshlq_s8(
-// CHECK: [[VSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sshl.v16i8(<16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <16 x i8> [[VSHLQ_V_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vshlq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sshl.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <16 x i8> [[VSHLQ_V_I]]
+//
int8x16_t test_vshlq_s8(int8x16_t a, int8x16_t b) {
return vshlq_s8(a, b);
}
-// CHECK-LABEL: @test_vshlq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sshl.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: [[VSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VSHLQ_V2_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vshlq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sshl.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: ret <8 x i16> [[VSHLQ_V2_I]]
+//
int16x8_t test_vshlq_s16(int16x8_t a, int16x8_t b) {
return vshlq_s16(a, b);
}
-// CHECK-LABEL: @test_vshlq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sshl.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VSHLQ_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vshlq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sshl.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: ret <4 x i32> [[VSHLQ_V2_I]]
+//
int32x4_t test_vshlq_s32(int32x4_t a, int32x4_t b) {
return vshlq_s32(a, b);
}
-// CHECK-LABEL: @test_vshlq_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sshl.v2i64(<2 x i64> %a, <2 x i64> %b)
-// CHECK: [[VSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <2 x i64> [[VSHLQ_V2_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vshlq_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sshl.v2i64(<2 x i64> [[A]], <2 x i64> [[B]])
+// CHECK-NEXT: ret <2 x i64> [[VSHLQ_V2_I]]
+//
int64x2_t test_vshlq_s64(int64x2_t a, int64x2_t b) {
return vshlq_s64(a, b);
}
-// CHECK-LABEL: @test_vshlq_u8(
-// CHECK: [[VSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.ushl.v16i8(<16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <16 x i8> [[VSHLQ_V_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vshlq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.ushl.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <16 x i8> [[VSHLQ_V_I]]
+//
uint8x16_t test_vshlq_u8(uint8x16_t a, int8x16_t b) {
return vshlq_u8(a, b);
}
-// CHECK-LABEL: @test_vshlq_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.ushl.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: [[VSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VSHLQ_V2_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vshlq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.ushl.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: ret <8 x i16> [[VSHLQ_V2_I]]
+//
uint16x8_t test_vshlq_u16(uint16x8_t a, int16x8_t b) {
return vshlq_u16(a, b);
}
-// CHECK-LABEL: @test_vshlq_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.ushl.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VSHLQ_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vshlq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.ushl.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: ret <4 x i32> [[VSHLQ_V2_I]]
+//
uint32x4_t test_vshlq_u32(uint32x4_t a, int32x4_t b) {
return vshlq_u32(a, b);
}
-// CHECK-LABEL: @test_vshlq_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.ushl.v2i64(<2 x i64> %a, <2 x i64> %b)
-// CHECK: [[VSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <2 x i64> [[VSHLQ_V2_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vshlq_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.ushl.v2i64(<2 x i64> [[A]], <2 x i64> [[B]])
+// CHECK-NEXT: ret <2 x i64> [[VSHLQ_V2_I]]
+//
uint64x2_t test_vshlq_u64(uint64x2_t a, int64x2_t b) {
return vshlq_u64(a, b);
}
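
Note the signatures above: even for the unsigned ushl variants the shift operand stays signed (e.g. test_vshl_u8 takes uint8x8_t data but int8x8_t shifts), because a negative per-lane amount means a right shift. A scalar sketch of one signed 8-bit lane; it is a simplification that assumes |b| < 8 and two's-complement behaviour, and sidesteps C's undefined left shift of negative values by shifting the unsigned representation:

#include <stdint.h>

/* One lane of vshl_s8 (illustrative): positive b shifts left, keeping the
   low 8 bits like the hardware; negative b is an arithmetic right shift. */
static int8_t shl_s8_lane(int8_t a, int8_t b) {
  if (b >= 0)
    return (int8_t)(uint8_t)((uint8_t)a << b);
  return (int8_t)(a >> -b);
}
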
-// CHECK-LABEL: @test_vqshl_s8(
-// CHECK: [[VQSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshl.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VQSHL_V_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vqshl_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshl.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VQSHL_V_I]]
+//
int8x8_t test_vqshl_s8(int8x8_t a, int8x8_t b) {
return vqshl_s8(a, b);
}
-// CHECK-LABEL: @test_vqshl_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VQSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshl.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VQSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQSHL_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VQSHL_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vqshl_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshl.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: ret <4 x i16> [[VQSHL_V2_I]]
+//
int16x4_t test_vqshl_s16(int16x4_t a, int16x4_t b) {
return vqshl_s16(a, b);
}
-// CHECK-LABEL: @test_vqshl_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VQSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshl.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VQSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQSHL_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VQSHL_V2_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vqshl_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshl.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: ret <2 x i32> [[VQSHL_V2_I]]
+//
int32x2_t test_vqshl_s32(int32x2_t a, int32x2_t b) {
return vqshl_s32(a, b);
}
-// CHECK-LABEL: @test_vqshl_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VQSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqshl.v1i64(<1 x i64> %a, <1 x i64> %b)
-// CHECK: [[VQSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQSHL_V2_I]] to <8 x i8>
-// CHECK: ret <1 x i64> [[VQSHL_V2_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vqshl_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqshl.v1i64(<1 x i64> [[A]], <1 x i64> [[B]])
+// CHECK-NEXT: ret <1 x i64> [[VQSHL_V2_I]]
+//
int64x1_t test_vqshl_s64(int64x1_t a, int64x1_t b) {
return vqshl_s64(a, b);
}
-// CHECK-LABEL: @test_vqshl_u8(
-// CHECK: [[VQSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshl.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VQSHL_V_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vqshl_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshl.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VQSHL_V_I]]
+//
uint8x8_t test_vqshl_u8(uint8x8_t a, int8x8_t b) {
return vqshl_u8(a, b);
}
-// CHECK-LABEL: @test_vqshl_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VQSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshl.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VQSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQSHL_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VQSHL_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vqshl_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshl.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: ret <4 x i16> [[VQSHL_V2_I]]
+//
uint16x4_t test_vqshl_u16(uint16x4_t a, int16x4_t b) {
return vqshl_u16(a, b);
}
-// CHECK-LABEL: @test_vqshl_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VQSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqshl.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VQSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQSHL_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VQSHL_V2_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vqshl_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqshl.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: ret <2 x i32> [[VQSHL_V2_I]]
+//
uint32x2_t test_vqshl_u32(uint32x2_t a, int32x2_t b) {
return vqshl_u32(a, b);
}
-// CHECK-LABEL: @test_vqshl_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VQSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqshl.v1i64(<1 x i64> %a, <1 x i64> %b)
-// CHECK: [[VQSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQSHL_V2_I]] to <8 x i8>
-// CHECK: ret <1 x i64> [[VQSHL_V2_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vqshl_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqshl.v1i64(<1 x i64> [[A]], <1 x i64> [[B]])
+// CHECK-NEXT: ret <1 x i64> [[VQSHL_V2_I]]
+//
uint64x1_t test_vqshl_u64(uint64x1_t a, int64x1_t b) {
return vqshl_u64(a, b);
}
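
sqshl/uqshl combine both behaviours: the shift amount is signed per lane as in vshl, and a left shift that would lose significant bits saturates as in vqadd. A sketch of one signed 8-bit lane, again assuming |b| < 8 so the widened value stays in range (helper is hypothetical):

#include <stdint.h>

/* One lane of vqshl_s8 (illustrative): widen, shift, then clamp. The
   shift is done by multiplication so a negative input is well-defined
   in C; negative amounts are a plain right shift, which cannot saturate. */
static int8_t qshl_s8_lane(int8_t a, int8_t b) {
  if (b < 0)
    return (int8_t)(a >> -b);
  int32_t wide = (int32_t)a * (int32_t)(1 << b);
  if (wide > INT8_MAX) return INT8_MAX;
  if (wide < INT8_MIN) return INT8_MIN;
  return (int8_t)wide;
}
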
-// CHECK-LABEL: @test_vqshlq_s8(
-// CHECK: [[VQSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqshl.v16i8(<16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <16 x i8> [[VQSHLQ_V_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vqshlq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqshl.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <16 x i8> [[VQSHLQ_V_I]]
+//
int8x16_t test_vqshlq_s8(int8x16_t a, int8x16_t b) {
return vqshlq_s8(a, b);
}
-// CHECK-LABEL: @test_vqshlq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VQSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqshl.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: [[VQSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VQSHLQ_V2_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vqshlq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqshl.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: ret <8 x i16> [[VQSHLQ_V2_I]]
+//
int16x8_t test_vqshlq_s16(int16x8_t a, int16x8_t b) {
return vqshlq_s16(a, b);
}
-// CHECK-LABEL: @test_vqshlq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VQSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqshl.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VQSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VQSHLQ_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vqshlq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqshl.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: ret <4 x i32> [[VQSHLQ_V2_I]]
+//
int32x4_t test_vqshlq_s32(int32x4_t a, int32x4_t b) {
return vqshlq_s32(a, b);
}
-// CHECK-LABEL: @test_vqshlq_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VQSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqshl.v2i64(<2 x i64> %a, <2 x i64> %b)
-// CHECK: [[VQSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <2 x i64> [[VQSHLQ_V2_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vqshlq_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqshl.v2i64(<2 x i64> [[A]], <2 x i64> [[B]])
+// CHECK-NEXT: ret <2 x i64> [[VQSHLQ_V2_I]]
+//
int64x2_t test_vqshlq_s64(int64x2_t a, int64x2_t b) {
return vqshlq_s64(a, b);
}
-// CHECK-LABEL: @test_vqshlq_u8(
-// CHECK: [[VQSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uqshl.v16i8(<16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <16 x i8> [[VQSHLQ_V_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vqshlq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uqshl.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <16 x i8> [[VQSHLQ_V_I]]
+//
uint8x16_t test_vqshlq_u8(uint8x16_t a, int8x16_t b) {
return vqshlq_u8(a, b);
}
-// CHECK-LABEL: @test_vqshlq_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VQSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uqshl.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: [[VQSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VQSHLQ_V2_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vqshlq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uqshl.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: ret <8 x i16> [[VQSHLQ_V2_I]]
+//
uint16x8_t test_vqshlq_u16(uint16x8_t a, int16x8_t b) {
return vqshlq_u16(a, b);
}
-// CHECK-LABEL: @test_vqshlq_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VQSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uqshl.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VQSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VQSHLQ_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vqshlq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uqshl.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: ret <4 x i32> [[VQSHLQ_V2_I]]
+//
uint32x4_t test_vqshlq_u32(uint32x4_t a, int32x4_t b) {
return vqshlq_u32(a, b);
}
-// CHECK-LABEL: @test_vqshlq_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VQSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uqshl.v2i64(<2 x i64> %a, <2 x i64> %b)
-// CHECK: [[VQSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <2 x i64> [[VQSHLQ_V2_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vqshlq_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uqshl.v2i64(<2 x i64> [[A]], <2 x i64> [[B]])
+// CHECK-NEXT: ret <2 x i64> [[VQSHLQ_V2_I]]
+//
uint64x2_t test_vqshlq_u64(uint64x2_t a, int64x2_t b) {
return vqshlq_u64(a, b);
}
-// CHECK-LABEL: @test_vrshl_s8(
-// CHECK: [[VRSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.srshl.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VRSHL_V_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vrshl_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.srshl.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VRSHL_V_I]]
+//
int8x8_t test_vrshl_s8(int8x8_t a, int8x8_t b) {
return vrshl_s8(a, b);
}
-// CHECK-LABEL: @test_vrshl_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VRSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.srshl.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VRSHL_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VRSHL_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vrshl_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.srshl.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: ret <4 x i16> [[VRSHL_V2_I]]
+//
int16x4_t test_vrshl_s16(int16x4_t a, int16x4_t b) {
return vrshl_s16(a, b);
}
-// CHECK-LABEL: @test_vrshl_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VRSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.srshl.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VRSHL_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VRSHL_V2_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vrshl_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.srshl.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: ret <2 x i32> [[VRSHL_V2_I]]
+//
int32x2_t test_vrshl_s32(int32x2_t a, int32x2_t b) {
return vrshl_s32(a, b);
}
-// CHECK-LABEL: @test_vrshl_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VRSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.srshl.v1i64(<1 x i64> %a, <1 x i64> %b)
-// CHECK: [[VRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VRSHL_V2_I]] to <8 x i8>
-// CHECK: ret <1 x i64> [[VRSHL_V2_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vrshl_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.srshl.v1i64(<1 x i64> [[A]], <1 x i64> [[B]])
+// CHECK-NEXT: ret <1 x i64> [[VRSHL_V2_I]]
+//
int64x1_t test_vrshl_s64(int64x1_t a, int64x1_t b) {
return vrshl_s64(a, b);
}
-// CHECK-LABEL: @test_vrshl_u8(
-// CHECK: [[VRSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.urshl.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VRSHL_V_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vrshl_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.urshl.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VRSHL_V_I]]
+//
uint8x8_t test_vrshl_u8(uint8x8_t a, int8x8_t b) {
return vrshl_u8(a, b);
}
-// CHECK-LABEL: @test_vrshl_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VRSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.urshl.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VRSHL_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VRSHL_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vrshl_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.urshl.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: ret <4 x i16> [[VRSHL_V2_I]]
+//
uint16x4_t test_vrshl_u16(uint16x4_t a, int16x4_t b) {
return vrshl_u16(a, b);
}
-// CHECK-LABEL: @test_vrshl_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VRSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.urshl.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VRSHL_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VRSHL_V2_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vrshl_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.urshl.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: ret <2 x i32> [[VRSHL_V2_I]]
+//
uint32x2_t test_vrshl_u32(uint32x2_t a, int32x2_t b) {
return vrshl_u32(a, b);
}
-// CHECK-LABEL: @test_vrshl_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VRSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.urshl.v1i64(<1 x i64> %a, <1 x i64> %b)
-// CHECK: [[VRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VRSHL_V2_I]] to <8 x i8>
-// CHECK: ret <1 x i64> [[VRSHL_V2_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vrshl_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.urshl.v1i64(<1 x i64> [[A]], <1 x i64> [[B]])
+// CHECK-NEXT: ret <1 x i64> [[VRSHL_V2_I]]
+//
uint64x1_t test_vrshl_u64(uint64x1_t a, int64x1_t b) {
return vrshl_u64(a, b);
}
-// CHECK-LABEL: @test_vrshlq_s8(
-// CHECK: [[VRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.srshl.v16i8(<16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <16 x i8> [[VRSHLQ_V_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vrshlq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.srshl.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <16 x i8> [[VRSHLQ_V_I]]
+//
int8x16_t test_vrshlq_s8(int8x16_t a, int8x16_t b) {
return vrshlq_s8(a, b);
}
-// CHECK-LABEL: @test_vrshlq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.srshl.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: [[VRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VRSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VRSHLQ_V2_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vrshlq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.srshl.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: ret <8 x i16> [[VRSHLQ_V2_I]]
+//
int16x8_t test_vrshlq_s16(int16x8_t a, int16x8_t b) {
return vrshlq_s16(a, b);
}
-// CHECK-LABEL: @test_vrshlq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.srshl.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VRSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VRSHLQ_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vrshlq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.srshl.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: ret <4 x i32> [[VRSHLQ_V2_I]]
+//
int32x4_t test_vrshlq_s32(int32x4_t a, int32x4_t b) {
return vrshlq_s32(a, b);
}
-// CHECK-LABEL: @test_vrshlq_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> %a, <2 x i64> %b)
-// CHECK: [[VRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VRSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <2 x i64> [[VRSHLQ_V2_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vrshlq_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> [[A]], <2 x i64> [[B]])
+// CHECK-NEXT: ret <2 x i64> [[VRSHLQ_V2_I]]
+//
int64x2_t test_vrshlq_s64(int64x2_t a, int64x2_t b) {
return vrshlq_s64(a, b);
}
-// CHECK-LABEL: @test_vrshlq_u8(
-// CHECK: [[VRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.urshl.v16i8(<16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <16 x i8> [[VRSHLQ_V_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vrshlq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.urshl.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <16 x i8> [[VRSHLQ_V_I]]
+//
uint8x16_t test_vrshlq_u8(uint8x16_t a, int8x16_t b) {
return vrshlq_u8(a, b);
}
-// CHECK-LABEL: @test_vrshlq_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.urshl.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: [[VRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VRSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VRSHLQ_V2_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vrshlq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.urshl.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: ret <8 x i16> [[VRSHLQ_V2_I]]
+//
uint16x8_t test_vrshlq_u16(uint16x8_t a, int16x8_t b) {
return vrshlq_u16(a, b);
}
-// CHECK-LABEL: @test_vrshlq_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.urshl.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VRSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VRSHLQ_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vrshlq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.urshl.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: ret <4 x i32> [[VRSHLQ_V2_I]]
+//
uint32x4_t test_vrshlq_u32(uint32x4_t a, int32x4_t b) {
return vrshlq_u32(a, b);
}
-// CHECK-LABEL: @test_vrshlq_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64> %a, <2 x i64> %b)
-// CHECK: [[VRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VRSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <2 x i64> [[VRSHLQ_V2_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vrshlq_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64> [[A]], <2 x i64> [[B]])
+// CHECK-NEXT: ret <2 x i64> [[VRSHLQ_V2_I]]
+//
uint64x2_t test_vrshlq_u64(uint64x2_t a, int64x2_t b) {
return vrshlq_u64(a, b);
}
-// CHECK-LABEL: @test_vqrshl_s8(
-// CHECK: [[VQRSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshl.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VQRSHL_V_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vqrshl_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshl.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VQRSHL_V_I]]
+//
int8x8_t test_vqrshl_s8(int8x8_t a, int8x8_t b) {
return vqrshl_s8(a, b);
}
-// CHECK-LABEL: @test_vqrshl_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VQRSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshl.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VQRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQRSHL_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VQRSHL_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vqrshl_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshl.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: ret <4 x i16> [[VQRSHL_V2_I]]
+//
int16x4_t test_vqrshl_s16(int16x4_t a, int16x4_t b) {
return vqrshl_s16(a, b);
}
-// CHECK-LABEL: @test_vqrshl_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VQRSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshl.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VQRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQRSHL_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VQRSHL_V2_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vqrshl_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshl.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: ret <2 x i32> [[VQRSHL_V2_I]]
+//
int32x2_t test_vqrshl_s32(int32x2_t a, int32x2_t b) {
return vqrshl_s32(a, b);
}
-// CHECK-LABEL: @test_vqrshl_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VQRSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqrshl.v1i64(<1 x i64> %a, <1 x i64> %b)
-// CHECK: [[VQRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQRSHL_V2_I]] to <8 x i8>
-// CHECK: ret <1 x i64> [[VQRSHL_V2_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vqrshl_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqrshl.v1i64(<1 x i64> [[A]], <1 x i64> [[B]])
+// CHECK-NEXT: ret <1 x i64> [[VQRSHL_V2_I]]
+//
int64x1_t test_vqrshl_s64(int64x1_t a, int64x1_t b) {
return vqrshl_s64(a, b);
}
-// CHECK-LABEL: @test_vqrshl_u8(
-// CHECK: [[VQRSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqrshl.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VQRSHL_V_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vqrshl_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqrshl.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VQRSHL_V_I]]
+//
uint8x8_t test_vqrshl_u8(uint8x8_t a, int8x8_t b) {
return vqrshl_u8(a, b);
}
-// CHECK-LABEL: @test_vqrshl_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VQRSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqrshl.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VQRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQRSHL_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VQRSHL_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vqrshl_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqrshl.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: ret <4 x i16> [[VQRSHL_V2_I]]
+//
uint16x4_t test_vqrshl_u16(uint16x4_t a, int16x4_t b) {
return vqrshl_u16(a, b);
}
-// CHECK-LABEL: @test_vqrshl_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VQRSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqrshl.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VQRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQRSHL_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VQRSHL_V2_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vqrshl_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqrshl.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: ret <2 x i32> [[VQRSHL_V2_I]]
+//
uint32x2_t test_vqrshl_u32(uint32x2_t a, int32x2_t b) {
return vqrshl_u32(a, b);
}
-// CHECK-LABEL: @test_vqrshl_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VQRSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqrshl.v1i64(<1 x i64> %a, <1 x i64> %b)
-// CHECK: [[VQRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQRSHL_V2_I]] to <8 x i8>
-// CHECK: ret <1 x i64> [[VQRSHL_V2_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vqrshl_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqrshl.v1i64(<1 x i64> [[A]], <1 x i64> [[B]])
+// CHECK-NEXT: ret <1 x i64> [[VQRSHL_V2_I]]
+//
uint64x1_t test_vqrshl_u64(uint64x1_t a, int64x1_t b) {
return vqrshl_u64(a, b);
}
-// CHECK-LABEL: @test_vqrshlq_s8(
-// CHECK: [[VQRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqrshl.v16i8(<16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <16 x i8> [[VQRSHLQ_V_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vqrshlq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqrshl.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <16 x i8> [[VQRSHLQ_V_I]]
+//
int8x16_t test_vqrshlq_s8(int8x16_t a, int8x16_t b) {
return vqrshlq_s8(a, b);
}
-// CHECK-LABEL: @test_vqrshlq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrshl.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: [[VQRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VQRSHLQ_V2_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vqrshlq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrshl.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: ret <8 x i16> [[VQRSHLQ_V2_I]]
+//
int16x8_t test_vqrshlq_s16(int16x8_t a, int16x8_t b) {
return vqrshlq_s16(a, b);
}
-// CHECK-LABEL: @test_vqrshlq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrshl.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VQRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VQRSHLQ_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vqrshlq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrshl.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: ret <4 x i32> [[VQRSHLQ_V2_I]]
+//
int32x4_t test_vqrshlq_s32(int32x4_t a, int32x4_t b) {
return vqrshlq_s32(a, b);
}
-// CHECK-LABEL: @test_vqrshlq_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqrshl.v2i64(<2 x i64> %a, <2 x i64> %b)
-// CHECK: [[VQRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQRSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <2 x i64> [[VQRSHLQ_V2_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vqrshlq_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqrshl.v2i64(<2 x i64> [[A]], <2 x i64> [[B]])
+// CHECK-NEXT: ret <2 x i64> [[VQRSHLQ_V2_I]]
+//
int64x2_t test_vqrshlq_s64(int64x2_t a, int64x2_t b) {
return vqrshlq_s64(a, b);
}
-// CHECK-LABEL: @test_vqrshlq_u8(
-// CHECK: [[VQRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uqrshl.v16i8(<16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <16 x i8> [[VQRSHLQ_V_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vqrshlq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uqrshl.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <16 x i8> [[VQRSHLQ_V_I]]
+//
uint8x16_t test_vqrshlq_u8(uint8x16_t a, int8x16_t b) {
return vqrshlq_u8(a, b);
}
-// CHECK-LABEL: @test_vqrshlq_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uqrshl.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: [[VQRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VQRSHLQ_V2_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vqrshlq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uqrshl.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: ret <8 x i16> [[VQRSHLQ_V2_I]]
+//
uint16x8_t test_vqrshlq_u16(uint16x8_t a, int16x8_t b) {
return vqrshlq_u16(a, b);
}
-// CHECK-LABEL: @test_vqrshlq_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uqrshl.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VQRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VQRSHLQ_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vqrshlq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uqrshl.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: ret <4 x i32> [[VQRSHLQ_V2_I]]
+//
uint32x4_t test_vqrshlq_u32(uint32x4_t a, int32x4_t b) {
return vqrshlq_u32(a, b);
}
-// CHECK-LABEL: @test_vqrshlq_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uqrshl.v2i64(<2 x i64> %a, <2 x i64> %b)
-// CHECK: [[VQRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQRSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <2 x i64> [[VQRSHLQ_V2_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vqrshlq_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uqrshl.v2i64(<2 x i64> [[A]], <2 x i64> [[B]])
+// CHECK-NEXT: ret <2 x i64> [[VQRSHLQ_V2_I]]
+//
uint64x2_t test_vqrshlq_u64(uint64x2_t a, int64x2_t b) {
return vqrshlq_u64(a, b);
}
-// CHECK-LABEL: @test_vsli_n_p64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[VSLI_N2:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64> [[VSLI_N]], <1 x i64> [[VSLI_N1]], i32 0)
-// CHECK: ret <1 x i64> [[VSLI_N2]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vsli_n_p64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64> [[A]], <1 x i64> [[B]], i32 0)
+// CHECK-NEXT: ret <1 x i64> [[VSLI_N2]]
+//
poly64x1_t test_vsli_n_p64(poly64x1_t a, poly64x1_t b) {
return vsli_n_p64(a, b, 0);
}
-// CHECK-LABEL: @test_vsliq_n_p64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[VSLI_N2:%.*]] = call <2 x i64> @llvm.aarch64.neon.vsli.v2i64(<2 x i64> [[VSLI_N]], <2 x i64> [[VSLI_N1]], i32 0)
-// CHECK: ret <2 x i64> [[VSLI_N2]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vsliq_n_p64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <2 x i64> @llvm.aarch64.neon.vsli.v2i64(<2 x i64> [[A]], <2 x i64> [[B]], i32 0)
+// CHECK-NEXT: ret <2 x i64> [[VSLI_N2]]
+//
poly64x2_t test_vsliq_n_p64(poly64x2_t a, poly64x2_t b) {
return vsliq_n_p64(a, b, 0);
}
-// CHECK-LABEL: @test_vmax_s8(
-// CHECK: [[VMAX_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.smax.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VMAX_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vmax_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMAX_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.smax.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VMAX_I]]
+//
int8x8_t test_vmax_s8(int8x8_t a, int8x8_t b) {
return vmax_s8(a, b);
}
-// CHECK-LABEL: @test_vmax_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VMAX2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.smax.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: ret <4 x i16> [[VMAX2_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vmax_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMAX2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.smax.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: ret <4 x i16> [[VMAX2_I]]
+//
int16x4_t test_vmax_s16(int16x4_t a, int16x4_t b) {
return vmax_s16(a, b);
}
-// CHECK-LABEL: @test_vmax_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VMAX2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.smax.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: ret <2 x i32> [[VMAX2_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vmax_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMAX2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.smax.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: ret <2 x i32> [[VMAX2_I]]
+//
int32x2_t test_vmax_s32(int32x2_t a, int32x2_t b) {
return vmax_s32(a, b);
}
-// CHECK-LABEL: @test_vmax_u8(
-// CHECK: [[VMAX_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.umax.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VMAX_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vmax_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMAX_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.umax.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VMAX_I]]
+//
uint8x8_t test_vmax_u8(uint8x8_t a, uint8x8_t b) {
return vmax_u8(a, b);
}
-// CHECK-LABEL: @test_vmax_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VMAX2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.umax.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: ret <4 x i16> [[VMAX2_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vmax_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMAX2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.umax.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: ret <4 x i16> [[VMAX2_I]]
+//
uint16x4_t test_vmax_u16(uint16x4_t a, uint16x4_t b) {
return vmax_u16(a, b);
}
-// CHECK-LABEL: @test_vmax_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VMAX2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.umax.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: ret <2 x i32> [[VMAX2_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vmax_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMAX2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.umax.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: ret <2 x i32> [[VMAX2_I]]
+//
uint32x2_t test_vmax_u32(uint32x2_t a, uint32x2_t b) {
return vmax_u32(a, b);
}
-// CHECK-LABEL: @test_vmax_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
-// CHECK: [[VMAX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmax.v2f32(<2 x float> %a, <2 x float> %b)
-// CHECK: ret <2 x float> [[VMAX2_I]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vmax_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMAX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmax.v2f32(<2 x float> [[A]], <2 x float> [[B]])
+// CHECK-NEXT: ret <2 x float> [[VMAX2_I]]
+//
float32x2_t test_vmax_f32(float32x2_t a, float32x2_t b) {
return vmax_f32(a, b);
}
-// CHECK-LABEL: @test_vmaxq_s8(
-// CHECK: [[VMAX_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.smax.v16i8(<16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <16 x i8> [[VMAX_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vmaxq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMAX_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.smax.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <16 x i8> [[VMAX_I]]
+//
int8x16_t test_vmaxq_s8(int8x16_t a, int8x16_t b) {
return vmaxq_s8(a, b);
}
-// CHECK-LABEL: @test_vmaxq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VMAX2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smax.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: ret <8 x i16> [[VMAX2_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vmaxq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMAX2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smax.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: ret <8 x i16> [[VMAX2_I]]
+//
int16x8_t test_vmaxq_s16(int16x8_t a, int16x8_t b) {
return vmaxq_s16(a, b);
}
-// CHECK-LABEL: @test_vmaxq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VMAX2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smax.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: ret <4 x i32> [[VMAX2_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vmaxq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMAX2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smax.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: ret <4 x i32> [[VMAX2_I]]
+//
int32x4_t test_vmaxq_s32(int32x4_t a, int32x4_t b) {
return vmaxq_s32(a, b);
}
-// CHECK-LABEL: @test_vmaxq_u8(
-// CHECK: [[VMAX_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.umax.v16i8(<16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <16 x i8> [[VMAX_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vmaxq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMAX_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.umax.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <16 x i8> [[VMAX_I]]
+//
uint8x16_t test_vmaxq_u8(uint8x16_t a, uint8x16_t b) {
return vmaxq_u8(a, b);
}
-// CHECK-LABEL: @test_vmaxq_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VMAX2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umax.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: ret <8 x i16> [[VMAX2_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vmaxq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMAX2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umax.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: ret <8 x i16> [[VMAX2_I]]
+//
uint16x8_t test_vmaxq_u16(uint16x8_t a, uint16x8_t b) {
return vmaxq_u16(a, b);
}
-// CHECK-LABEL: @test_vmaxq_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VMAX2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umax.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: ret <4 x i32> [[VMAX2_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vmaxq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMAX2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umax.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: ret <4 x i32> [[VMAX2_I]]
+//
uint32x4_t test_vmaxq_u32(uint32x4_t a, uint32x4_t b) {
return vmaxq_u32(a, b);
}
-// CHECK-LABEL: @test_vmaxq_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
-// CHECK: [[VMAX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmax.v4f32(<4 x float> %a, <4 x float> %b)
-// CHECK: ret <4 x float> [[VMAX2_I]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vmaxq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMAX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmax.v4f32(<4 x float> [[A]], <4 x float> [[B]])
+// CHECK-NEXT: ret <4 x float> [[VMAX2_I]]
+//
float32x4_t test_vmaxq_f32(float32x4_t a, float32x4_t b) {
return vmaxq_f32(a, b);
}
-// CHECK-LABEL: @test_vmaxq_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
-// CHECK: [[VMAX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmax.v2f64(<2 x double> %a, <2 x double> %b)
-// CHECK: ret <2 x double> [[VMAX2_I]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vmaxq_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMAX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmax.v2f64(<2 x double> [[A]], <2 x double> [[B]])
+// CHECK-NEXT: ret <2 x double> [[VMAX2_I]]
+//
float64x2_t test_vmaxq_f64(float64x2_t a, float64x2_t b) {
return vmaxq_f64(a, b);
}
-// CHECK-LABEL: @test_vmin_s8(
-// CHECK: [[VMIN_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.smin.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VMIN_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vmin_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMIN_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.smin.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VMIN_I]]
+//
int8x8_t test_vmin_s8(int8x8_t a, int8x8_t b) {
return vmin_s8(a, b);
}
-// CHECK-LABEL: @test_vmin_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VMIN2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.smin.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: ret <4 x i16> [[VMIN2_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vmin_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMIN2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.smin.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: ret <4 x i16> [[VMIN2_I]]
+//
int16x4_t test_vmin_s16(int16x4_t a, int16x4_t b) {
return vmin_s16(a, b);
}
-// CHECK-LABEL: @test_vmin_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VMIN2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.smin.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: ret <2 x i32> [[VMIN2_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vmin_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMIN2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.smin.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: ret <2 x i32> [[VMIN2_I]]
+//
int32x2_t test_vmin_s32(int32x2_t a, int32x2_t b) {
return vmin_s32(a, b);
}
-// CHECK-LABEL: @test_vmin_u8(
-// CHECK: [[VMIN_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.umin.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VMIN_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vmin_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMIN_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.umin.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VMIN_I]]
+//
uint8x8_t test_vmin_u8(uint8x8_t a, uint8x8_t b) {
return vmin_u8(a, b);
}
-// CHECK-LABEL: @test_vmin_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VMIN2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.umin.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: ret <4 x i16> [[VMIN2_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vmin_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMIN2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.umin.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: ret <4 x i16> [[VMIN2_I]]
+//
uint16x4_t test_vmin_u16(uint16x4_t a, uint16x4_t b) {
return vmin_u16(a, b);
}
-// CHECK-LABEL: @test_vmin_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VMIN2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.umin.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: ret <2 x i32> [[VMIN2_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vmin_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMIN2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.umin.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: ret <2 x i32> [[VMIN2_I]]
+//
uint32x2_t test_vmin_u32(uint32x2_t a, uint32x2_t b) {
return vmin_u32(a, b);
}
-// CHECK-LABEL: @test_vmin_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
-// CHECK: [[VMIN2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmin.v2f32(<2 x float> %a, <2 x float> %b)
-// CHECK: ret <2 x float> [[VMIN2_I]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vmin_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMIN2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmin.v2f32(<2 x float> [[A]], <2 x float> [[B]])
+// CHECK-NEXT: ret <2 x float> [[VMIN2_I]]
+//
float32x2_t test_vmin_f32(float32x2_t a, float32x2_t b) {
return vmin_f32(a, b);
}
-// CHECK-LABEL: @test_vminq_s8(
-// CHECK: [[VMIN_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.smin.v16i8(<16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <16 x i8> [[VMIN_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vminq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMIN_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.smin.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <16 x i8> [[VMIN_I]]
+//
int8x16_t test_vminq_s8(int8x16_t a, int8x16_t b) {
return vminq_s8(a, b);
}
-// CHECK-LABEL: @test_vminq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VMIN2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smin.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: ret <8 x i16> [[VMIN2_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vminq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMIN2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smin.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: ret <8 x i16> [[VMIN2_I]]
+//
int16x8_t test_vminq_s16(int16x8_t a, int16x8_t b) {
return vminq_s16(a, b);
}
-// CHECK-LABEL: @test_vminq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VMIN2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smin.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: ret <4 x i32> [[VMIN2_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vminq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMIN2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smin.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: ret <4 x i32> [[VMIN2_I]]
+//
int32x4_t test_vminq_s32(int32x4_t a, int32x4_t b) {
return vminq_s32(a, b);
}
-// CHECK-LABEL: @test_vminq_u8(
-// CHECK: [[VMIN_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.umin.v16i8(<16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <16 x i8> [[VMIN_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vminq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMIN_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.umin.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <16 x i8> [[VMIN_I]]
+//
uint8x16_t test_vminq_u8(uint8x16_t a, uint8x16_t b) {
return vminq_u8(a, b);
}
-// CHECK-LABEL: @test_vminq_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VMIN2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umin.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: ret <8 x i16> [[VMIN2_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vminq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMIN2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umin.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: ret <8 x i16> [[VMIN2_I]]
+//
uint16x8_t test_vminq_u16(uint16x8_t a, uint16x8_t b) {
return vminq_u16(a, b);
}
-// CHECK-LABEL: @test_vminq_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VMIN2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umin.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: ret <4 x i32> [[VMIN2_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vminq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMIN2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umin.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: ret <4 x i32> [[VMIN2_I]]
+//
uint32x4_t test_vminq_u32(uint32x4_t a, uint32x4_t b) {
return vminq_u32(a, b);
}
-// CHECK-LABEL: @test_vminq_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
-// CHECK: [[VMIN2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmin.v4f32(<4 x float> %a, <4 x float> %b)
-// CHECK: ret <4 x float> [[VMIN2_I]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vminq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMIN2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmin.v4f32(<4 x float> [[A]], <4 x float> [[B]])
+// CHECK-NEXT: ret <4 x float> [[VMIN2_I]]
+//
float32x4_t test_vminq_f32(float32x4_t a, float32x4_t b) {
return vminq_f32(a, b);
}
-// CHECK-LABEL: @test_vminq_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
-// CHECK: [[VMIN2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmin.v2f64(<2 x double> %a, <2 x double> %b)
-// CHECK: ret <2 x double> [[VMIN2_I]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vminq_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMIN2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmin.v2f64(<2 x double> [[A]], <2 x double> [[B]])
+// CHECK-NEXT: ret <2 x double> [[VMIN2_I]]
+//
float64x2_t test_vminq_f64(float64x2_t a, float64x2_t b) {
return vminq_f64(a, b);
}
-// CHECK-LABEL: @test_vmaxnm_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
-// CHECK: [[VMAXNM2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmaxnm.v2f32(<2 x float> %a, <2 x float> %b)
-// CHECK: ret <2 x float> [[VMAXNM2_I]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vmaxnm_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMAXNM2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmaxnm.v2f32(<2 x float> [[A]], <2 x float> [[B]])
+// CHECK-NEXT: ret <2 x float> [[VMAXNM2_I]]
+//
float32x2_t test_vmaxnm_f32(float32x2_t a, float32x2_t b) {
return vmaxnm_f32(a, b);
}
-// CHECK-LABEL: @test_vmaxnmq_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
-// CHECK: [[VMAXNM2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmaxnm.v4f32(<4 x float> %a, <4 x float> %b)
-// CHECK: ret <4 x float> [[VMAXNM2_I]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vmaxnmq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMAXNM2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmaxnm.v4f32(<4 x float> [[A]], <4 x float> [[B]])
+// CHECK-NEXT: ret <4 x float> [[VMAXNM2_I]]
+//
float32x4_t test_vmaxnmq_f32(float32x4_t a, float32x4_t b) {
return vmaxnmq_f32(a, b);
}
-// CHECK-LABEL: @test_vmaxnmq_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
-// CHECK: [[VMAXNM2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmaxnm.v2f64(<2 x double> %a, <2 x double> %b)
-// CHECK: ret <2 x double> [[VMAXNM2_I]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vmaxnmq_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMAXNM2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmaxnm.v2f64(<2 x double> [[A]], <2 x double> [[B]])
+// CHECK-NEXT: ret <2 x double> [[VMAXNM2_I]]
+//
float64x2_t test_vmaxnmq_f64(float64x2_t a, float64x2_t b) {
return vmaxnmq_f64(a, b);
}
-// CHECK-LABEL: @test_vminnm_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
-// CHECK: [[VMINNM2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fminnm.v2f32(<2 x float> %a, <2 x float> %b)
-// CHECK: ret <2 x float> [[VMINNM2_I]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vminnm_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMINNM2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fminnm.v2f32(<2 x float> [[A]], <2 x float> [[B]])
+// CHECK-NEXT: ret <2 x float> [[VMINNM2_I]]
+//
float32x2_t test_vminnm_f32(float32x2_t a, float32x2_t b) {
return vminnm_f32(a, b);
}
-// CHECK-LABEL: @test_vminnmq_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
-// CHECK: [[VMINNM2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fminnm.v4f32(<4 x float> %a, <4 x float> %b)
-// CHECK: ret <4 x float> [[VMINNM2_I]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vminnmq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMINNM2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fminnm.v4f32(<4 x float> [[A]], <4 x float> [[B]])
+// CHECK-NEXT: ret <4 x float> [[VMINNM2_I]]
+//
float32x4_t test_vminnmq_f32(float32x4_t a, float32x4_t b) {
return vminnmq_f32(a, b);
}
-// CHECK-LABEL: @test_vminnmq_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
-// CHECK: [[VMINNM2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fminnm.v2f64(<2 x double> %a, <2 x double> %b)
-// CHECK: ret <2 x double> [[VMINNM2_I]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vminnmq_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMINNM2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fminnm.v2f64(<2 x double> [[A]], <2 x double> [[B]])
+// CHECK-NEXT: ret <2 x double> [[VMINNM2_I]]
+//
float64x2_t test_vminnmq_f64(float64x2_t a, float64x2_t b) {
return vminnmq_f64(a, b);
}
-// CHECK-LABEL: @test_vpmax_s8(
-// CHECK: [[VPMAX_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.smaxp.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VPMAX_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vpmax_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPMAX_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.smaxp.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VPMAX_I]]
+//
int8x8_t test_vpmax_s8(int8x8_t a, int8x8_t b) {
return vpmax_s8(a, b);
}
-// CHECK-LABEL: @test_vpmax_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VPMAX2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.smaxp.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: ret <4 x i16> [[VPMAX2_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vpmax_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPMAX2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.smaxp.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: ret <4 x i16> [[VPMAX2_I]]
+//
int16x4_t test_vpmax_s16(int16x4_t a, int16x4_t b) {
return vpmax_s16(a, b);
}
-// CHECK-LABEL: @test_vpmax_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VPMAX2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.smaxp.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: ret <2 x i32> [[VPMAX2_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vpmax_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPMAX2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.smaxp.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: ret <2 x i32> [[VPMAX2_I]]
+//
int32x2_t test_vpmax_s32(int32x2_t a, int32x2_t b) {
return vpmax_s32(a, b);
}
-// CHECK-LABEL: @test_vpmax_u8(
-// CHECK: [[VPMAX_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.umaxp.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VPMAX_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vpmax_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPMAX_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.umaxp.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VPMAX_I]]
+//
uint8x8_t test_vpmax_u8(uint8x8_t a, uint8x8_t b) {
return vpmax_u8(a, b);
}
-// CHECK-LABEL: @test_vpmax_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VPMAX2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.umaxp.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: ret <4 x i16> [[VPMAX2_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vpmax_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPMAX2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.umaxp.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: ret <4 x i16> [[VPMAX2_I]]
+//
uint16x4_t test_vpmax_u16(uint16x4_t a, uint16x4_t b) {
return vpmax_u16(a, b);
}
-// CHECK-LABEL: @test_vpmax_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VPMAX2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.umaxp.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: ret <2 x i32> [[VPMAX2_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vpmax_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPMAX2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.umaxp.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: ret <2 x i32> [[VPMAX2_I]]
+//
uint32x2_t test_vpmax_u32(uint32x2_t a, uint32x2_t b) {
return vpmax_u32(a, b);
}
-// CHECK-LABEL: @test_vpmax_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
-// CHECK: [[VPMAX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmaxp.v2f32(<2 x float> %a, <2 x float> %b)
-// CHECK: ret <2 x float> [[VPMAX2_I]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vpmax_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPMAX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmaxp.v2f32(<2 x float> [[A]], <2 x float> [[B]])
+// CHECK-NEXT: ret <2 x float> [[VPMAX2_I]]
+//
float32x2_t test_vpmax_f32(float32x2_t a, float32x2_t b) {
return vpmax_f32(a, b);
}
-// CHECK-LABEL: @test_vpmaxq_s8(
-// CHECK: [[VPMAX_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.smaxp.v16i8(<16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <16 x i8> [[VPMAX_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vpmaxq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPMAX_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.smaxp.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <16 x i8> [[VPMAX_I]]
+//
int8x16_t test_vpmaxq_s8(int8x16_t a, int8x16_t b) {
return vpmaxq_s8(a, b);
}
-// CHECK-LABEL: @test_vpmaxq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VPMAX2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smaxp.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: ret <8 x i16> [[VPMAX2_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vpmaxq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPMAX2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smaxp.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: ret <8 x i16> [[VPMAX2_I]]
+//
int16x8_t test_vpmaxq_s16(int16x8_t a, int16x8_t b) {
return vpmaxq_s16(a, b);
}
-// CHECK-LABEL: @test_vpmaxq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VPMAX2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smaxp.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: ret <4 x i32> [[VPMAX2_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vpmaxq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPMAX2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smaxp.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: ret <4 x i32> [[VPMAX2_I]]
+//
int32x4_t test_vpmaxq_s32(int32x4_t a, int32x4_t b) {
return vpmaxq_s32(a, b);
}
-// CHECK-LABEL: @test_vpmaxq_u8(
-// CHECK: [[VPMAX_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.umaxp.v16i8(<16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <16 x i8> [[VPMAX_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vpmaxq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPMAX_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.umaxp.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <16 x i8> [[VPMAX_I]]
+//
uint8x16_t test_vpmaxq_u8(uint8x16_t a, uint8x16_t b) {
return vpmaxq_u8(a, b);
}
-// CHECK-LABEL: @test_vpmaxq_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VPMAX2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umaxp.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: ret <8 x i16> [[VPMAX2_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vpmaxq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPMAX2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umaxp.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: ret <8 x i16> [[VPMAX2_I]]
+//
uint16x8_t test_vpmaxq_u16(uint16x8_t a, uint16x8_t b) {
return vpmaxq_u16(a, b);
}
-// CHECK-LABEL: @test_vpmaxq_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VPMAX2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umaxp.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: ret <4 x i32> [[VPMAX2_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vpmaxq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPMAX2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umaxp.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: ret <4 x i32> [[VPMAX2_I]]
+//
uint32x4_t test_vpmaxq_u32(uint32x4_t a, uint32x4_t b) {
return vpmaxq_u32(a, b);
}
-// CHECK-LABEL: @test_vpmaxq_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
-// CHECK: [[VPMAX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmaxp.v4f32(<4 x float> %a, <4 x float> %b)
-// CHECK: ret <4 x float> [[VPMAX2_I]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vpmaxq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPMAX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmaxp.v4f32(<4 x float> [[A]], <4 x float> [[B]])
+// CHECK-NEXT: ret <4 x float> [[VPMAX2_I]]
+//
float32x4_t test_vpmaxq_f32(float32x4_t a, float32x4_t b) {
return vpmaxq_f32(a, b);
}
-// CHECK-LABEL: @test_vpmaxq_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
-// CHECK: [[VPMAX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmaxp.v2f64(<2 x double> %a, <2 x double> %b)
-// CHECK: ret <2 x double> [[VPMAX2_I]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vpmaxq_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPMAX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmaxp.v2f64(<2 x double> [[A]], <2 x double> [[B]])
+// CHECK-NEXT: ret <2 x double> [[VPMAX2_I]]
+//
float64x2_t test_vpmaxq_f64(float64x2_t a, float64x2_t b) {
return vpmaxq_f64(a, b);
}
-// CHECK-LABEL: @test_vpmin_s8(
-// CHECK: [[VPMIN_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sminp.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VPMIN_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vpmin_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPMIN_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sminp.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VPMIN_I]]
+//
int8x8_t test_vpmin_s8(int8x8_t a, int8x8_t b) {
return vpmin_s8(a, b);
}
-// CHECK-LABEL: @test_vpmin_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VPMIN2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sminp.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: ret <4 x i16> [[VPMIN2_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vpmin_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPMIN2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sminp.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: ret <4 x i16> [[VPMIN2_I]]
+//
int16x4_t test_vpmin_s16(int16x4_t a, int16x4_t b) {
return vpmin_s16(a, b);
}
-// CHECK-LABEL: @test_vpmin_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VPMIN2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sminp.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: ret <2 x i32> [[VPMIN2_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vpmin_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPMIN2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sminp.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: ret <2 x i32> [[VPMIN2_I]]
+//
int32x2_t test_vpmin_s32(int32x2_t a, int32x2_t b) {
return vpmin_s32(a, b);
}
-// CHECK-LABEL: @test_vpmin_u8(
-// CHECK: [[VPMIN_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uminp.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VPMIN_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vpmin_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPMIN_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uminp.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VPMIN_I]]
+//
uint8x8_t test_vpmin_u8(uint8x8_t a, uint8x8_t b) {
return vpmin_u8(a, b);
}
-// CHECK-LABEL: @test_vpmin_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VPMIN2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uminp.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: ret <4 x i16> [[VPMIN2_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vpmin_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPMIN2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uminp.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: ret <4 x i16> [[VPMIN2_I]]
+//
uint16x4_t test_vpmin_u16(uint16x4_t a, uint16x4_t b) {
return vpmin_u16(a, b);
}
-// CHECK-LABEL: @test_vpmin_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VPMIN2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uminp.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: ret <2 x i32> [[VPMIN2_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vpmin_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPMIN2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uminp.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: ret <2 x i32> [[VPMIN2_I]]
+//
uint32x2_t test_vpmin_u32(uint32x2_t a, uint32x2_t b) {
return vpmin_u32(a, b);
}
-// CHECK-LABEL: @test_vpmin_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
-// CHECK: [[VPMIN2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fminp.v2f32(<2 x float> %a, <2 x float> %b)
-// CHECK: ret <2 x float> [[VPMIN2_I]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vpmin_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPMIN2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fminp.v2f32(<2 x float> [[A]], <2 x float> [[B]])
+// CHECK-NEXT: ret <2 x float> [[VPMIN2_I]]
+//
float32x2_t test_vpmin_f32(float32x2_t a, float32x2_t b) {
return vpmin_f32(a, b);
}
-// CHECK-LABEL: @test_vpminq_s8(
-// CHECK: [[VPMIN_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sminp.v16i8(<16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <16 x i8> [[VPMIN_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vpminq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPMIN_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sminp.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <16 x i8> [[VPMIN_I]]
+//
int8x16_t test_vpminq_s8(int8x16_t a, int8x16_t b) {
return vpminq_s8(a, b);
}
-// CHECK-LABEL: @test_vpminq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VPMIN2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sminp.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: ret <8 x i16> [[VPMIN2_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vpminq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPMIN2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sminp.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: ret <8 x i16> [[VPMIN2_I]]
+//
int16x8_t test_vpminq_s16(int16x8_t a, int16x8_t b) {
return vpminq_s16(a, b);
}
-// CHECK-LABEL: @test_vpminq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VPMIN2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sminp.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: ret <4 x i32> [[VPMIN2_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vpminq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPMIN2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sminp.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: ret <4 x i32> [[VPMIN2_I]]
+//
int32x4_t test_vpminq_s32(int32x4_t a, int32x4_t b) {
return vpminq_s32(a, b);
}
-// CHECK-LABEL: @test_vpminq_u8(
-// CHECK: [[VPMIN_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uminp.v16i8(<16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <16 x i8> [[VPMIN_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vpminq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPMIN_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uminp.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <16 x i8> [[VPMIN_I]]
+//
uint8x16_t test_vpminq_u8(uint8x16_t a, uint8x16_t b) {
return vpminq_u8(a, b);
}
-// CHECK-LABEL: @test_vpminq_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VPMIN2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uminp.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: ret <8 x i16> [[VPMIN2_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vpminq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPMIN2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uminp.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: ret <8 x i16> [[VPMIN2_I]]
+//
uint16x8_t test_vpminq_u16(uint16x8_t a, uint16x8_t b) {
return vpminq_u16(a, b);
}
-// CHECK-LABEL: @test_vpminq_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VPMIN2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uminp.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: ret <4 x i32> [[VPMIN2_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vpminq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPMIN2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uminp.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: ret <4 x i32> [[VPMIN2_I]]
+//
uint32x4_t test_vpminq_u32(uint32x4_t a, uint32x4_t b) {
return vpminq_u32(a, b);
}
-// CHECK-LABEL: @test_vpminq_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
-// CHECK: [[VPMIN2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fminp.v4f32(<4 x float> %a, <4 x float> %b)
-// CHECK: ret <4 x float> [[VPMIN2_I]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vpminq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPMIN2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fminp.v4f32(<4 x float> [[A]], <4 x float> [[B]])
+// CHECK-NEXT: ret <4 x float> [[VPMIN2_I]]
+//
float32x4_t test_vpminq_f32(float32x4_t a, float32x4_t b) {
return vpminq_f32(a, b);
}
-// CHECK-LABEL: @test_vpminq_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
-// CHECK: [[VPMIN2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fminp.v2f64(<2 x double> %a, <2 x double> %b)
-// CHECK: ret <2 x double> [[VPMIN2_I]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vpminq_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPMIN2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fminp.v2f64(<2 x double> [[A]], <2 x double> [[B]])
+// CHECK-NEXT: ret <2 x double> [[VPMIN2_I]]
+//
float64x2_t test_vpminq_f64(float64x2_t a, float64x2_t b) {
return vpminq_f64(a, b);
}
-// CHECK-LABEL: @test_vpmaxnm_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
-// CHECK: [[VPMAXNM2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmaxnmp.v2f32(<2 x float> %a, <2 x float> %b)
-// CHECK: ret <2 x float> [[VPMAXNM2_I]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vpmaxnm_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPMAXNM2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmaxnmp.v2f32(<2 x float> [[A]], <2 x float> [[B]])
+// CHECK-NEXT: ret <2 x float> [[VPMAXNM2_I]]
+//
float32x2_t test_vpmaxnm_f32(float32x2_t a, float32x2_t b) {
return vpmaxnm_f32(a, b);
}
-// CHECK-LABEL: @test_vpmaxnmq_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
-// CHECK: [[VPMAXNM2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmaxnmp.v4f32(<4 x float> %a, <4 x float> %b)
-// CHECK: ret <4 x float> [[VPMAXNM2_I]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vpmaxnmq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPMAXNM2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmaxnmp.v4f32(<4 x float> [[A]], <4 x float> [[B]])
+// CHECK-NEXT: ret <4 x float> [[VPMAXNM2_I]]
+//
float32x4_t test_vpmaxnmq_f32(float32x4_t a, float32x4_t b) {
return vpmaxnmq_f32(a, b);
}
-// CHECK-LABEL: @test_vpmaxnmq_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
-// CHECK: [[VPMAXNM2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmaxnmp.v2f64(<2 x double> %a, <2 x double> %b)
-// CHECK: ret <2 x double> [[VPMAXNM2_I]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vpmaxnmq_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPMAXNM2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmaxnmp.v2f64(<2 x double> [[A]], <2 x double> [[B]])
+// CHECK-NEXT: ret <2 x double> [[VPMAXNM2_I]]
+//
float64x2_t test_vpmaxnmq_f64(float64x2_t a, float64x2_t b) {
return vpmaxnmq_f64(a, b);
}
-// CHECK-LABEL: @test_vpminnm_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
-// CHECK: [[VPMINNM2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fminnmp.v2f32(<2 x float> %a, <2 x float> %b)
-// CHECK: ret <2 x float> [[VPMINNM2_I]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vpminnm_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPMINNM2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fminnmp.v2f32(<2 x float> [[A]], <2 x float> [[B]])
+// CHECK-NEXT: ret <2 x float> [[VPMINNM2_I]]
+//
float32x2_t test_vpminnm_f32(float32x2_t a, float32x2_t b) {
return vpminnm_f32(a, b);
}
-// CHECK-LABEL: @test_vpminnmq_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
-// CHECK: [[VPMINNM2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fminnmp.v4f32(<4 x float> %a, <4 x float> %b)
-// CHECK: ret <4 x float> [[VPMINNM2_I]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vpminnmq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPMINNM2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fminnmp.v4f32(<4 x float> [[A]], <4 x float> [[B]])
+// CHECK-NEXT: ret <4 x float> [[VPMINNM2_I]]
+//
float32x4_t test_vpminnmq_f32(float32x4_t a, float32x4_t b) {
return vpminnmq_f32(a, b);
}
-// CHECK-LABEL: @test_vpminnmq_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
-// CHECK: [[VPMINNM2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fminnmp.v2f64(<2 x double> %a, <2 x double> %b)
-// CHECK: ret <2 x double> [[VPMINNM2_I]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vpminnmq_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPMINNM2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fminnmp.v2f64(<2 x double> [[A]], <2 x double> [[B]])
+// CHECK-NEXT: ret <2 x double> [[VPMINNM2_I]]
+//
float64x2_t test_vpminnmq_f64(float64x2_t a, float64x2_t b) {
return vpminnmq_f64(a, b);
}
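(The vpmax/vpmin/vpmaxnm/vpminnm group above is pairwise: each result
lane reduces two adjacent lanes of the concatenation a:b, with the
a-pairs filling the low half of the result. A scalar sketch of the
smaxp case, assuming the usual AArch64 lane ordering; the helper name
is illustrative only:

#include <stdint.h>

/* Model of llvm.aarch64.neon.smaxp.v8i8: max over adjacent pairs,
   a-pairs land in out[0..3], b-pairs in out[4..7]. */
static void smaxp_i8x8(const int8_t a[8], const int8_t b[8],
                       int8_t out[8]) {
    for (int i = 0; i < 4; ++i) {
        out[i]     = a[2*i] > a[2*i+1] ? a[2*i] : a[2*i+1];
        out[4 + i] = b[2*i] > b[2*i+1] ? b[2*i] : b[2*i+1];
    }
}

The vpadd/vpaddq tests that follow use the same pairing with addition
as the reduction.)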
-// CHECK-LABEL: @test_vpadd_s8(
-// CHECK: [[VPADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.addp.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VPADD_V_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vpadd_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.addp.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VPADD_V_I]]
+//
int8x8_t test_vpadd_s8(int8x8_t a, int8x8_t b) {
return vpadd_s8(a, b);
}
-// CHECK-LABEL: @test_vpadd_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VPADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.addp.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VPADD_V3_I:%.*]] = bitcast <4 x i16> [[VPADD_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VPADD_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vpadd_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.addp.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: ret <4 x i16> [[VPADD_V2_I]]
+//
int16x4_t test_vpadd_s16(int16x4_t a, int16x4_t b) {
return vpadd_s16(a, b);
}
-// CHECK-LABEL: @test_vpadd_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VPADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.addp.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VPADD_V3_I:%.*]] = bitcast <2 x i32> [[VPADD_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VPADD_V2_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vpadd_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.addp.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: ret <2 x i32> [[VPADD_V2_I]]
+//
int32x2_t test_vpadd_s32(int32x2_t a, int32x2_t b) {
return vpadd_s32(a, b);
}
-// CHECK-LABEL: @test_vpadd_u8(
-// CHECK: [[VPADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.addp.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VPADD_V_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vpadd_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.addp.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VPADD_V_I]]
+//
uint8x8_t test_vpadd_u8(uint8x8_t a, uint8x8_t b) {
return vpadd_u8(a, b);
}
-// CHECK-LABEL: @test_vpadd_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VPADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.addp.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VPADD_V3_I:%.*]] = bitcast <4 x i16> [[VPADD_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VPADD_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vpadd_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.addp.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: ret <4 x i16> [[VPADD_V2_I]]
+//
uint16x4_t test_vpadd_u16(uint16x4_t a, uint16x4_t b) {
return vpadd_u16(a, b);
}
-// CHECK-LABEL: @test_vpadd_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VPADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.addp.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VPADD_V3_I:%.*]] = bitcast <2 x i32> [[VPADD_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VPADD_V2_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vpadd_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.addp.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: ret <2 x i32> [[VPADD_V2_I]]
+//
uint32x2_t test_vpadd_u32(uint32x2_t a, uint32x2_t b) {
return vpadd_u32(a, b);
}
-// CHECK-LABEL: @test_vpadd_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
-// CHECK: [[VPADD_V2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.faddp.v2f32(<2 x float> %a, <2 x float> %b)
-// CHECK: [[VPADD_V3_I:%.*]] = bitcast <2 x float> [[VPADD_V2_I]] to <8 x i8>
-// CHECK: ret <2 x float> [[VPADD_V2_I]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vpadd_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPADD_V2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.faddp.v2f32(<2 x float> [[A]], <2 x float> [[B]])
+// CHECK-NEXT: ret <2 x float> [[VPADD_V2_I]]
+//
float32x2_t test_vpadd_f32(float32x2_t a, float32x2_t b) {
return vpadd_f32(a, b);
}
-// CHECK-LABEL: @test_vpaddq_s8(
-// CHECK: [[VPADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.addp.v16i8(<16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <16 x i8> [[VPADDQ_V_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vpaddq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.addp.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <16 x i8> [[VPADDQ_V_I]]
+//
int8x16_t test_vpaddq_s8(int8x16_t a, int8x16_t b) {
return vpaddq_s8(a, b);
}
-// CHECK-LABEL: @test_vpaddq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VPADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.addp.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: [[VPADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VPADDQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VPADDQ_V2_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vpaddq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.addp.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: ret <8 x i16> [[VPADDQ_V2_I]]
+//
int16x8_t test_vpaddq_s16(int16x8_t a, int16x8_t b) {
return vpaddq_s16(a, b);
}
-// CHECK-LABEL: @test_vpaddq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VPADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.addp.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VPADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VPADDQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VPADDQ_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vpaddq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.addp.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: ret <4 x i32> [[VPADDQ_V2_I]]
+//
int32x4_t test_vpaddq_s32(int32x4_t a, int32x4_t b) {
return vpaddq_s32(a, b);
}
-// CHECK-LABEL: @test_vpaddq_u8(
-// CHECK: [[VPADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.addp.v16i8(<16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <16 x i8> [[VPADDQ_V_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vpaddq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.addp.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <16 x i8> [[VPADDQ_V_I]]
+//
uint8x16_t test_vpaddq_u8(uint8x16_t a, uint8x16_t b) {
return vpaddq_u8(a, b);
}
-// CHECK-LABEL: @test_vpaddq_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VPADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.addp.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: [[VPADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VPADDQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VPADDQ_V2_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vpaddq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.addp.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: ret <8 x i16> [[VPADDQ_V2_I]]
+//
uint16x8_t test_vpaddq_u16(uint16x8_t a, uint16x8_t b) {
return vpaddq_u16(a, b);
}
-// CHECK-LABEL: @test_vpaddq_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VPADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.addp.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VPADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VPADDQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VPADDQ_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vpaddq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.addp.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: ret <4 x i32> [[VPADDQ_V2_I]]
+//
uint32x4_t test_vpaddq_u32(uint32x4_t a, uint32x4_t b) {
return vpaddq_u32(a, b);
}
-// CHECK-LABEL: @test_vpaddq_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
-// CHECK: [[VPADDQ_V2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.faddp.v4f32(<4 x float> %a, <4 x float> %b)
-// CHECK: [[VPADDQ_V3_I:%.*]] = bitcast <4 x float> [[VPADDQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x float> [[VPADDQ_V2_I]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vpaddq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPADDQ_V2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.faddp.v4f32(<4 x float> [[A]], <4 x float> [[B]])
+// CHECK-NEXT: ret <4 x float> [[VPADDQ_V2_I]]
+//
float32x4_t test_vpaddq_f32(float32x4_t a, float32x4_t b) {
return vpaddq_f32(a, b);
}
-// CHECK-LABEL: @test_vpaddq_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
-// CHECK: [[VPADDQ_V2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.faddp.v2f64(<2 x double> %a, <2 x double> %b)
-// CHECK: [[VPADDQ_V3_I:%.*]] = bitcast <2 x double> [[VPADDQ_V2_I]] to <16 x i8>
-// CHECK: ret <2 x double> [[VPADDQ_V2_I]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vpaddq_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPADDQ_V2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.faddp.v2f64(<2 x double> [[A]], <2 x double> [[B]])
+// CHECK-NEXT: ret <2 x double> [[VPADDQ_V2_I]]
+//
float64x2_t test_vpaddq_f64(float64x2_t a, float64x2_t b) {
return vpaddq_f64(a, b);
}
-// CHECK-LABEL: @test_vqdmulh_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VQDMULH_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vqdmulh_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: ret <4 x i16> [[VQDMULH_V2_I]]
+//
int16x4_t test_vqdmulh_s16(int16x4_t a, int16x4_t b) {
return vqdmulh_s16(a, b);
}
-// CHECK-LABEL: @test_vqdmulh_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VQDMULH_V2_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vqdmulh_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: ret <2 x i32> [[VQDMULH_V2_I]]
+//
int32x2_t test_vqdmulh_s32(int32x2_t a, int32x2_t b) {
return vqdmulh_s32(a, b);
}
-// CHECK-LABEL: @test_vqdmulhq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VQDMULHQ_V2_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vqdmulhq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: ret <8 x i16> [[VQDMULHQ_V2_I]]
+//
int16x8_t test_vqdmulhq_s16(int16x8_t a, int16x8_t b) {
return vqdmulhq_s16(a, b);
}
-// CHECK-LABEL: @test_vqdmulhq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VQDMULHQ_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vqdmulhq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: ret <4 x i32> [[VQDMULHQ_V2_I]]
+//
int32x4_t test_vqdmulhq_s32(int32x4_t a, int32x4_t b) {
return vqdmulhq_s32(a, b);
}
-// CHECK-LABEL: @test_vqrdmulh_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VQRDMULH_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vqrdmulh_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: ret <4 x i16> [[VQRDMULH_V2_I]]
+//
int16x4_t test_vqrdmulh_s16(int16x4_t a, int16x4_t b) {
return vqrdmulh_s16(a, b);
}
-// CHECK-LABEL: @test_vqrdmulh_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VQRDMULH_V2_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vqrdmulh_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: ret <2 x i32> [[VQRDMULH_V2_I]]
+//
int32x2_t test_vqrdmulh_s32(int32x2_t a, int32x2_t b) {
return vqrdmulh_s32(a, b);
}
-// CHECK-LABEL: @test_vqrdmulhq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VQRDMULHQ_V2_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vqrdmulhq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: ret <8 x i16> [[VQRDMULHQ_V2_I]]
+//
int16x8_t test_vqrdmulhq_s16(int16x8_t a, int16x8_t b) {
return vqrdmulhq_s16(a, b);
}
-// CHECK-LABEL: @test_vqrdmulhq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VQRDMULHQ_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vqrdmulhq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: ret <4 x i32> [[VQRDMULHQ_V2_I]]
+//
int32x4_t test_vqrdmulhq_s32(int32x4_t a, int32x4_t b) {
return vqrdmulhq_s32(a, b);
}
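(sqdmulh/sqrdmulh, checked above, are Q15/Q31 fixed-point multiplies:
double the product, keep the high half, and saturate the one input
pair that overflows (both operands equal to the lane type's minimum);
the rounding form adds half a unit before the shift. A 16-bit scalar
sketch, not part of the patch; it assumes `>>` on a negative signed
value is an arithmetic shift, as on mainstream compilers:

#include <stdint.h>

/* Per-lane model of llvm.aarch64.neon.sqdmulh.v4i16 and of the
   rounding variant sqrdmulh: (2*a*b) >> 16 with saturation; widened
   to 64 bits so INT16_MIN * INT16_MIN is well defined in C. */
static int16_t sqdmulh16(int16_t a, int16_t b, int rounding) {
    int64_t p = 2 * (int64_t)a * (int64_t)b;
    if (rounding)
        p += 1 << 15;        /* round to nearest before truncating */
    p >>= 16;
    return p > INT16_MAX ? INT16_MAX : (int16_t)p;
}

For example sqdmulh16(16384, 16384, 0) is 8192, i.e. 0.5 * 0.5 = 0.25
in Q15, and the INT16_MIN/INT16_MIN pair saturates to INT16_MAX.)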
-// CHECK-LABEL: @test_vmulx_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
-// CHECK: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %b)
-// CHECK: ret <2 x float> [[VMULX2_I]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vmulx_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[B]])
+// CHECK-NEXT: ret <2 x float> [[VMULX2_I]]
+//
float32x2_t test_vmulx_f32(float32x2_t a, float32x2_t b) {
return vmulx_f32(a, b);
}
-// CHECK-LABEL: @test_vmulxq_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
-// CHECK: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %b)
-// CHECK: ret <4 x float> [[VMULX2_I]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vmulxq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[B]])
+// CHECK-NEXT: ret <4 x float> [[VMULX2_I]]
+//
float32x4_t test_vmulxq_f32(float32x4_t a, float32x4_t b) {
return vmulxq_f32(a, b);
}
-// CHECK-LABEL: @test_vmulxq_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
-// CHECK: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %b)
-// CHECK: ret <2 x double> [[VMULX2_I]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vmulxq_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[B]])
+// CHECK-NEXT: ret <2 x double> [[VMULX2_I]]
+//
float64x2_t test_vmulxq_f64(float64x2_t a, float64x2_t b) {
return vmulxq_f64(a, b);
}
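(vmulx differs from an ordinary vector multiply in a single case: per
the Arm ARM, FMULX of ±0 and ±∞ yields ±2.0, with the sign being the
exclusive-or of the operand signs, rather than a NaN. A scalar sketch
of that per-lane rule, helper name illustrative:

#include <math.h>

/* Per-lane model of llvm.aarch64.neon.fmulx: like x*y, except the
   zero-times-infinity case is defined to be 2.0 with the product's
   sign. */
static double fmulx_scalar(double x, double y) {
    if ((x == 0.0 && isinf(y)) || (isinf(x) && y == 0.0))
        return signbit(x) != signbit(y) ? -2.0 : 2.0;
    return x * y;
}

Everything else about the vmulx tests matches a plain fmul.)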
-// CHECK-LABEL: @test_vshl_n_s8(
-// CHECK: [[VSHL_N:%.*]] = shl <8 x i8> %a, splat (i8 3)
-// CHECK: ret <8 x i8> [[VSHL_N]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vshl_n_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHL_N:%.*]] = shl <8 x i8> [[A]], splat (i8 3)
+// CHECK-NEXT: ret <8 x i8> [[VSHL_N]]
+//
int8x8_t test_vshl_n_s8(int8x8_t a) {
return vshl_n_s8(a, 3);
}
-// CHECK-LABEL: @test_vshl_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VSHL_N:%.*]] = shl <4 x i16> [[TMP1]], splat (i16 3)
-// CHECK: ret <4 x i16> [[VSHL_N]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vshl_n_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHL_N:%.*]] = shl <4 x i16> [[A]], splat (i16 3)
+// CHECK-NEXT: ret <4 x i16> [[VSHL_N]]
+//
int16x4_t test_vshl_n_s16(int16x4_t a) {
return vshl_n_s16(a, 3);
}
-// CHECK-LABEL: @test_vshl_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VSHL_N:%.*]] = shl <2 x i32> [[TMP1]], splat (i32 3)
-// CHECK: ret <2 x i32> [[VSHL_N]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vshl_n_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHL_N:%.*]] = shl <2 x i32> [[A]], splat (i32 3)
+// CHECK-NEXT: ret <2 x i32> [[VSHL_N]]
+//
int32x2_t test_vshl_n_s32(int32x2_t a) {
return vshl_n_s32(a, 3);
}
-// CHECK-LABEL: @test_vshlq_n_s8(
-// CHECK: [[VSHL_N:%.*]] = shl <16 x i8> %a, splat (i8 3)
-// CHECK: ret <16 x i8> [[VSHL_N]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vshlq_n_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHL_N:%.*]] = shl <16 x i8> [[A]], splat (i8 3)
+// CHECK-NEXT: ret <16 x i8> [[VSHL_N]]
+//
int8x16_t test_vshlq_n_s8(int8x16_t a) {
return vshlq_n_s8(a, 3);
}
-// CHECK-LABEL: @test_vshlq_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VSHL_N:%.*]] = shl <8 x i16> [[TMP1]], splat (i16 3)
-// CHECK: ret <8 x i16> [[VSHL_N]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vshlq_n_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHL_N:%.*]] = shl <8 x i16> [[A]], splat (i16 3)
+// CHECK-NEXT: ret <8 x i16> [[VSHL_N]]
+//
int16x8_t test_vshlq_n_s16(int16x8_t a) {
return vshlq_n_s16(a, 3);
}
-// CHECK-LABEL: @test_vshlq_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VSHL_N:%.*]] = shl <4 x i32> [[TMP1]], splat (i32 3)
-// CHECK: ret <4 x i32> [[VSHL_N]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vshlq_n_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHL_N:%.*]] = shl <4 x i32> [[A]], splat (i32 3)
+// CHECK-NEXT: ret <4 x i32> [[VSHL_N]]
+//
int32x4_t test_vshlq_n_s32(int32x4_t a) {
return vshlq_n_s32(a, 3);
}
-// CHECK-LABEL: @test_vshlq_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VSHL_N:%.*]] = shl <2 x i64> [[TMP1]], splat (i64 3)
-// CHECK: ret <2 x i64> [[VSHL_N]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vshlq_n_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHL_N:%.*]] = shl <2 x i64> [[A]], splat (i64 3)
+// CHECK-NEXT: ret <2 x i64> [[VSHL_N]]
+//
int64x2_t test_vshlq_n_s64(int64x2_t a) {
return vshlq_n_s64(a, 3);
}
-// CHECK-LABEL: @test_vshl_n_u8(
-// CHECK: [[VSHL_N:%.*]] = shl <8 x i8> %a, splat (i8 3)
-// CHECK: ret <8 x i8> [[VSHL_N]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vshl_n_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHL_N:%.*]] = shl <8 x i8> [[A]], splat (i8 3)
+// CHECK-NEXT: ret <8 x i8> [[VSHL_N]]
+//
uint8x8_t test_vshl_n_u8(uint8x8_t a) {
return vshl_n_u8(a, 3);
}
-// CHECK-LABEL: @test_vshl_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VSHL_N:%.*]] = shl <4 x i16> [[TMP1]], splat (i16 3)
-// CHECK: ret <4 x i16> [[VSHL_N]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vshl_n_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHL_N:%.*]] = shl <4 x i16> [[A]], splat (i16 3)
+// CHECK-NEXT: ret <4 x i16> [[VSHL_N]]
+//
uint16x4_t test_vshl_n_u16(uint16x4_t a) {
return vshl_n_u16(a, 3);
}
-// CHECK-LABEL: @test_vshl_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VSHL_N:%.*]] = shl <2 x i32> [[TMP1]], splat (i32 3)
-// CHECK: ret <2 x i32> [[VSHL_N]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vshl_n_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHL_N:%.*]] = shl <2 x i32> [[A]], splat (i32 3)
+// CHECK-NEXT: ret <2 x i32> [[VSHL_N]]
+//
uint32x2_t test_vshl_n_u32(uint32x2_t a) {
return vshl_n_u32(a, 3);
}
-// CHECK-LABEL: @test_vshlq_n_u8(
-// CHECK: [[VSHL_N:%.*]] = shl <16 x i8> %a, splat (i8 3)
-// CHECK: ret <16 x i8> [[VSHL_N]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vshlq_n_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHL_N:%.*]] = shl <16 x i8> [[A]], splat (i8 3)
+// CHECK-NEXT: ret <16 x i8> [[VSHL_N]]
+//
uint8x16_t test_vshlq_n_u8(uint8x16_t a) {
return vshlq_n_u8(a, 3);
}
-// CHECK-LABEL: @test_vshlq_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VSHL_N:%.*]] = shl <8 x i16> [[TMP1]], splat (i16 3)
-// CHECK: ret <8 x i16> [[VSHL_N]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vshlq_n_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHL_N:%.*]] = shl <8 x i16> [[A]], splat (i16 3)
+// CHECK-NEXT: ret <8 x i16> [[VSHL_N]]
+//
uint16x8_t test_vshlq_n_u16(uint16x8_t a) {
return vshlq_n_u16(a, 3);
}
-// CHECK-LABEL: @test_vshlq_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VSHL_N:%.*]] = shl <4 x i32> [[TMP1]], splat (i32 3)
-// CHECK: ret <4 x i32> [[VSHL_N]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vshlq_n_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHL_N:%.*]] = shl <4 x i32> [[A]], splat (i32 3)
+// CHECK-NEXT: ret <4 x i32> [[VSHL_N]]
+//
uint32x4_t test_vshlq_n_u32(uint32x4_t a) {
return vshlq_n_u32(a, 3);
}
-// CHECK-LABEL: @test_vshlq_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VSHL_N:%.*]] = shl <2 x i64> [[TMP1]], splat (i64 3)
-// CHECK: ret <2 x i64> [[VSHL_N]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vshlq_n_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHL_N:%.*]] = shl <2 x i64> [[A]], splat (i64 3)
+// CHECK-NEXT: ret <2 x i64> [[VSHL_N]]
+//
uint64x2_t test_vshlq_n_u64(uint64x2_t a) {
return vshlq_n_u64(a, 3);
}
-// CHECK-LABEL: @test_vshr_n_s8(
-// CHECK: [[VSHR_N:%.*]] = ashr <8 x i8> %a, splat (i8 3)
-// CHECK: ret <8 x i8> [[VSHR_N]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vshr_n_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHR_N:%.*]] = ashr <8 x i8> [[A]], splat (i8 3)
+// CHECK-NEXT: ret <8 x i8> [[VSHR_N]]
+//
int8x8_t test_vshr_n_s8(int8x8_t a) {
return vshr_n_s8(a, 3);
}
-// CHECK-LABEL: @test_vshr_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VSHR_N:%.*]] = ashr <4 x i16> [[TMP1]], splat (i16 3)
-// CHECK: ret <4 x i16> [[VSHR_N]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vshr_n_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHR_N:%.*]] = ashr <4 x i16> [[A]], splat (i16 3)
+// CHECK-NEXT: ret <4 x i16> [[VSHR_N]]
+//
int16x4_t test_vshr_n_s16(int16x4_t a) {
return vshr_n_s16(a, 3);
}
-// CHECK-LABEL: @test_vshr_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VSHR_N:%.*]] = ashr <2 x i32> [[TMP1]], splat (i32 3)
-// CHECK: ret <2 x i32> [[VSHR_N]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vshr_n_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHR_N:%.*]] = ashr <2 x i32> [[A]], splat (i32 3)
+// CHECK-NEXT: ret <2 x i32> [[VSHR_N]]
+//
int32x2_t test_vshr_n_s32(int32x2_t a) {
return vshr_n_s32(a, 3);
}
-// CHECK-LABEL: @test_vshrq_n_s8(
-// CHECK: [[VSHR_N:%.*]] = ashr <16 x i8> %a, splat (i8 3)
-// CHECK: ret <16 x i8> [[VSHR_N]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vshrq_n_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHR_N:%.*]] = ashr <16 x i8> [[A]], splat (i8 3)
+// CHECK-NEXT: ret <16 x i8> [[VSHR_N]]
+//
int8x16_t test_vshrq_n_s8(int8x16_t a) {
return vshrq_n_s8(a, 3);
}
-// CHECK-LABEL: @test_vshrq_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VSHR_N:%.*]] = ashr <8 x i16> [[TMP1]], splat (i16 3)
-// CHECK: ret <8 x i16> [[VSHR_N]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vshrq_n_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHR_N:%.*]] = ashr <8 x i16> [[A]], splat (i16 3)
+// CHECK-NEXT: ret <8 x i16> [[VSHR_N]]
+//
int16x8_t test_vshrq_n_s16(int16x8_t a) {
return vshrq_n_s16(a, 3);
}
-// CHECK-LABEL: @test_vshrq_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VSHR_N:%.*]] = ashr <4 x i32> [[TMP1]], splat (i32 3)
-// CHECK: ret <4 x i32> [[VSHR_N]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vshrq_n_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHR_N:%.*]] = ashr <4 x i32> [[A]], splat (i32 3)
+// CHECK-NEXT: ret <4 x i32> [[VSHR_N]]
+//
int32x4_t test_vshrq_n_s32(int32x4_t a) {
return vshrq_n_s32(a, 3);
}
-// CHECK-LABEL: @test_vshrq_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VSHR_N:%.*]] = ashr <2 x i64> [[TMP1]], splat (i64 3)
-// CHECK: ret <2 x i64> [[VSHR_N]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vshrq_n_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHR_N:%.*]] = ashr <2 x i64> [[A]], splat (i64 3)
+// CHECK-NEXT: ret <2 x i64> [[VSHR_N]]
+//
int64x2_t test_vshrq_n_s64(int64x2_t a) {
return vshrq_n_s64(a, 3);
}
-// CHECK-LABEL: @test_vshr_n_u8(
-// CHECK: [[VSHR_N:%.*]] = lshr <8 x i8> %a, splat (i8 3)
-// CHECK: ret <8 x i8> [[VSHR_N]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vshr_n_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHR_N:%.*]] = lshr <8 x i8> [[A]], splat (i8 3)
+// CHECK-NEXT: ret <8 x i8> [[VSHR_N]]
+//
uint8x8_t test_vshr_n_u8(uint8x8_t a) {
return vshr_n_u8(a, 3);
}
-// CHECK-LABEL: @test_vshr_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VSHR_N:%.*]] = lshr <4 x i16> [[TMP1]], splat (i16 3)
-// CHECK: ret <4 x i16> [[VSHR_N]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vshr_n_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHR_N:%.*]] = lshr <4 x i16> [[A]], splat (i16 3)
+// CHECK-NEXT: ret <4 x i16> [[VSHR_N]]
+//
uint16x4_t test_vshr_n_u16(uint16x4_t a) {
return vshr_n_u16(a, 3);
}
-// CHECK-LABEL: @test_vshr_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VSHR_N:%.*]] = lshr <2 x i32> [[TMP1]], splat (i32 3)
-// CHECK: ret <2 x i32> [[VSHR_N]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vshr_n_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHR_N:%.*]] = lshr <2 x i32> [[A]], splat (i32 3)
+// CHECK-NEXT: ret <2 x i32> [[VSHR_N]]
+//
uint32x2_t test_vshr_n_u32(uint32x2_t a) {
return vshr_n_u32(a, 3);
}
-// CHECK-LABEL: @test_vshrq_n_u8(
-// CHECK: [[VSHR_N:%.*]] = lshr <16 x i8> %a, splat (i8 3)
-// CHECK: ret <16 x i8> [[VSHR_N]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vshrq_n_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHR_N:%.*]] = lshr <16 x i8> [[A]], splat (i8 3)
+// CHECK-NEXT: ret <16 x i8> [[VSHR_N]]
+//
uint8x16_t test_vshrq_n_u8(uint8x16_t a) {
return vshrq_n_u8(a, 3);
}
-// CHECK-LABEL: @test_vshrq_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VSHR_N:%.*]] = lshr <8 x i16> [[TMP1]], splat (i16 3)
-// CHECK: ret <8 x i16> [[VSHR_N]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vshrq_n_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHR_N:%.*]] = lshr <8 x i16> [[A]], splat (i16 3)
+// CHECK-NEXT: ret <8 x i16> [[VSHR_N]]
+//
uint16x8_t test_vshrq_n_u16(uint16x8_t a) {
return vshrq_n_u16(a, 3);
}
-// CHECK-LABEL: @test_vshrq_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VSHR_N:%.*]] = lshr <4 x i32> [[TMP1]], splat (i32 3)
-// CHECK: ret <4 x i32> [[VSHR_N]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vshrq_n_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHR_N:%.*]] = lshr <4 x i32> [[A]], splat (i32 3)
+// CHECK-NEXT: ret <4 x i32> [[VSHR_N]]
+//
uint32x4_t test_vshrq_n_u32(uint32x4_t a) {
return vshrq_n_u32(a, 3);
}
-// CHECK-LABEL: @test_vshrq_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VSHR_N:%.*]] = lshr <2 x i64> [[TMP1]], splat (i64 3)
-// CHECK: ret <2 x i64> [[VSHR_N]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vshrq_n_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHR_N:%.*]] = lshr <2 x i64> [[A]], splat (i64 3)
+// CHECK-NEXT: ret <2 x i64> [[VSHR_N]]
+//
uint64x2_t test_vshrq_n_u64(uint64x2_t a) {
return vshrq_n_u64(a, 3);
}
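// A minimal scalar model of the lowering checked above (hypothetical helper,
// not part of the test file): signed vshr_n_* becomes a per-lane arithmetic
// shift (ashr) and unsigned vshr_n_* a logical shift (lshr) by a splat of the
// immediate. This sketch relies on clang's vector extensions for subscripting
// and compound literals on NEON types.
static inline int32x2_t vshr_n_s32_ref(int32x2_t a) {
  return (int32x2_t){ a[0] >> 3, a[1] >> 3 }; // per-lane ashr by 3
}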
-// CHECK-LABEL: @test_vsra_n_s8(
-// CHECK: [[VSRA_N:%.*]] = ashr <8 x i8> %b, splat (i8 3)
-// CHECK: [[TMP0:%.*]] = add <8 x i8> %a, [[VSRA_N]]
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vsra_n_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRA_N:%.*]] = ashr <8 x i8> [[B]], splat (i8 3)
+// CHECK-NEXT: [[TMP0:%.*]] = add <8 x i8> [[A]], [[VSRA_N]]
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
int8x8_t test_vsra_n_s8(int8x8_t a, int8x8_t b) {
return vsra_n_s8(a, b, 3);
}
-// CHECK-LABEL: @test_vsra_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VSRA_N:%.*]] = ashr <4 x i16> [[TMP3]], splat (i16 3)
-// CHECK: [[TMP4:%.*]] = add <4 x i16> [[TMP2]], [[VSRA_N]]
-// CHECK: ret <4 x i16> [[TMP4]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vsra_n_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRA_N:%.*]] = ashr <4 x i16> [[B]], splat (i16 3)
+// CHECK-NEXT: [[TMP0:%.*]] = add <4 x i16> [[A]], [[VSRA_N]]
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
int16x4_t test_vsra_n_s16(int16x4_t a, int16x4_t b) {
return vsra_n_s16(a, b, 3);
}
-// CHECK-LABEL: @test_vsra_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VSRA_N:%.*]] = ashr <2 x i32> [[TMP3]], splat (i32 3)
-// CHECK: [[TMP4:%.*]] = add <2 x i32> [[TMP2]], [[VSRA_N]]
-// CHECK: ret <2 x i32> [[TMP4]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vsra_n_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRA_N:%.*]] = ashr <2 x i32> [[B]], splat (i32 3)
+// CHECK-NEXT: [[TMP0:%.*]] = add <2 x i32> [[A]], [[VSRA_N]]
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
int32x2_t test_vsra_n_s32(int32x2_t a, int32x2_t b) {
return vsra_n_s32(a, b, 3);
}
-// CHECK-LABEL: @test_vsraq_n_s8(
-// CHECK: [[VSRA_N:%.*]] = ashr <16 x i8> %b, splat (i8 3)
-// CHECK: [[TMP0:%.*]] = add <16 x i8> %a, [[VSRA_N]]
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vsraq_n_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRA_N:%.*]] = ashr <16 x i8> [[B]], splat (i8 3)
+// CHECK-NEXT: [[TMP0:%.*]] = add <16 x i8> [[A]], [[VSRA_N]]
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
int8x16_t test_vsraq_n_s8(int8x16_t a, int8x16_t b) {
return vsraq_n_s8(a, b, 3);
}
-// CHECK-LABEL: @test_vsraq_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VSRA_N:%.*]] = ashr <8 x i16> [[TMP3]], splat (i16 3)
-// CHECK: [[TMP4:%.*]] = add <8 x i16> [[TMP2]], [[VSRA_N]]
-// CHECK: ret <8 x i16> [[TMP4]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vsraq_n_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRA_N:%.*]] = ashr <8 x i16> [[B]], splat (i16 3)
+// CHECK-NEXT: [[TMP0:%.*]] = add <8 x i16> [[A]], [[VSRA_N]]
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
int16x8_t test_vsraq_n_s16(int16x8_t a, int16x8_t b) {
return vsraq_n_s16(a, b, 3);
}
-// CHECK-LABEL: @test_vsraq_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VSRA_N:%.*]] = ashr <4 x i32> [[TMP3]], splat (i32 3)
-// CHECK: [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[VSRA_N]]
-// CHECK: ret <4 x i32> [[TMP4]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vsraq_n_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRA_N:%.*]] = ashr <4 x i32> [[B]], splat (i32 3)
+// CHECK-NEXT: [[TMP0:%.*]] = add <4 x i32> [[A]], [[VSRA_N]]
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
int32x4_t test_vsraq_n_s32(int32x4_t a, int32x4_t b) {
return vsraq_n_s32(a, b, 3);
}
-// CHECK-LABEL: @test_vsraq_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[VSRA_N:%.*]] = ashr <2 x i64> [[TMP3]], splat (i64 3)
-// CHECK: [[TMP4:%.*]] = add <2 x i64> [[TMP2]], [[VSRA_N]]
-// CHECK: ret <2 x i64> [[TMP4]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vsraq_n_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRA_N:%.*]] = ashr <2 x i64> [[B]], splat (i64 3)
+// CHECK-NEXT: [[TMP0:%.*]] = add <2 x i64> [[A]], [[VSRA_N]]
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
int64x2_t test_vsraq_n_s64(int64x2_t a, int64x2_t b) {
return vsraq_n_s64(a, b, 3);
}
-// CHECK-LABEL: @test_vsra_n_u8(
-// CHECK: [[VSRA_N:%.*]] = lshr <8 x i8> %b, splat (i8 3)
-// CHECK: [[TMP0:%.*]] = add <8 x i8> %a, [[VSRA_N]]
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vsra_n_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRA_N:%.*]] = lshr <8 x i8> [[B]], splat (i8 3)
+// CHECK-NEXT: [[TMP0:%.*]] = add <8 x i8> [[A]], [[VSRA_N]]
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
uint8x8_t test_vsra_n_u8(uint8x8_t a, uint8x8_t b) {
return vsra_n_u8(a, b, 3);
}
-// CHECK-LABEL: @test_vsra_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VSRA_N:%.*]] = lshr <4 x i16> [[TMP3]], splat (i16 3)
-// CHECK: [[TMP4:%.*]] = add <4 x i16> [[TMP2]], [[VSRA_N]]
-// CHECK: ret <4 x i16> [[TMP4]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vsra_n_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRA_N:%.*]] = lshr <4 x i16> [[B]], splat (i16 3)
+// CHECK-NEXT: [[TMP0:%.*]] = add <4 x i16> [[A]], [[VSRA_N]]
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
uint16x4_t test_vsra_n_u16(uint16x4_t a, uint16x4_t b) {
return vsra_n_u16(a, b, 3);
}
-// CHECK-LABEL: @test_vsra_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VSRA_N:%.*]] = lshr <2 x i32> [[TMP3]], splat (i32 3)
-// CHECK: [[TMP4:%.*]] = add <2 x i32> [[TMP2]], [[VSRA_N]]
-// CHECK: ret <2 x i32> [[TMP4]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vsra_n_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRA_N:%.*]] = lshr <2 x i32> [[B]], splat (i32 3)
+// CHECK-NEXT: [[TMP0:%.*]] = add <2 x i32> [[A]], [[VSRA_N]]
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
uint32x2_t test_vsra_n_u32(uint32x2_t a, uint32x2_t b) {
return vsra_n_u32(a, b, 3);
}
-// CHECK-LABEL: @test_vsraq_n_u8(
-// CHECK: [[VSRA_N:%.*]] = lshr <16 x i8> %b, splat (i8 3)
-// CHECK: [[TMP0:%.*]] = add <16 x i8> %a, [[VSRA_N]]
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vsraq_n_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRA_N:%.*]] = lshr <16 x i8> [[B]], splat (i8 3)
+// CHECK-NEXT: [[TMP0:%.*]] = add <16 x i8> [[A]], [[VSRA_N]]
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
uint8x16_t test_vsraq_n_u8(uint8x16_t a, uint8x16_t b) {
return vsraq_n_u8(a, b, 3);
}
-// CHECK-LABEL: @test_vsraq_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VSRA_N:%.*]] = lshr <8 x i16> [[TMP3]], splat (i16 3)
-// CHECK: [[TMP4:%.*]] = add <8 x i16> [[TMP2]], [[VSRA_N]]
-// CHECK: ret <8 x i16> [[TMP4]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vsraq_n_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRA_N:%.*]] = lshr <8 x i16> [[B]], splat (i16 3)
+// CHECK-NEXT: [[TMP0:%.*]] = add <8 x i16> [[A]], [[VSRA_N]]
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
uint16x8_t test_vsraq_n_u16(uint16x8_t a, uint16x8_t b) {
return vsraq_n_u16(a, b, 3);
}
-// CHECK-LABEL: @test_vsraq_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VSRA_N:%.*]] = lshr <4 x i32> [[TMP3]], splat (i32 3)
-// CHECK: [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[VSRA_N]]
-// CHECK: ret <4 x i32> [[TMP4]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vsraq_n_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRA_N:%.*]] = lshr <4 x i32> [[B]], splat (i32 3)
+// CHECK-NEXT: [[TMP0:%.*]] = add <4 x i32> [[A]], [[VSRA_N]]
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
uint32x4_t test_vsraq_n_u32(uint32x4_t a, uint32x4_t b) {
return vsraq_n_u32(a, b, 3);
}
-// CHECK-LABEL: @test_vsraq_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[VSRA_N:%.*]] = lshr <2 x i64> [[TMP3]], splat (i64 3)
-// CHECK: [[TMP4:%.*]] = add <2 x i64> [[TMP2]], [[VSRA_N]]
-// CHECK: ret <2 x i64> [[TMP4]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vsraq_n_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRA_N:%.*]] = lshr <2 x i64> [[B]], splat (i64 3)
+// CHECK-NEXT: [[TMP0:%.*]] = add <2 x i64> [[A]], [[VSRA_N]]
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
uint64x2_t test_vsraq_n_u64(uint64x2_t a, uint64x2_t b) {
return vsraq_n_u64(a, b, 3);
}
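// The vsra_n_* checks above show shift-right-accumulate lowering to plain IR
// with no target intrinsic: a shift of the second operand followed by an add.
// A hypothetical reference built from the component intrinsics (an assumption
// for illustration, not from the patch):
static inline uint8x8_t vsra_n_u8_ref(uint8x8_t a, uint8x8_t b) {
  return vadd_u8(a, vshr_n_u8(b, 3)); // same lshr + add sequence as checked
}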
-// CHECK-LABEL: @test_vrshr_n_s8(
-// CHECK: [[VRSHR_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.srshl.v8i8(<8 x i8> %a, <8 x i8> splat (i8 -3))
-// CHECK: ret <8 x i8> [[VRSHR_N]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vrshr_n_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHR_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.srshl.v8i8(<8 x i8> [[A]], <8 x i8> splat (i8 -3))
+// CHECK-NEXT: ret <8 x i8> [[VRSHR_N]]
+//
int8x8_t test_vrshr_n_s8(int8x8_t a) {
return vrshr_n_s8(a, 3);
}
-// CHECK-LABEL: @test_vrshr_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VRSHR_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.srshl.v4i16(<4 x i16> [[VRSHR_N]], <4 x i16> splat (i16 -3))
-// CHECK: ret <4 x i16> [[VRSHR_N1]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vrshr_n_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.srshl.v4i16(<4 x i16> [[A]], <4 x i16> splat (i16 -3))
+// CHECK-NEXT: ret <4 x i16> [[VRSHR_N1]]
+//
int16x4_t test_vrshr_n_s16(int16x4_t a) {
return vrshr_n_s16(a, 3);
}
-// CHECK-LABEL: @test_vrshr_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VRSHR_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.srshl.v2i32(<2 x i32> [[VRSHR_N]], <2 x i32> splat (i32 -3))
-// CHECK: ret <2 x i32> [[VRSHR_N1]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vrshr_n_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.srshl.v2i32(<2 x i32> [[A]], <2 x i32> splat (i32 -3))
+// CHECK-NEXT: ret <2 x i32> [[VRSHR_N1]]
+//
int32x2_t test_vrshr_n_s32(int32x2_t a) {
return vrshr_n_s32(a, 3);
}
-// CHECK-LABEL: @test_vrshrq_n_s8(
-// CHECK: [[VRSHR_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.srshl.v16i8(<16 x i8> %a, <16 x i8> splat (i8 -3))
-// CHECK: ret <16 x i8> [[VRSHR_N]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vrshrq_n_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHR_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.srshl.v16i8(<16 x i8> [[A]], <16 x i8> splat (i8 -3))
+// CHECK-NEXT: ret <16 x i8> [[VRSHR_N]]
+//
int8x16_t test_vrshrq_n_s8(int8x16_t a) {
return vrshrq_n_s8(a, 3);
}
-// CHECK-LABEL: @test_vrshrq_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VRSHR_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.srshl.v8i16(<8 x i16> [[VRSHR_N]], <8 x i16> splat (i16 -3))
-// CHECK: ret <8 x i16> [[VRSHR_N1]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vrshrq_n_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.srshl.v8i16(<8 x i16> [[A]], <8 x i16> splat (i16 -3))
+// CHECK-NEXT: ret <8 x i16> [[VRSHR_N1]]
+//
int16x8_t test_vrshrq_n_s16(int16x8_t a) {
return vrshrq_n_s16(a, 3);
}
-// CHECK-LABEL: @test_vrshrq_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VRSHR_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.srshl.v4i32(<4 x i32> [[VRSHR_N]], <4 x i32> splat (i32 -3))
-// CHECK: ret <4 x i32> [[VRSHR_N1]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vrshrq_n_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.srshl.v4i32(<4 x i32> [[A]], <4 x i32> splat (i32 -3))
+// CHECK-NEXT: ret <4 x i32> [[VRSHR_N1]]
+//
int32x4_t test_vrshrq_n_s32(int32x4_t a) {
return vrshrq_n_s32(a, 3);
}
-// CHECK-LABEL: @test_vrshrq_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VRSHR_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> [[VRSHR_N]], <2 x i64> splat (i64 -3))
-// CHECK: ret <2 x i64> [[VRSHR_N1]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vrshrq_n_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> [[A]], <2 x i64> splat (i64 -3))
+// CHECK-NEXT: ret <2 x i64> [[VRSHR_N1]]
+//
int64x2_t test_vrshrq_n_s64(int64x2_t a) {
return vrshrq_n_s64(a, 3);
}
-// CHECK-LABEL: @test_vrshr_n_u8(
-// CHECK: [[VRSHR_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.urshl.v8i8(<8 x i8> %a, <8 x i8> splat (i8 -3))
-// CHECK: ret <8 x i8> [[VRSHR_N]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vrshr_n_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHR_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.urshl.v8i8(<8 x i8> [[A]], <8 x i8> splat (i8 -3))
+// CHECK-NEXT: ret <8 x i8> [[VRSHR_N]]
+//
uint8x8_t test_vrshr_n_u8(uint8x8_t a) {
return vrshr_n_u8(a, 3);
}
-// CHECK-LABEL: @test_vrshr_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VRSHR_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.urshl.v4i16(<4 x i16> [[VRSHR_N]], <4 x i16> splat (i16 -3))
-// CHECK: ret <4 x i16> [[VRSHR_N1]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vrshr_n_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.urshl.v4i16(<4 x i16> [[A]], <4 x i16> splat (i16 -3))
+// CHECK-NEXT: ret <4 x i16> [[VRSHR_N1]]
+//
uint16x4_t test_vrshr_n_u16(uint16x4_t a) {
return vrshr_n_u16(a, 3);
}
-// CHECK-LABEL: @test_vrshr_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VRSHR_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.urshl.v2i32(<2 x i32> [[VRSHR_N]], <2 x i32> splat (i32 -3))
-// CHECK: ret <2 x i32> [[VRSHR_N1]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vrshr_n_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.urshl.v2i32(<2 x i32> [[A]], <2 x i32> splat (i32 -3))
+// CHECK-NEXT: ret <2 x i32> [[VRSHR_N1]]
+//
uint32x2_t test_vrshr_n_u32(uint32x2_t a) {
return vrshr_n_u32(a, 3);
}
-// CHECK-LABEL: @test_vrshrq_n_u8(
-// CHECK: [[VRSHR_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.urshl.v16i8(<16 x i8> %a, <16 x i8> splat (i8 -3))
-// CHECK: ret <16 x i8> [[VRSHR_N]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vrshrq_n_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHR_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.urshl.v16i8(<16 x i8> [[A]], <16 x i8> splat (i8 -3))
+// CHECK-NEXT: ret <16 x i8> [[VRSHR_N]]
+//
uint8x16_t test_vrshrq_n_u8(uint8x16_t a) {
return vrshrq_n_u8(a, 3);
}
-// CHECK-LABEL: @test_vrshrq_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VRSHR_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.urshl.v8i16(<8 x i16> [[VRSHR_N]], <8 x i16> splat (i16 -3))
-// CHECK: ret <8 x i16> [[VRSHR_N1]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vrshrq_n_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.urshl.v8i16(<8 x i16> [[A]], <8 x i16> splat (i16 -3))
+// CHECK-NEXT: ret <8 x i16> [[VRSHR_N1]]
+//
uint16x8_t test_vrshrq_n_u16(uint16x8_t a) {
return vrshrq_n_u16(a, 3);
}
-// CHECK-LABEL: @test_vrshrq_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VRSHR_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.urshl.v4i32(<4 x i32> [[VRSHR_N]], <4 x i32> splat (i32 -3))
-// CHECK: ret <4 x i32> [[VRSHR_N1]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vrshrq_n_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.urshl.v4i32(<4 x i32> [[A]], <4 x i32> splat (i32 -3))
+// CHECK-NEXT: ret <4 x i32> [[VRSHR_N1]]
+//
uint32x4_t test_vrshrq_n_u32(uint32x4_t a) {
return vrshrq_n_u32(a, 3);
}
-// CHECK-LABEL: @test_vrshrq_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VRSHR_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64> [[VRSHR_N]], <2 x i64> splat (i64 -3))
-// CHECK: ret <2 x i64> [[VRSHR_N1]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vrshrq_n_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64> [[A]], <2 x i64> splat (i64 -3))
+// CHECK-NEXT: ret <2 x i64> [[VRSHR_N1]]
+//
uint64x2_t test_vrshrq_n_u64(uint64x2_t a) {
return vrshrq_n_u64(a, 3);
}
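// Rounding shifts keep a target intrinsic: the checks show vrshr_n_* lowered
// to @llvm.aarch64.neon.srshl/urshl with a splat of the negated immediate,
// i.e. a rounding left shift by -n. A hypothetical equivalence sketch:
static inline int8x8_t vrshr_n_s8_ref(int8x8_t a) {
  return vrshl_s8(a, vdup_n_s8(-3)); // emits the same srshl(a, splat(-3)) call
}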
-// CHECK-LABEL: @test_vrsra_n_s8(
-// CHECK: [[VRSHR_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.srshl.v8i8(<8 x i8> %b, <8 x i8> splat (i8 -3))
-// CHECK: [[TMP0:%.*]] = add <8 x i8> %a, [[VRSHR_N]]
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vrsra_n_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHR_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.srshl.v8i8(<8 x i8> [[B]], <8 x i8> splat (i8 -3))
+// CHECK-NEXT: [[TMP0:%.*]] = add <8 x i8> [[A]], [[VRSHR_N]]
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
int8x8_t test_vrsra_n_s8(int8x8_t a, int8x8_t b) {
return vrsra_n_s8(a, b, 3);
}
-// CHECK-LABEL: @test_vrsra_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VRSHR_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.srshl.v4i16(<4 x i16> [[VRSHR_N]], <4 x i16> splat (i16 -3))
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[TMP3:%.*]] = add <4 x i16> [[TMP2]], [[VRSHR_N1]]
-// CHECK: ret <4 x i16> [[TMP3]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vrsra_n_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.srshl.v4i16(<4 x i16> [[B]], <4 x i16> splat (i16 -3))
+// CHECK-NEXT: [[TMP0:%.*]] = add <4 x i16> [[A]], [[VRSHR_N1]]
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
int16x4_t test_vrsra_n_s16(int16x4_t a, int16x4_t b) {
return vrsra_n_s16(a, b, 3);
}
-// CHECK-LABEL: @test_vrsra_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VRSHR_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.srshl.v2i32(<2 x i32> [[VRSHR_N]], <2 x i32> splat (i32 -3))
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[TMP3:%.*]] = add <2 x i32> [[TMP2]], [[VRSHR_N1]]
-// CHECK: ret <2 x i32> [[TMP3]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vrsra_n_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.srshl.v2i32(<2 x i32> [[B]], <2 x i32> splat (i32 -3))
+// CHECK-NEXT: [[TMP0:%.*]] = add <2 x i32> [[A]], [[VRSHR_N1]]
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
int32x2_t test_vrsra_n_s32(int32x2_t a, int32x2_t b) {
return vrsra_n_s32(a, b, 3);
}
-// CHECK-LABEL: @test_vrsraq_n_s8(
-// CHECK: [[VRSHR_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.srshl.v16i8(<16 x i8> %b, <16 x i8> splat (i8 -3))
-// CHECK: [[TMP0:%.*]] = add <16 x i8> %a, [[VRSHR_N]]
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vrsraq_n_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHR_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.srshl.v16i8(<16 x i8> [[B]], <16 x i8> splat (i8 -3))
+// CHECK-NEXT: [[TMP0:%.*]] = add <16 x i8> [[A]], [[VRSHR_N]]
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
int8x16_t test_vrsraq_n_s8(int8x16_t a, int8x16_t b) {
return vrsraq_n_s8(a, b, 3);
}
-// CHECK-LABEL: @test_vrsraq_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VRSHR_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.srshl.v8i16(<8 x i16> [[VRSHR_N]], <8 x i16> splat (i16 -3))
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[TMP3:%.*]] = add <8 x i16> [[TMP2]], [[VRSHR_N1]]
-// CHECK: ret <8 x i16> [[TMP3]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vrsraq_n_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.srshl.v8i16(<8 x i16> [[B]], <8 x i16> splat (i16 -3))
+// CHECK-NEXT: [[TMP0:%.*]] = add <8 x i16> [[A]], [[VRSHR_N1]]
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
int16x8_t test_vrsraq_n_s16(int16x8_t a, int16x8_t b) {
return vrsraq_n_s16(a, b, 3);
}
-// CHECK-LABEL: @test_vrsraq_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VRSHR_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.srshl.v4i32(<4 x i32> [[VRSHR_N]], <4 x i32> splat (i32 -3))
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[TMP3:%.*]] = add <4 x i32> [[TMP2]], [[VRSHR_N1]]
-// CHECK: ret <4 x i32> [[TMP3]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vrsraq_n_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.srshl.v4i32(<4 x i32> [[B]], <4 x i32> splat (i32 -3))
+// CHECK-NEXT: [[TMP0:%.*]] = add <4 x i32> [[A]], [[VRSHR_N1]]
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
int32x4_t test_vrsraq_n_s32(int32x4_t a, int32x4_t b) {
return vrsraq_n_s32(a, b, 3);
}
-// CHECK-LABEL: @test_vrsraq_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[VRSHR_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> [[VRSHR_N]], <2 x i64> splat (i64 -3))
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[TMP3:%.*]] = add <2 x i64> [[TMP2]], [[VRSHR_N1]]
-// CHECK: ret <2 x i64> [[TMP3]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vrsraq_n_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> [[B]], <2 x i64> splat (i64 -3))
+// CHECK-NEXT: [[TMP0:%.*]] = add <2 x i64> [[A]], [[VRSHR_N1]]
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
int64x2_t test_vrsraq_n_s64(int64x2_t a, int64x2_t b) {
return vrsraq_n_s64(a, b, 3);
}
-// CHECK-LABEL: @test_vrsra_n_u8(
-// CHECK: [[VRSHR_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.urshl.v8i8(<8 x i8> %b, <8 x i8> splat (i8 -3))
-// CHECK: [[TMP0:%.*]] = add <8 x i8> %a, [[VRSHR_N]]
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vrsra_n_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHR_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.urshl.v8i8(<8 x i8> [[B]], <8 x i8> splat (i8 -3))
+// CHECK-NEXT: [[TMP0:%.*]] = add <8 x i8> [[A]], [[VRSHR_N]]
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
uint8x8_t test_vrsra_n_u8(uint8x8_t a, uint8x8_t b) {
return vrsra_n_u8(a, b, 3);
}
-// CHECK-LABEL: @test_vrsra_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VRSHR_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.urshl.v4i16(<4 x i16> [[VRSHR_N]], <4 x i16> splat (i16 -3))
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[TMP3:%.*]] = add <4 x i16> [[TMP2]], [[VRSHR_N1]]
-// CHECK: ret <4 x i16> [[TMP3]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vrsra_n_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.urshl.v4i16(<4 x i16> [[B]], <4 x i16> splat (i16 -3))
+// CHECK-NEXT: [[TMP0:%.*]] = add <4 x i16> [[A]], [[VRSHR_N1]]
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
uint16x4_t test_vrsra_n_u16(uint16x4_t a, uint16x4_t b) {
return vrsra_n_u16(a, b, 3);
}
-// CHECK-LABEL: @test_vrsra_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VRSHR_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.urshl.v2i32(<2 x i32> [[VRSHR_N]], <2 x i32> splat (i32 -3))
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[TMP3:%.*]] = add <2 x i32> [[TMP2]], [[VRSHR_N1]]
-// CHECK: ret <2 x i32> [[TMP3]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vrsra_n_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.urshl.v2i32(<2 x i32> [[B]], <2 x i32> splat (i32 -3))
+// CHECK-NEXT: [[TMP0:%.*]] = add <2 x i32> [[A]], [[VRSHR_N1]]
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
uint32x2_t test_vrsra_n_u32(uint32x2_t a, uint32x2_t b) {
return vrsra_n_u32(a, b, 3);
}
-// CHECK-LABEL: @test_vrsraq_n_u8(
-// CHECK: [[VRSHR_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.urshl.v16i8(<16 x i8> %b, <16 x i8> splat (i8 -3))
-// CHECK: [[TMP0:%.*]] = add <16 x i8> %a, [[VRSHR_N]]
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vrsraq_n_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHR_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.urshl.v16i8(<16 x i8> [[B]], <16 x i8> splat (i8 -3))
+// CHECK-NEXT: [[TMP0:%.*]] = add <16 x i8> [[A]], [[VRSHR_N]]
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
uint8x16_t test_vrsraq_n_u8(uint8x16_t a, uint8x16_t b) {
return vrsraq_n_u8(a, b, 3);
}
-// CHECK-LABEL: @test_vrsraq_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VRSHR_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.urshl.v8i16(<8 x i16> [[VRSHR_N]], <8 x i16> splat (i16 -3))
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[TMP3:%.*]] = add <8 x i16> [[TMP2]], [[VRSHR_N1]]
-// CHECK: ret <8 x i16> [[TMP3]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vrsraq_n_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.urshl.v8i16(<8 x i16> [[B]], <8 x i16> splat (i16 -3))
+// CHECK-NEXT: [[TMP0:%.*]] = add <8 x i16> [[A]], [[VRSHR_N1]]
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
uint16x8_t test_vrsraq_n_u16(uint16x8_t a, uint16x8_t b) {
return vrsraq_n_u16(a, b, 3);
}
-// CHECK-LABEL: @test_vrsraq_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VRSHR_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.urshl.v4i32(<4 x i32> [[VRSHR_N]], <4 x i32> splat (i32 -3))
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[TMP3:%.*]] = add <4 x i32> [[TMP2]], [[VRSHR_N1]]
-// CHECK: ret <4 x i32> [[TMP3]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vrsraq_n_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.urshl.v4i32(<4 x i32> [[B]], <4 x i32> splat (i32 -3))
+// CHECK-NEXT: [[TMP0:%.*]] = add <4 x i32> [[A]], [[VRSHR_N1]]
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
uint32x4_t test_vrsraq_n_u32(uint32x4_t a, uint32x4_t b) {
return vrsraq_n_u32(a, b, 3);
}
-// CHECK-LABEL: @test_vrsraq_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[VRSHR_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64> [[VRSHR_N]], <2 x i64> splat (i64 -3))
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[TMP3:%.*]] = add <2 x i64> [[TMP2]], [[VRSHR_N1]]
-// CHECK: ret <2 x i64> [[TMP3]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vrsraq_n_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64> [[B]], <2 x i64> splat (i64 -3))
+// CHECK-NEXT: [[TMP0:%.*]] = add <2 x i64> [[A]], [[VRSHR_N1]]
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
uint64x2_t test_vrsraq_n_u64(uint64x2_t a, uint64x2_t b) {
return vrsraq_n_u64(a, b, 3);
}
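// vrsra_n_* combines the two patterns above: the rounding-shift intrinsic
// (srshl/urshl by splat(-n)) followed by an ordinary add. A hypothetical
// reference using the component intrinsics:
static inline int16x4_t vrsra_n_s16_ref(int16x4_t a, int16x4_t b) {
  return vadd_s16(a, vrshr_n_s16(b, 3)); // srshl + add, as checked above
}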
-// CHECK-LABEL: @test_vsri_n_s8(
-// CHECK: [[VSRI_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.vsri.v8i8(<8 x i8> %a, <8 x i8> %b, i32 3)
-// CHECK: ret <8 x i8> [[VSRI_N]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vsri_n_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRI_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.vsri.v8i8(<8 x i8> [[A]], <8 x i8> [[B]], i32 3)
+// CHECK-NEXT: ret <8 x i8> [[VSRI_N]]
+//
int8x8_t test_vsri_n_s8(int8x8_t a, int8x8_t b) {
return vsri_n_s8(a, b, 3);
}
-// CHECK-LABEL: @test_vsri_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VSRI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VSRI_N2:%.*]] = call <4 x i16> @llvm.aarch64.neon.vsri.v4i16(<4 x i16> [[VSRI_N]], <4 x i16> [[VSRI_N1]], i32 3)
-// CHECK: ret <4 x i16> [[VSRI_N2]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vsri_n_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRI_N2:%.*]] = call <4 x i16> @llvm.aarch64.neon.vsri.v4i16(<4 x i16> [[A]], <4 x i16> [[B]], i32 3)
+// CHECK-NEXT: ret <4 x i16> [[VSRI_N2]]
+//
int16x4_t test_vsri_n_s16(int16x4_t a, int16x4_t b) {
return vsri_n_s16(a, b, 3);
}
-// CHECK-LABEL: @test_vsri_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VSRI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VSRI_N2:%.*]] = call <2 x i32> @llvm.aarch64.neon.vsri.v2i32(<2 x i32> [[VSRI_N]], <2 x i32> [[VSRI_N1]], i32 3)
-// CHECK: ret <2 x i32> [[VSRI_N2]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vsri_n_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRI_N2:%.*]] = call <2 x i32> @llvm.aarch64.neon.vsri.v2i32(<2 x i32> [[A]], <2 x i32> [[B]], i32 3)
+// CHECK-NEXT: ret <2 x i32> [[VSRI_N2]]
+//
int32x2_t test_vsri_n_s32(int32x2_t a, int32x2_t b) {
return vsri_n_s32(a, b, 3);
}
-// CHECK-LABEL: @test_vsriq_n_s8(
-// CHECK: [[VSRI_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.vsri.v16i8(<16 x i8> %a, <16 x i8> %b, i32 3)
-// CHECK: ret <16 x i8> [[VSRI_N]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vsriq_n_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRI_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.vsri.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], i32 3)
+// CHECK-NEXT: ret <16 x i8> [[VSRI_N]]
+//
int8x16_t test_vsriq_n_s8(int8x16_t a, int8x16_t b) {
return vsriq_n_s8(a, b, 3);
}
-// CHECK-LABEL: @test_vsriq_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VSRI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VSRI_N2:%.*]] = call <8 x i16> @llvm.aarch64.neon.vsri.v8i16(<8 x i16> [[VSRI_N]], <8 x i16> [[VSRI_N1]], i32 3)
-// CHECK: ret <8 x i16> [[VSRI_N2]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vsriq_n_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRI_N2:%.*]] = call <8 x i16> @llvm.aarch64.neon.vsri.v8i16(<8 x i16> [[A]], <8 x i16> [[B]], i32 3)
+// CHECK-NEXT: ret <8 x i16> [[VSRI_N2]]
+//
int16x8_t test_vsriq_n_s16(int16x8_t a, int16x8_t b) {
return vsriq_n_s16(a, b, 3);
}
-// CHECK-LABEL: @test_vsriq_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VSRI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VSRI_N2:%.*]] = call <4 x i32> @llvm.aarch64.neon.vsri.v4i32(<4 x i32> [[VSRI_N]], <4 x i32> [[VSRI_N1]], i32 3)
-// CHECK: ret <4 x i32> [[VSRI_N2]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vsriq_n_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRI_N2:%.*]] = call <4 x i32> @llvm.aarch64.neon.vsri.v4i32(<4 x i32> [[A]], <4 x i32> [[B]], i32 3)
+// CHECK-NEXT: ret <4 x i32> [[VSRI_N2]]
+//
int32x4_t test_vsriq_n_s32(int32x4_t a, int32x4_t b) {
return vsriq_n_s32(a, b, 3);
}
-// CHECK-LABEL: @test_vsriq_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VSRI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[VSRI_N2:%.*]] = call <2 x i64> @llvm.aarch64.neon.vsri.v2i64(<2 x i64> [[VSRI_N]], <2 x i64> [[VSRI_N1]], i32 3)
-// CHECK: ret <2 x i64> [[VSRI_N2]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vsriq_n_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRI_N2:%.*]] = call <2 x i64> @llvm.aarch64.neon.vsri.v2i64(<2 x i64> [[A]], <2 x i64> [[B]], i32 3)
+// CHECK-NEXT: ret <2 x i64> [[VSRI_N2]]
+//
int64x2_t test_vsriq_n_s64(int64x2_t a, int64x2_t b) {
return vsriq_n_s64(a, b, 3);
}
-// CHECK-LABEL: @test_vsri_n_u8(
-// CHECK: [[VSRI_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.vsri.v8i8(<8 x i8> %a, <8 x i8> %b, i32 3)
-// CHECK: ret <8 x i8> [[VSRI_N]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vsri_n_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRI_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.vsri.v8i8(<8 x i8> [[A]], <8 x i8> [[B]], i32 3)
+// CHECK-NEXT: ret <8 x i8> [[VSRI_N]]
+//
uint8x8_t test_vsri_n_u8(uint8x8_t a, uint8x8_t b) {
return vsri_n_u8(a, b, 3);
}
-// CHECK-LABEL: @test_vsri_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VSRI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VSRI_N2:%.*]] = call <4 x i16> @llvm.aarch64.neon.vsri.v4i16(<4 x i16> [[VSRI_N]], <4 x i16> [[VSRI_N1]], i32 3)
-// CHECK: ret <4 x i16> [[VSRI_N2]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vsri_n_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRI_N2:%.*]] = call <4 x i16> @llvm.aarch64.neon.vsri.v4i16(<4 x i16> [[A]], <4 x i16> [[B]], i32 3)
+// CHECK-NEXT: ret <4 x i16> [[VSRI_N2]]
+//
uint16x4_t test_vsri_n_u16(uint16x4_t a, uint16x4_t b) {
return vsri_n_u16(a, b, 3);
}
-// CHECK-LABEL: @test_vsri_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VSRI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VSRI_N2:%.*]] = call <2 x i32> @llvm.aarch64.neon.vsri.v2i32(<2 x i32> [[VSRI_N]], <2 x i32> [[VSRI_N1]], i32 3)
-// CHECK: ret <2 x i32> [[VSRI_N2]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vsri_n_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRI_N2:%.*]] = call <2 x i32> @llvm.aarch64.neon.vsri.v2i32(<2 x i32> [[A]], <2 x i32> [[B]], i32 3)
+// CHECK-NEXT: ret <2 x i32> [[VSRI_N2]]
+//
uint32x2_t test_vsri_n_u32(uint32x2_t a, uint32x2_t b) {
return vsri_n_u32(a, b, 3);
}
-// CHECK-LABEL: @test_vsriq_n_u8(
-// CHECK: [[VSRI_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.vsri.v16i8(<16 x i8> %a, <16 x i8> %b, i32 3)
-// CHECK: ret <16 x i8> [[VSRI_N]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vsriq_n_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRI_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.vsri.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], i32 3)
+// CHECK-NEXT: ret <16 x i8> [[VSRI_N]]
+//
uint8x16_t test_vsriq_n_u8(uint8x16_t a, uint8x16_t b) {
return vsriq_n_u8(a, b, 3);
}
-// CHECK-LABEL: @test_vsriq_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VSRI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VSRI_N2:%.*]] = call <8 x i16> @llvm.aarch64.neon.vsri.v8i16(<8 x i16> [[VSRI_N]], <8 x i16> [[VSRI_N1]], i32 3)
-// CHECK: ret <8 x i16> [[VSRI_N2]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vsriq_n_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRI_N2:%.*]] = call <8 x i16> @llvm.aarch64.neon.vsri.v8i16(<8 x i16> [[A]], <8 x i16> [[B]], i32 3)
+// CHECK-NEXT: ret <8 x i16> [[VSRI_N2]]
+//
uint16x8_t test_vsriq_n_u16(uint16x8_t a, uint16x8_t b) {
return vsriq_n_u16(a, b, 3);
}
-// CHECK-LABEL: @test_vsriq_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VSRI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VSRI_N2:%.*]] = call <4 x i32> @llvm.aarch64.neon.vsri.v4i32(<4 x i32> [[VSRI_N]], <4 x i32> [[VSRI_N1]], i32 3)
-// CHECK: ret <4 x i32> [[VSRI_N2]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vsriq_n_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRI_N2:%.*]] = call <4 x i32> @llvm.aarch64.neon.vsri.v4i32(<4 x i32> [[A]], <4 x i32> [[B]], i32 3)
+// CHECK-NEXT: ret <4 x i32> [[VSRI_N2]]
+//
uint32x4_t test_vsriq_n_u32(uint32x4_t a, uint32x4_t b) {
return vsriq_n_u32(a, b, 3);
}
-// CHECK-LABEL: @test_vsriq_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VSRI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[VSRI_N2:%.*]] = call <2 x i64> @llvm.aarch64.neon.vsri.v2i64(<2 x i64> [[VSRI_N]], <2 x i64> [[VSRI_N1]], i32 3)
-// CHECK: ret <2 x i64> [[VSRI_N2]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vsriq_n_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRI_N2:%.*]] = call <2 x i64> @llvm.aarch64.neon.vsri.v2i64(<2 x i64> [[A]], <2 x i64> [[B]], i32 3)
+// CHECK-NEXT: ret <2 x i64> [[VSRI_N2]]
+//
uint64x2_t test_vsriq_n_u64(uint64x2_t a, uint64x2_t b) {
return vsriq_n_u64(a, b, 3);
}
-// CHECK-LABEL: @test_vsri_n_p8(
-// CHECK: [[VSRI_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.vsri.v8i8(<8 x i8> %a, <8 x i8> %b, i32 3)
-// CHECK: ret <8 x i8> [[VSRI_N]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vsri_n_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRI_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.vsri.v8i8(<8 x i8> [[A]], <8 x i8> [[B]], i32 3)
+// CHECK-NEXT: ret <8 x i8> [[VSRI_N]]
+//
poly8x8_t test_vsri_n_p8(poly8x8_t a, poly8x8_t b) {
return vsri_n_p8(a, b, 3);
}
-// CHECK-LABEL: @test_vsri_n_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VSRI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VSRI_N2:%.*]] = call <4 x i16> @llvm.aarch64.neon.vsri.v4i16(<4 x i16> [[VSRI_N]], <4 x i16> [[VSRI_N1]], i32 15)
-// CHECK: ret <4 x i16> [[VSRI_N2]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vsri_n_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRI_N2:%.*]] = call <4 x i16> @llvm.aarch64.neon.vsri.v4i16(<4 x i16> [[A]], <4 x i16> [[B]], i32 15)
+// CHECK-NEXT: ret <4 x i16> [[VSRI_N2]]
+//
poly16x4_t test_vsri_n_p16(poly16x4_t a, poly16x4_t b) {
return vsri_n_p16(a, b, 15);
}
-// CHECK-LABEL: @test_vsriq_n_p8(
-// CHECK: [[VSRI_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.vsri.v16i8(<16 x i8> %a, <16 x i8> %b, i32 3)
-// CHECK: ret <16 x i8> [[VSRI_N]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vsriq_n_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRI_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.vsri.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], i32 3)
+// CHECK-NEXT: ret <16 x i8> [[VSRI_N]]
+//
poly8x16_t test_vsriq_n_p8(poly8x16_t a, poly8x16_t b) {
return vsriq_n_p8(a, b, 3);
}
-// CHECK-LABEL: @test_vsriq_n_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VSRI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VSRI_N2:%.*]] = call <8 x i16> @llvm.aarch64.neon.vsri.v8i16(<8 x i16> [[VSRI_N]], <8 x i16> [[VSRI_N1]], i32 15)
-// CHECK: ret <8 x i16> [[VSRI_N2]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vsriq_n_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRI_N2:%.*]] = call <8 x i16> @llvm.aarch64.neon.vsri.v8i16(<8 x i16> [[A]], <8 x i16> [[B]], i32 15)
+// CHECK-NEXT: ret <8 x i16> [[VSRI_N2]]
+//
poly16x8_t test_vsriq_n_p16(poly16x8_t a, poly16x8_t b) {
return vsriq_n_p16(a, b, 15);
}
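// vsri_n_* remains a single @llvm.aarch64.neon.vsri call in the checks. As a
// scalar model (an assumption for illustration, not part of the test file):
// SRI shifts each lane of b right by n and inserts the result into a,
// preserving the top n bits of each lane of a.
static inline uint8_t sri8_ref(uint8_t a, uint8_t b, int n) {
  uint8_t keep = (uint8_t)(0xFFu << (8 - n)); // top n bits kept from a
  return (uint8_t)((a & keep) | ((uint8_t)(b >> n) & (uint8_t)~keep));
}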
-// CHECK-LABEL: @test_vsli_n_s8(
-// CHECK: [[VSLI_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.vsli.v8i8(<8 x i8> %a, <8 x i8> %b, i32 3)
-// CHECK: ret <8 x i8> [[VSLI_N]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vsli_n_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.vsli.v8i8(<8 x i8> [[A]], <8 x i8> [[B]], i32 3)
+// CHECK-NEXT: ret <8 x i8> [[VSLI_N]]
+//
int8x8_t test_vsli_n_s8(int8x8_t a, int8x8_t b) {
return vsli_n_s8(a, b, 3);
}
-// CHECK-LABEL: @test_vsli_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VSLI_N2:%.*]] = call <4 x i16> @llvm.aarch64.neon.vsli.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], i32 3)
-// CHECK: ret <4 x i16> [[VSLI_N2]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vsli_n_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <4 x i16> @llvm.aarch64.neon.vsli.v4i16(<4 x i16> [[A]], <4 x i16> [[B]], i32 3)
+// CHECK-NEXT: ret <4 x i16> [[VSLI_N2]]
+//
int16x4_t test_vsli_n_s16(int16x4_t a, int16x4_t b) {
return vsli_n_s16(a, b, 3);
}
-// CHECK-LABEL: @test_vsli_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VSLI_N2:%.*]] = call <2 x i32> @llvm.aarch64.neon.vsli.v2i32(<2 x i32> [[VSLI_N]], <2 x i32> [[VSLI_N1]], i32 3)
-// CHECK: ret <2 x i32> [[VSLI_N2]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vsli_n_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <2 x i32> @llvm.aarch64.neon.vsli.v2i32(<2 x i32> [[A]], <2 x i32> [[B]], i32 3)
+// CHECK-NEXT: ret <2 x i32> [[VSLI_N2]]
+//
int32x2_t test_vsli_n_s32(int32x2_t a, int32x2_t b) {
return vsli_n_s32(a, b, 3);
}
-// CHECK-LABEL: @test_vsliq_n_s8(
-// CHECK: [[VSLI_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.vsli.v16i8(<16 x i8> %a, <16 x i8> %b, i32 3)
-// CHECK: ret <16 x i8> [[VSLI_N]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vsliq_n_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.vsli.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], i32 3)
+// CHECK-NEXT: ret <16 x i8> [[VSLI_N]]
+//
int8x16_t test_vsliq_n_s8(int8x16_t a, int8x16_t b) {
return vsliq_n_s8(a, b, 3);
}
-// CHECK-LABEL: @test_vsliq_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VSLI_N2:%.*]] = call <8 x i16> @llvm.aarch64.neon.vsli.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], i32 3)
-// CHECK: ret <8 x i16> [[VSLI_N2]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vsliq_n_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <8 x i16> @llvm.aarch64.neon.vsli.v8i16(<8 x i16> [[A]], <8 x i16> [[B]], i32 3)
+// CHECK-NEXT: ret <8 x i16> [[VSLI_N2]]
+//
int16x8_t test_vsliq_n_s16(int16x8_t a, int16x8_t b) {
return vsliq_n_s16(a, b, 3);
}
-// CHECK-LABEL: @test_vsliq_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VSLI_N2:%.*]] = call <4 x i32> @llvm.aarch64.neon.vsli.v4i32(<4 x i32> [[VSLI_N]], <4 x i32> [[VSLI_N1]], i32 3)
-// CHECK: ret <4 x i32> [[VSLI_N2]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vsliq_n_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <4 x i32> @llvm.aarch64.neon.vsli.v4i32(<4 x i32> [[A]], <4 x i32> [[B]], i32 3)
+// CHECK-NEXT: ret <4 x i32> [[VSLI_N2]]
+//
int32x4_t test_vsliq_n_s32(int32x4_t a, int32x4_t b) {
return vsliq_n_s32(a, b, 3);
}
-// CHECK-LABEL: @test_vsliq_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[VSLI_N2:%.*]] = call <2 x i64> @llvm.aarch64.neon.vsli.v2i64(<2 x i64> [[VSLI_N]], <2 x i64> [[VSLI_N1]], i32 3)
-// CHECK: ret <2 x i64> [[VSLI_N2]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vsliq_n_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <2 x i64> @llvm.aarch64.neon.vsli.v2i64(<2 x i64> [[A]], <2 x i64> [[B]], i32 3)
+// CHECK-NEXT: ret <2 x i64> [[VSLI_N2]]
+//
int64x2_t test_vsliq_n_s64(int64x2_t a, int64x2_t b) {
return vsliq_n_s64(a, b, 3);
}
-// CHECK-LABEL: @test_vsli_n_u8(
-// CHECK: [[VSLI_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.vsli.v8i8(<8 x i8> %a, <8 x i8> %b, i32 3)
-// CHECK: ret <8 x i8> [[VSLI_N]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vsli_n_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.vsli.v8i8(<8 x i8> [[A]], <8 x i8> [[B]], i32 3)
+// CHECK-NEXT: ret <8 x i8> [[VSLI_N]]
+//
uint8x8_t test_vsli_n_u8(uint8x8_t a, uint8x8_t b) {
return vsli_n_u8(a, b, 3);
}
-// CHECK-LABEL: @test_vsli_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VSLI_N2:%.*]] = call <4 x i16> @llvm.aarch64.neon.vsli.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], i32 3)
-// CHECK: ret <4 x i16> [[VSLI_N2]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vsli_n_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <4 x i16> @llvm.aarch64.neon.vsli.v4i16(<4 x i16> [[A]], <4 x i16> [[B]], i32 3)
+// CHECK-NEXT: ret <4 x i16> [[VSLI_N2]]
+//
uint16x4_t test_vsli_n_u16(uint16x4_t a, uint16x4_t b) {
return vsli_n_u16(a, b, 3);
}
-// CHECK-LABEL: @test_vsli_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VSLI_N2:%.*]] = call <2 x i32> @llvm.aarch64.neon.vsli.v2i32(<2 x i32> [[VSLI_N]], <2 x i32> [[VSLI_N1]], i32 3)
-// CHECK: ret <2 x i32> [[VSLI_N2]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vsli_n_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <2 x i32> @llvm.aarch64.neon.vsli.v2i32(<2 x i32> [[A]], <2 x i32> [[B]], i32 3)
+// CHECK-NEXT: ret <2 x i32> [[VSLI_N2]]
+//
uint32x2_t test_vsli_n_u32(uint32x2_t a, uint32x2_t b) {
return vsli_n_u32(a, b, 3);
}
-// CHECK-LABEL: @test_vsliq_n_u8(
-// CHECK: [[VSLI_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.vsli.v16i8(<16 x i8> %a, <16 x i8> %b, i32 3)
-// CHECK: ret <16 x i8> [[VSLI_N]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vsliq_n_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.vsli.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], i32 3)
+// CHECK-NEXT: ret <16 x i8> [[VSLI_N]]
+//
uint8x16_t test_vsliq_n_u8(uint8x16_t a, uint8x16_t b) {
return vsliq_n_u8(a, b, 3);
}
-// CHECK-LABEL: @test_vsliq_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VSLI_N2:%.*]] = call <8 x i16> @llvm.aarch64.neon.vsli.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], i32 3)
-// CHECK: ret <8 x i16> [[VSLI_N2]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vsliq_n_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <8 x i16> @llvm.aarch64.neon.vsli.v8i16(<8 x i16> [[A]], <8 x i16> [[B]], i32 3)
+// CHECK-NEXT: ret <8 x i16> [[VSLI_N2]]
+//
uint16x8_t test_vsliq_n_u16(uint16x8_t a, uint16x8_t b) {
return vsliq_n_u16(a, b, 3);
}
-// CHECK-LABEL: @test_vsliq_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VSLI_N2:%.*]] = call <4 x i32> @llvm.aarch64.neon.vsli.v4i32(<4 x i32> [[VSLI_N]], <4 x i32> [[VSLI_N1]], i32 3)
-// CHECK: ret <4 x i32> [[VSLI_N2]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vsliq_n_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <4 x i32> @llvm.aarch64.neon.vsli.v4i32(<4 x i32> [[A]], <4 x i32> [[B]], i32 3)
+// CHECK-NEXT: ret <4 x i32> [[VSLI_N2]]
+//
uint32x4_t test_vsliq_n_u32(uint32x4_t a, uint32x4_t b) {
return vsliq_n_u32(a, b, 3);
}
-// CHECK-LABEL: @test_vsliq_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[VSLI_N2:%.*]] = call <2 x i64> @llvm.aarch64.neon.vsli.v2i64(<2 x i64> [[VSLI_N]], <2 x i64> [[VSLI_N1]], i32 3)
-// CHECK: ret <2 x i64> [[VSLI_N2]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vsliq_n_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <2 x i64> @llvm.aarch64.neon.vsli.v2i64(<2 x i64> [[A]], <2 x i64> [[B]], i32 3)
+// CHECK-NEXT: ret <2 x i64> [[VSLI_N2]]
+//
uint64x2_t test_vsliq_n_u64(uint64x2_t a, uint64x2_t b) {
return vsliq_n_u64(a, b, 3);
}
-// CHECK-LABEL: @test_vsli_n_p8(
-// CHECK: [[VSLI_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.vsli.v8i8(<8 x i8> %a, <8 x i8> %b, i32 3)
-// CHECK: ret <8 x i8> [[VSLI_N]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vsli_n_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.vsli.v8i8(<8 x i8> [[A]], <8 x i8> [[B]], i32 3)
+// CHECK-NEXT: ret <8 x i8> [[VSLI_N]]
+//
poly8x8_t test_vsli_n_p8(poly8x8_t a, poly8x8_t b) {
return vsli_n_p8(a, b, 3);
}
-// CHECK-LABEL: @test_vsli_n_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VSLI_N2:%.*]] = call <4 x i16> @llvm.aarch64.neon.vsli.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], i32 15)
-// CHECK: ret <4 x i16> [[VSLI_N2]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vsli_n_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <4 x i16> @llvm.aarch64.neon.vsli.v4i16(<4 x i16> [[A]], <4 x i16> [[B]], i32 15)
+// CHECK-NEXT: ret <4 x i16> [[VSLI_N2]]
+//
poly16x4_t test_vsli_n_p16(poly16x4_t a, poly16x4_t b) {
return vsli_n_p16(a, b, 15);
}
-// CHECK-LABEL: @test_vsliq_n_p8(
-// CHECK: [[VSLI_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.vsli.v16i8(<16 x i8> %a, <16 x i8> %b, i32 3)
-// CHECK: ret <16 x i8> [[VSLI_N]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vsliq_n_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.vsli.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], i32 3)
+// CHECK-NEXT: ret <16 x i8> [[VSLI_N]]
+//
poly8x16_t test_vsliq_n_p8(poly8x16_t a, poly8x16_t b) {
return vsliq_n_p8(a, b, 3);
}
-// CHECK-LABEL: @test_vsliq_n_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VSLI_N2:%.*]] = call <8 x i16> @llvm.aarch64.neon.vsli.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], i32 15)
-// CHECK: ret <8 x i16> [[VSLI_N2]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vsliq_n_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <8 x i16> @llvm.aarch64.neon.vsli.v8i16(<8 x i16> [[A]], <8 x i16> [[B]], i32 15)
+// CHECK-NEXT: ret <8 x i16> [[VSLI_N2]]
+//
poly16x8_t test_vsliq_n_p16(poly16x8_t a, poly16x8_t b) {
return vsliq_n_p16(a, b, 15);
}
-// CHECK-LABEL: @test_vqshlu_n_s8(
-// CHECK: [[VQSHLU_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshlu.v8i8(<8 x i8> %a, <8 x i8> splat (i8 3))
-// CHECK: ret <8 x i8> [[VQSHLU_N]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vqshlu_n_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHLU_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshlu.v8i8(<8 x i8> [[A]], <8 x i8> splat (i8 3))
+// CHECK-NEXT: ret <8 x i8> [[VQSHLU_N]]
+//
uint8x8_t test_vqshlu_n_s8(int8x8_t a) {
return vqshlu_n_s8(a, 3);
}
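// vqshlu_n (SQSHLU, signed saturating shift left unsigned) shifts a signed
// lane left and saturates the result into the unsigned range of the same
// width; negative inputs clamp to zero. Scalar model of one i8 lane
// (illustrative assumption only):
//
//   static inline uint8_t sqshlu8(int8_t a, unsigned n) {
//     if (a < 0) return 0;                 // negative saturates to 0
//     int32_t v = (int32_t)a << n;         // widen so the shift cannot wrap
//     return v > 255 ? 255 : (uint8_t)v;   // clamp to 2^8 - 1
//   }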
-// CHECK-LABEL: @test_vqshlu_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VQSHLU_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VQSHLU_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshlu.v4i16(<4 x i16> [[VQSHLU_N]], <4 x i16> splat (i16 3))
-// CHECK: ret <4 x i16> [[VQSHLU_N1]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vqshlu_n_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHLU_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshlu.v4i16(<4 x i16> [[A]], <4 x i16> splat (i16 3))
+// CHECK-NEXT: ret <4 x i16> [[VQSHLU_N1]]
+//
uint16x4_t test_vqshlu_n_s16(int16x4_t a) {
return vqshlu_n_s16(a, 3);
}
-// CHECK-LABEL: @test_vqshlu_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VQSHLU_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VQSHLU_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshlu.v2i32(<2 x i32> [[VQSHLU_N]], <2 x i32> splat (i32 3))
-// CHECK: ret <2 x i32> [[VQSHLU_N1]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vqshlu_n_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHLU_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshlu.v2i32(<2 x i32> [[A]], <2 x i32> splat (i32 3))
+// CHECK-NEXT: ret <2 x i32> [[VQSHLU_N1]]
+//
uint32x2_t test_vqshlu_n_s32(int32x2_t a) {
return vqshlu_n_s32(a, 3);
}
-// CHECK-LABEL: @test_vqshluq_n_s8(
-// CHECK: [[VQSHLU_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqshlu.v16i8(<16 x i8> %a, <16 x i8> splat (i8 3))
-// CHECK: ret <16 x i8> [[VQSHLU_N]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vqshluq_n_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHLU_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqshlu.v16i8(<16 x i8> [[A]], <16 x i8> splat (i8 3))
+// CHECK-NEXT: ret <16 x i8> [[VQSHLU_N]]
+//
uint8x16_t test_vqshluq_n_s8(int8x16_t a) {
return vqshluq_n_s8(a, 3);
}
-// CHECK-LABEL: @test_vqshluq_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VQSHLU_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VQSHLU_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqshlu.v8i16(<8 x i16> [[VQSHLU_N]], <8 x i16> splat (i16 3))
-// CHECK: ret <8 x i16> [[VQSHLU_N1]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vqshluq_n_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHLU_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqshlu.v8i16(<8 x i16> [[A]], <8 x i16> splat (i16 3))
+// CHECK-NEXT: ret <8 x i16> [[VQSHLU_N1]]
+//
uint16x8_t test_vqshluq_n_s16(int16x8_t a) {
return vqshluq_n_s16(a, 3);
}
-// CHECK-LABEL: @test_vqshluq_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VQSHLU_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQSHLU_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqshlu.v4i32(<4 x i32> [[VQSHLU_N]], <4 x i32> splat (i32 3))
-// CHECK: ret <4 x i32> [[VQSHLU_N1]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vqshluq_n_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHLU_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqshlu.v4i32(<4 x i32> [[A]], <4 x i32> splat (i32 3))
+// CHECK-NEXT: ret <4 x i32> [[VQSHLU_N1]]
+//
uint32x4_t test_vqshluq_n_s32(int32x4_t a) {
return vqshluq_n_s32(a, 3);
}
-// CHECK-LABEL: @test_vqshluq_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VQSHLU_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VQSHLU_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqshlu.v2i64(<2 x i64> [[VQSHLU_N]], <2 x i64> splat (i64 3))
-// CHECK: ret <2 x i64> [[VQSHLU_N1]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vqshluq_n_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHLU_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqshlu.v2i64(<2 x i64> [[A]], <2 x i64> splat (i64 3))
+// CHECK-NEXT: ret <2 x i64> [[VQSHLU_N1]]
+//
uint64x2_t test_vqshluq_n_s64(int64x2_t a) {
return vqshluq_n_s64(a, 3);
}
-// CHECK-LABEL: @test_vshrn_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[TMP2:%.*]] = ashr <8 x i16> [[TMP1]], splat (i16 3)
-// CHECK: [[VSHRN_N:%.*]] = trunc <8 x i16> [[TMP2]] to <8 x i8>
-// CHECK: ret <8 x i8> [[VSHRN_N]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vshrn_n_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = lshr <8 x i16> [[A]], splat (i16 3)
+// CHECK-NEXT: [[VSHRN_N:%.*]] = trunc <8 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[VSHRN_N]]
+//
int8x8_t test_vshrn_n_s16(int16x8_t a) {
return vshrn_n_s16(a, 3);
}
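// vshrn_n shifts each lane right by n (1 <= n <= half the lane width) and
// truncates to half width. The trunc discards every bit in which an
// arithmetic and a logical right shift could differ, so the canonical IR
// uses lshr+trunc for signed and unsigned inputs alike; that is why the
// regenerated checks above match lshr where the old ones matched ashr.
// Scalar model of one s16 lane (illustrative only):
//
//   static inline int8_t shrn16(int16_t a, unsigned n) {
//     return (int8_t)(uint8_t)((uint16_t)a >> n);  // lshr, then trunc
//   }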
-// CHECK-LABEL: @test_vshrn_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], splat (i32 9)
-// CHECK: [[VSHRN_N:%.*]] = trunc <4 x i32> [[TMP2]] to <4 x i16>
-// CHECK: ret <4 x i16> [[VSHRN_N]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vshrn_n_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = lshr <4 x i32> [[A]], splat (i32 9)
+// CHECK-NEXT: [[VSHRN_N:%.*]] = trunc <4 x i32> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[VSHRN_N]]
+//
int16x4_t test_vshrn_n_s32(int32x4_t a) {
return vshrn_n_s32(a, 9);
}
-// CHECK-LABEL: @test_vshrn_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[TMP2:%.*]] = ashr <2 x i64> [[TMP1]], splat (i64 19)
-// CHECK: [[VSHRN_N:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i32>
-// CHECK: ret <2 x i32> [[VSHRN_N]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vshrn_n_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = lshr <2 x i64> [[A]], splat (i64 19)
+// CHECK-NEXT: [[VSHRN_N:%.*]] = trunc <2 x i64> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[VSHRN_N]]
+//
int32x2_t test_vshrn_n_s64(int64x2_t a) {
return vshrn_n_s64(a, 19);
}
-// CHECK-LABEL: @test_vshrn_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[TMP2:%.*]] = lshr <8 x i16> [[TMP1]], splat (i16 3)
-// CHECK: [[VSHRN_N:%.*]] = trunc <8 x i16> [[TMP2]] to <8 x i8>
-// CHECK: ret <8 x i8> [[VSHRN_N]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vshrn_n_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = lshr <8 x i16> [[A]], splat (i16 3)
+// CHECK-NEXT: [[VSHRN_N:%.*]] = trunc <8 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[VSHRN_N]]
+//
uint8x8_t test_vshrn_n_u16(uint16x8_t a) {
return vshrn_n_u16(a, 3);
}
-// CHECK-LABEL: @test_vshrn_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[TMP2:%.*]] = lshr <4 x i32> [[TMP1]], splat (i32 9)
-// CHECK: [[VSHRN_N:%.*]] = trunc <4 x i32> [[TMP2]] to <4 x i16>
-// CHECK: ret <4 x i16> [[VSHRN_N]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vshrn_n_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = lshr <4 x i32> [[A]], splat (i32 9)
+// CHECK-NEXT: [[VSHRN_N:%.*]] = trunc <4 x i32> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[VSHRN_N]]
+//
uint16x4_t test_vshrn_n_u32(uint32x4_t a) {
return vshrn_n_u32(a, 9);
}
-// CHECK-LABEL: @test_vshrn_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[TMP2:%.*]] = lshr <2 x i64> [[TMP1]], splat (i64 19)
-// CHECK: [[VSHRN_N:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i32>
-// CHECK: ret <2 x i32> [[VSHRN_N]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vshrn_n_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = lshr <2 x i64> [[A]], splat (i64 19)
+// CHECK-NEXT: [[VSHRN_N:%.*]] = trunc <2 x i64> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[VSHRN_N]]
+//
uint32x2_t test_vshrn_n_u64(uint64x2_t a) {
return vshrn_n_u64(a, 19);
}
-// CHECK-LABEL: @test_vshrn_high_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[TMP2:%.*]] = ashr <8 x i16> [[TMP1]], splat (i16 3)
-// CHECK: [[VSHRN_N:%.*]] = trunc <8 x i16> [[TMP2]] to <8 x i8>
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VSHRN_N]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vshrn_high_n_s16(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = lshr <8 x i16> [[B]], splat (i16 3)
+// CHECK-NEXT: [[VSHRN_N:%.*]] = trunc <8 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[VSHRN_N]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
int8x16_t test_vshrn_high_n_s16(int8x8_t a, int16x8_t b) {
return vshrn_high_n_s16(a, b, 3);
}
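// The _high variants keep A as the low half of the result and put the newly
// narrowed lanes of B in the high half, which is what the shufflevector in
// the checks above expresses. Per the ACLE the call is equivalent to
// narrowing and recombining by hand:
//
//   int8x16_t hi  = vshrn_high_n_s16(a, b, 3);
//   int8x16_t ref = vcombine_s8(a, vshrn_n_s16(b, 3));  // same lanes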
-// CHECK-LABEL: @test_vshrn_high_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], splat (i32 9)
-// CHECK: [[VSHRN_N:%.*]] = trunc <4 x i32> [[TMP2]] to <4 x i16>
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VSHRN_N]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vshrn_high_n_s32(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = lshr <4 x i32> [[B]], splat (i32 9)
+// CHECK-NEXT: [[VSHRN_N:%.*]] = trunc <4 x i32> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[VSHRN_N]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
int16x8_t test_vshrn_high_n_s32(int16x4_t a, int32x4_t b) {
return vshrn_high_n_s32(a, b, 9);
}
-// CHECK-LABEL: @test_vshrn_high_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[TMP2:%.*]] = ashr <2 x i64> [[TMP1]], splat (i64 19)
-// CHECK: [[VSHRN_N:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i32>
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VSHRN_N]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK: ret <4 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vshrn_high_n_s64(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = lshr <2 x i64> [[B]], splat (i64 19)
+// CHECK-NEXT: [[VSHRN_N:%.*]] = trunc <2 x i64> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[VSHRN_N]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]]
+//
int32x4_t test_vshrn_high_n_s64(int32x2_t a, int64x2_t b) {
return vshrn_high_n_s64(a, b, 19);
}
-// CHECK-LABEL: @test_vshrn_high_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[TMP2:%.*]] = lshr <8 x i16> [[TMP1]], splat (i16 3)
-// CHECK: [[VSHRN_N:%.*]] = trunc <8 x i16> [[TMP2]] to <8 x i8>
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VSHRN_N]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vshrn_high_n_u16(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = lshr <8 x i16> [[B]], splat (i16 3)
+// CHECK-NEXT: [[VSHRN_N:%.*]] = trunc <8 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[VSHRN_N]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
uint8x16_t test_vshrn_high_n_u16(uint8x8_t a, uint16x8_t b) {
return vshrn_high_n_u16(a, b, 3);
}
-// CHECK-LABEL: @test_vshrn_high_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[TMP2:%.*]] = lshr <4 x i32> [[TMP1]], splat (i32 9)
-// CHECK: [[VSHRN_N:%.*]] = trunc <4 x i32> [[TMP2]] to <4 x i16>
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VSHRN_N]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vshrn_high_n_u32(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = lshr <4 x i32> [[B]], splat (i32 9)
+// CHECK-NEXT: [[VSHRN_N:%.*]] = trunc <4 x i32> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[VSHRN_N]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
uint16x8_t test_vshrn_high_n_u32(uint16x4_t a, uint32x4_t b) {
return vshrn_high_n_u32(a, b, 9);
}
-// CHECK-LABEL: @test_vshrn_high_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[TMP2:%.*]] = lshr <2 x i64> [[TMP1]], splat (i64 19)
-// CHECK: [[VSHRN_N:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i32>
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VSHRN_N]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK: ret <4 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vshrn_high_n_u64(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = lshr <2 x i64> [[B]], splat (i64 19)
+// CHECK-NEXT: [[VSHRN_N:%.*]] = trunc <2 x i64> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[VSHRN_N]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]]
+//
uint32x4_t test_vshrn_high_n_u64(uint32x2_t a, uint64x2_t b) {
return vshrn_high_n_u64(a, b, 19);
}
-// CHECK-LABEL: @test_vqshrun_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VQSHRUN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrun.v8i8(<8 x i16> [[VQSHRUN_N]], i32 3)
-// CHECK: ret <8 x i8> [[VQSHRUN_N1]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vqshrun_n_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHRUN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrun.v8i8(<8 x i16> [[A]], i32 3)
+// CHECK-NEXT: ret <8 x i8> [[VQSHRUN_N1]]
+//
uint8x8_t test_vqshrun_n_s16(int16x8_t a) {
return vqshrun_n_s16(a, 3);
}
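// vqshrun_n (SQSHRUN) performs a signed arithmetic shift right by n, then
// saturates into the unsigned range of the narrower type. Scalar model of
// one s16 -> u8 lane (illustrative sketch):
//
//   static inline uint8_t sqshrun16(int16_t a, unsigned n) {
//     int v = a >> n;                      // arithmetic shift
//     if (v < 0)   return 0;               // unsigned saturation
//     if (v > 255) return 255;
//     return (uint8_t)v;
//   }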
-// CHECK-LABEL: @test_vqshrun_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQSHRUN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrun.v4i16(<4 x i32> [[VQSHRUN_N]], i32 9)
-// CHECK: ret <4 x i16> [[VQSHRUN_N1]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vqshrun_n_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHRUN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrun.v4i16(<4 x i32> [[A]], i32 9)
+// CHECK-NEXT: ret <4 x i16> [[VQSHRUN_N1]]
+//
uint16x4_t test_vqshrun_n_s32(int32x4_t a) {
return vqshrun_n_s32(a, 9);
}
-// CHECK-LABEL: @test_vqshrun_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VQSHRUN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshrun.v2i32(<2 x i64> [[VQSHRUN_N]], i32 19)
-// CHECK: ret <2 x i32> [[VQSHRUN_N1]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vqshrun_n_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHRUN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshrun.v2i32(<2 x i64> [[A]], i32 19)
+// CHECK-NEXT: ret <2 x i32> [[VQSHRUN_N1]]
+//
uint32x2_t test_vqshrun_n_s64(int64x2_t a) {
return vqshrun_n_s64(a, 19);
}
-// CHECK-LABEL: @test_vqshrun_high_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VQSHRUN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrun.v8i8(<8 x i16> [[VQSHRUN_N]], i32 3)
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VQSHRUN_N1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vqshrun_high_n_s16(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHRUN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrun.v8i8(<8 x i16> [[B]], i32 3)
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[VQSHRUN_N1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
int8x16_t test_vqshrun_high_n_s16(int8x8_t a, int16x8_t b) {
return vqshrun_high_n_s16(a, b, 3);
}
-// CHECK-LABEL: @test_vqshrun_high_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQSHRUN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrun.v4i16(<4 x i32> [[VQSHRUN_N]], i32 9)
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VQSHRUN_N1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vqshrun_high_n_s32(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHRUN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrun.v4i16(<4 x i32> [[B]], i32 9)
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[VQSHRUN_N1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
int16x8_t test_vqshrun_high_n_s32(int16x4_t a, int32x4_t b) {
return vqshrun_high_n_s32(a, b, 9);
}
-// CHECK-LABEL: @test_vqshrun_high_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VQSHRUN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshrun.v2i32(<2 x i64> [[VQSHRUN_N]], i32 19)
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VQSHRUN_N1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK: ret <4 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vqshrun_high_n_s64(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHRUN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshrun.v2i32(<2 x i64> [[B]], i32 19)
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[VQSHRUN_N1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]]
+//
int32x4_t test_vqshrun_high_n_s64(int32x2_t a, int64x2_t b) {
return vqshrun_high_n_s64(a, b, 19);
}
-// CHECK-LABEL: @test_vrshrn_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> [[VRSHRN_N]], i32 3)
-// CHECK: ret <8 x i8> [[VRSHRN_N1]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vrshrn_n_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> [[A]], i32 3)
+// CHECK-NEXT: ret <8 x i8> [[VRSHRN_N1]]
+//
int8x8_t test_vrshrn_n_s16(int16x8_t a) {
return vrshrn_n_s16(a, 3);
}
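// vrshrn_n (RSHRN) is the rounding form of vshrn_n: it adds the rounding
// constant 1 << (n-1) before shifting, then truncates to half width.
// Scalar model of one s16 lane (illustrative only):
//
//   static inline int8_t rshrn16(int16_t a, unsigned n) {
//     int32_t v = ((int32_t)a + (1 << (n - 1))) >> n;  // round half up
//     return (int8_t)(uint8_t)v;                       // trunc to 8 bits
//   }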
-// CHECK-LABEL: @test_vrshrn_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> [[VRSHRN_N]], i32 9)
-// CHECK: ret <4 x i16> [[VRSHRN_N1]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vrshrn_n_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> [[A]], i32 9)
+// CHECK-NEXT: ret <4 x i16> [[VRSHRN_N1]]
+//
int16x4_t test_vrshrn_n_s32(int32x4_t a) {
return vrshrn_n_s32(a, 9);
}
-// CHECK-LABEL: @test_vrshrn_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64> [[VRSHRN_N]], i32 19)
-// CHECK: ret <2 x i32> [[VRSHRN_N1]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vrshrn_n_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64> [[A]], i32 19)
+// CHECK-NEXT: ret <2 x i32> [[VRSHRN_N1]]
+//
int32x2_t test_vrshrn_n_s64(int64x2_t a) {
return vrshrn_n_s64(a, 19);
}
-// CHECK-LABEL: @test_vrshrn_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> [[VRSHRN_N]], i32 3)
-// CHECK: ret <8 x i8> [[VRSHRN_N1]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vrshrn_n_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> [[A]], i32 3)
+// CHECK-NEXT: ret <8 x i8> [[VRSHRN_N1]]
+//
uint8x8_t test_vrshrn_n_u16(uint16x8_t a) {
return vrshrn_n_u16(a, 3);
}
-// CHECK-LABEL: @test_vrshrn_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> [[VRSHRN_N]], i32 9)
-// CHECK: ret <4 x i16> [[VRSHRN_N1]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vrshrn_n_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> [[A]], i32 9)
+// CHECK-NEXT: ret <4 x i16> [[VRSHRN_N1]]
+//
uint16x4_t test_vrshrn_n_u32(uint32x4_t a) {
return vrshrn_n_u32(a, 9);
}
-// CHECK-LABEL: @test_vrshrn_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64> [[VRSHRN_N]], i32 19)
-// CHECK: ret <2 x i32> [[VRSHRN_N1]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vrshrn_n_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64> [[A]], i32 19)
+// CHECK-NEXT: ret <2 x i32> [[VRSHRN_N1]]
+//
uint32x2_t test_vrshrn_n_u64(uint64x2_t a) {
return vrshrn_n_u64(a, 19);
}
-// CHECK-LABEL: @test_vrshrn_high_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> [[VRSHRN_N]], i32 3)
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VRSHRN_N1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vrshrn_high_n_s16(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> [[B]], i32 3)
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[VRSHRN_N1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
int8x16_t test_vrshrn_high_n_s16(int8x8_t a, int16x8_t b) {
return vrshrn_high_n_s16(a, b, 3);
}
-// CHECK-LABEL: @test_vrshrn_high_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> [[VRSHRN_N]], i32 9)
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VRSHRN_N1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vrshrn_high_n_s32(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> [[B]], i32 9)
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[VRSHRN_N1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
int16x8_t test_vrshrn_high_n_s32(int16x4_t a, int32x4_t b) {
return vrshrn_high_n_s32(a, b, 9);
}
-// CHECK-LABEL: @test_vrshrn_high_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64> [[VRSHRN_N]], i32 19)
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VRSHRN_N1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK: ret <4 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vrshrn_high_n_s64(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64> [[B]], i32 19)
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[VRSHRN_N1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]]
+//
int32x4_t test_vrshrn_high_n_s64(int32x2_t a, int64x2_t b) {
return vrshrn_high_n_s64(a, b, 19);
}
-// CHECK-LABEL: @test_vrshrn_high_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> [[VRSHRN_N]], i32 3)
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VRSHRN_N1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vrshrn_high_n_u16(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> [[B]], i32 3)
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[VRSHRN_N1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
uint8x16_t test_vrshrn_high_n_u16(uint8x8_t a, uint16x8_t b) {
return vrshrn_high_n_u16(a, b, 3);
}
-// CHECK-LABEL: @test_vrshrn_high_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> [[VRSHRN_N]], i32 9)
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VRSHRN_N1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vrshrn_high_n_u32(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> [[B]], i32 9)
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[VRSHRN_N1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
uint16x8_t test_vrshrn_high_n_u32(uint16x4_t a, uint32x4_t b) {
return vrshrn_high_n_u32(a, b, 9);
}
-// CHECK-LABEL: @test_vrshrn_high_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64> [[VRSHRN_N]], i32 19)
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VRSHRN_N1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK: ret <4 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vrshrn_high_n_u64(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64> [[B]], i32 19)
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[VRSHRN_N1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]]
+//
uint32x4_t test_vrshrn_high_n_u64(uint32x2_t a, uint64x2_t b) {
return vrshrn_high_n_u64(a, b, 19);
}
-// CHECK-LABEL: @test_vqrshrun_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VQRSHRUN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrun.v8i8(<8 x i16> [[VQRSHRUN_N]], i32 3)
-// CHECK: ret <8 x i8> [[VQRSHRUN_N1]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vqrshrun_n_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHRUN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrun.v8i8(<8 x i16> [[A]], i32 3)
+// CHECK-NEXT: ret <8 x i8> [[VQRSHRUN_N1]]
+//
uint8x8_t test_vqrshrun_n_s16(int16x8_t a) {
return vqrshrun_n_s16(a, 3);
}
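// vqrshrun_n (SQRSHRUN) combines both effects: a rounded arithmetic shift
// right followed by saturation into the unsigned narrow range. Scalar model
// of one s16 -> u8 lane (illustrative sketch):
//
//   static inline uint8_t sqrshrun16(int16_t a, unsigned n) {
//     int32_t v = ((int32_t)a + (1 << (n - 1))) >> n;  // rounded shift
//     if (v < 0)   return 0;                           // unsigned saturation
//     if (v > 255) return 255;
//     return (uint8_t)v;
//   }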
-// CHECK-LABEL: @test_vqrshrun_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQRSHRUN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrun.v4i16(<4 x i32> [[VQRSHRUN_N]], i32 9)
-// CHECK: ret <4 x i16> [[VQRSHRUN_N1]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vqrshrun_n_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHRUN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrun.v4i16(<4 x i32> [[A]], i32 9)
+// CHECK-NEXT: ret <4 x i16> [[VQRSHRUN_N1]]
+//
uint16x4_t test_vqrshrun_n_s32(int32x4_t a) {
return vqrshrun_n_s32(a, 9);
}
-// CHECK-LABEL: @test_vqrshrun_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VQRSHRUN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshrun.v2i32(<2 x i64> [[VQRSHRUN_N]], i32 19)
-// CHECK: ret <2 x i32> [[VQRSHRUN_N1]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vqrshrun_n_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHRUN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshrun.v2i32(<2 x i64> [[A]], i32 19)
+// CHECK-NEXT: ret <2 x i32> [[VQRSHRUN_N1]]
+//
uint32x2_t test_vqrshrun_n_s64(int64x2_t a) {
return vqrshrun_n_s64(a, 19);
}
-// CHECK-LABEL: @test_vqrshrun_high_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VQRSHRUN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrun.v8i8(<8 x i16> [[VQRSHRUN_N]], i32 3)
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VQRSHRUN_N1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vqrshrun_high_n_s16(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHRUN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrun.v8i8(<8 x i16> [[B]], i32 3)
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[VQRSHRUN_N1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
int8x16_t test_vqrshrun_high_n_s16(int8x8_t a, int16x8_t b) {
return vqrshrun_high_n_s16(a, b, 3);
}
-// CHECK-LABEL: @test_vqrshrun_high_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQRSHRUN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrun.v4i16(<4 x i32> [[VQRSHRUN_N]], i32 9)
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VQRSHRUN_N1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vqrshrun_high_n_s32(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHRUN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrun.v4i16(<4 x i32> [[B]], i32 9)
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[VQRSHRUN_N1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
int16x8_t test_vqrshrun_high_n_s32(int16x4_t a, int32x4_t b) {
return vqrshrun_high_n_s32(a, b, 9);
}
-// CHECK-LABEL: @test_vqrshrun_high_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VQRSHRUN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshrun.v2i32(<2 x i64> [[VQRSHRUN_N]], i32 19)
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VQRSHRUN_N1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK: ret <4 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vqrshrun_high_n_s64(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHRUN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshrun.v2i32(<2 x i64> [[B]], i32 19)
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[VQRSHRUN_N1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]]
+//
int32x4_t test_vqrshrun_high_n_s64(int32x2_t a, int64x2_t b) {
return vqrshrun_high_n_s64(a, b, 19);
}
-// CHECK-LABEL: @test_vqshrn_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VQSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrn.v8i8(<8 x i16> [[VQSHRN_N]], i32 3)
-// CHECK: ret <8 x i8> [[VQSHRN_N1]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vqshrn_n_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrn.v8i8(<8 x i16> [[A]], i32 3)
+// CHECK-NEXT: ret <8 x i8> [[VQSHRN_N1]]
+//
int8x8_t test_vqshrn_n_s16(int16x8_t a) {
return vqshrn_n_s16(a, 3);
}
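// vqshrn_n saturates the shifted value into the range of the narrower type
// of the same signedness (SQSHRN for signed lanes, UQSHRN for unsigned).
// Scalar model of one s16 -> s8 lane (illustrative only):
//
//   static inline int8_t sqshrn16(int16_t a, unsigned n) {
//     int v = a >> n;                      // arithmetic shift
//     if (v > 127)  return 127;            // signed saturation
//     if (v < -128) return -128;
//     return (int8_t)v;
//   }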
-// CHECK-LABEL: @test_vqshrn_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrn.v4i16(<4 x i32> [[VQSHRN_N]], i32 9)
-// CHECK: ret <4 x i16> [[VQSHRN_N1]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vqshrn_n_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrn.v4i16(<4 x i32> [[A]], i32 9)
+// CHECK-NEXT: ret <4 x i16> [[VQSHRN_N1]]
+//
int16x4_t test_vqshrn_n_s32(int32x4_t a) {
return vqshrn_n_s32(a, 9);
}
-// CHECK-LABEL: @test_vqshrn_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VQSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshrn.v2i32(<2 x i64> [[VQSHRN_N]], i32 19)
-// CHECK: ret <2 x i32> [[VQSHRN_N1]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vqshrn_n_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshrn.v2i32(<2 x i64> [[A]], i32 19)
+// CHECK-NEXT: ret <2 x i32> [[VQSHRN_N1]]
+//
int32x2_t test_vqshrn_n_s64(int64x2_t a) {
return vqshrn_n_s64(a, 19);
}
-// CHECK-LABEL: @test_vqshrn_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VQSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshrn.v8i8(<8 x i16> [[VQSHRN_N]], i32 3)
-// CHECK: ret <8 x i8> [[VQSHRN_N1]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vqshrn_n_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshrn.v8i8(<8 x i16> [[A]], i32 3)
+// CHECK-NEXT: ret <8 x i8> [[VQSHRN_N1]]
+//
uint8x8_t test_vqshrn_n_u16(uint16x8_t a) {
return vqshrn_n_u16(a, 3);
}
-// CHECK-LABEL: @test_vqshrn_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshrn.v4i16(<4 x i32> [[VQSHRN_N]], i32 9)
-// CHECK: ret <4 x i16> [[VQSHRN_N1]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vqshrn_n_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshrn.v4i16(<4 x i32> [[A]], i32 9)
+// CHECK-NEXT: ret <4 x i16> [[VQSHRN_N1]]
+//
uint16x4_t test_vqshrn_n_u32(uint32x4_t a) {
return vqshrn_n_u32(a, 9);
}
-// CHECK-LABEL: @test_vqshrn_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VQSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqshrn.v2i32(<2 x i64> [[VQSHRN_N]], i32 19)
-// CHECK: ret <2 x i32> [[VQSHRN_N1]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vqshrn_n_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqshrn.v2i32(<2 x i64> [[A]], i32 19)
+// CHECK-NEXT: ret <2 x i32> [[VQSHRN_N1]]
+//
uint32x2_t test_vqshrn_n_u64(uint64x2_t a) {
return vqshrn_n_u64(a, 19);
}
-// CHECK-LABEL: @test_vqshrn_high_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VQSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrn.v8i8(<8 x i16> [[VQSHRN_N]], i32 3)
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VQSHRN_N1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vqshrn_high_n_s16(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrn.v8i8(<8 x i16> [[B]], i32 3)
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[VQSHRN_N1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
int8x16_t test_vqshrn_high_n_s16(int8x8_t a, int16x8_t b) {
return vqshrn_high_n_s16(a, b, 3);
}
-// CHECK-LABEL: @test_vqshrn_high_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrn.v4i16(<4 x i32> [[VQSHRN_N]], i32 9)
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VQSHRN_N1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vqshrn_high_n_s32(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrn.v4i16(<4 x i32> [[B]], i32 9)
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[VQSHRN_N1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
int16x8_t test_vqshrn_high_n_s32(int16x4_t a, int32x4_t b) {
return vqshrn_high_n_s32(a, b, 9);
}
-// CHECK-LABEL: @test_vqshrn_high_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VQSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshrn.v2i32(<2 x i64> [[VQSHRN_N]], i32 19)
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VQSHRN_N1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK: ret <4 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vqshrn_high_n_s64(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshrn.v2i32(<2 x i64> [[B]], i32 19)
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[VQSHRN_N1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]]
+//
int32x4_t test_vqshrn_high_n_s64(int32x2_t a, int64x2_t b) {
return vqshrn_high_n_s64(a, b, 19);
}
-// CHECK-LABEL: @test_vqshrn_high_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VQSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshrn.v8i8(<8 x i16> [[VQSHRN_N]], i32 3)
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VQSHRN_N1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vqshrn_high_n_u16(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshrn.v8i8(<8 x i16> [[B]], i32 3)
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[VQSHRN_N1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
uint8x16_t test_vqshrn_high_n_u16(uint8x8_t a, uint16x8_t b) {
return vqshrn_high_n_u16(a, b, 3);
}
-// CHECK-LABEL: @test_vqshrn_high_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshrn.v4i16(<4 x i32> [[VQSHRN_N]], i32 9)
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VQSHRN_N1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vqshrn_high_n_u32(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshrn.v4i16(<4 x i32> [[B]], i32 9)
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[VQSHRN_N1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
uint16x8_t test_vqshrn_high_n_u32(uint16x4_t a, uint32x4_t b) {
return vqshrn_high_n_u32(a, b, 9);
}
-// CHECK-LABEL: @test_vqshrn_high_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VQSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqshrn.v2i32(<2 x i64> [[VQSHRN_N]], i32 19)
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VQSHRN_N1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK: ret <4 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vqshrn_high_n_u64(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqshrn.v2i32(<2 x i64> [[B]], i32 19)
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[VQSHRN_N1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]]
+//
uint32x4_t test_vqshrn_high_n_u64(uint32x2_t a, uint64x2_t b) {
return vqshrn_high_n_u64(a, b, 19);
}
-// CHECK-LABEL: @test_vqrshrn_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VQRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrn.v8i8(<8 x i16> [[VQRSHRN_N]], i32 3)
-// CHECK: ret <8 x i8> [[VQRSHRN_N1]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vqrshrn_n_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrn.v8i8(<8 x i16> [[A]], i32 3)
+// CHECK-NEXT: ret <8 x i8> [[VQRSHRN_N1]]
+//
int8x8_t test_vqrshrn_n_s16(int16x8_t a) {
return vqrshrn_n_s16(a, 3);
}
-// CHECK-LABEL: @test_vqrshrn_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrn.v4i16(<4 x i32> [[VQRSHRN_N]], i32 9)
-// CHECK: ret <4 x i16> [[VQRSHRN_N1]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vqrshrn_n_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrn.v4i16(<4 x i32> [[A]], i32 9)
+// CHECK-NEXT: ret <4 x i16> [[VQRSHRN_N1]]
+//
int16x4_t test_vqrshrn_n_s32(int32x4_t a) {
return vqrshrn_n_s32(a, 9);
}
-// CHECK-LABEL: @test_vqrshrn_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VQRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshrn.v2i32(<2 x i64> [[VQRSHRN_N]], i32 19)
-// CHECK: ret <2 x i32> [[VQRSHRN_N1]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vqrshrn_n_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshrn.v2i32(<2 x i64> [[A]], i32 19)
+// CHECK-NEXT: ret <2 x i32> [[VQRSHRN_N1]]
+//
int32x2_t test_vqrshrn_n_s64(int64x2_t a) {
return vqrshrn_n_s64(a, 19);
}
-// CHECK-LABEL: @test_vqrshrn_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VQRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqrshrn.v8i8(<8 x i16> [[VQRSHRN_N]], i32 3)
-// CHECK: ret <8 x i8> [[VQRSHRN_N1]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vqrshrn_n_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqrshrn.v8i8(<8 x i16> [[A]], i32 3)
+// CHECK-NEXT: ret <8 x i8> [[VQRSHRN_N1]]
+//
uint8x8_t test_vqrshrn_n_u16(uint16x8_t a) {
return vqrshrn_n_u16(a, 3);
}
-// CHECK-LABEL: @test_vqrshrn_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqrshrn.v4i16(<4 x i32> [[VQRSHRN_N]], i32 9)
-// CHECK: ret <4 x i16> [[VQRSHRN_N1]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vqrshrn_n_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqrshrn.v4i16(<4 x i32> [[A]], i32 9)
+// CHECK-NEXT: ret <4 x i16> [[VQRSHRN_N1]]
+//
uint16x4_t test_vqrshrn_n_u32(uint32x4_t a) {
return vqrshrn_n_u32(a, 9);
}
-// CHECK-LABEL: @test_vqrshrn_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VQRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqrshrn.v2i32(<2 x i64> [[VQRSHRN_N]], i32 19)
-// CHECK: ret <2 x i32> [[VQRSHRN_N1]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vqrshrn_n_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqrshrn.v2i32(<2 x i64> [[A]], i32 19)
+// CHECK-NEXT: ret <2 x i32> [[VQRSHRN_N1]]
+//
uint32x2_t test_vqrshrn_n_u64(uint64x2_t a) {
return vqrshrn_n_u64(a, 19);
}
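
The vqrshrn_n_* variants differ from vqshrn_n_* only in rounding: the rounding constant 1 << (n-1) is added before the shift, and the result then saturates to the narrow range. A minimal scalar sketch of one signed lane, under the usual SQRSHRN "round half up" semantics (the helper name is illustrative, not from the patch):

#include <stdint.h>
/* One s16 lane of vqrshrn_n_s16(a, 3). */
static inline int8_t sqrshrn_lane(int16_t x, int n) {
  int32_t r = ((int32_t)x + (1 << (n - 1))) >> n;  /* round, then shift */
  if (r > 127) return 127;                         /* saturate to s8 */
  if (r < -128) return -128;
  return (int8_t)r;
}
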
-// CHECK-LABEL: @test_vqrshrn_high_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VQRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrn.v8i8(<8 x i16> [[VQRSHRN_N]], i32 3)
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VQRSHRN_N1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vqrshrn_high_n_s16(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrn.v8i8(<8 x i16> [[B]], i32 3)
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[VQRSHRN_N1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
int8x16_t test_vqrshrn_high_n_s16(int8x8_t a, int16x8_t b) {
return vqrshrn_high_n_s16(a, b, 3);
}
-// CHECK-LABEL: @test_vqrshrn_high_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrn.v4i16(<4 x i32> [[VQRSHRN_N]], i32 9)
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VQRSHRN_N1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vqrshrn_high_n_s32(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrn.v4i16(<4 x i32> [[B]], i32 9)
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[VQRSHRN_N1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
int16x8_t test_vqrshrn_high_n_s32(int16x4_t a, int32x4_t b) {
return vqrshrn_high_n_s32(a, b, 9);
}
-// CHECK-LABEL: @test_vqrshrn_high_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VQRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshrn.v2i32(<2 x i64> [[VQRSHRN_N]], i32 19)
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VQRSHRN_N1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK: ret <4 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vqrshrn_high_n_s64(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshrn.v2i32(<2 x i64> [[B]], i32 19)
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[VQRSHRN_N1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]]
+//
int32x4_t test_vqrshrn_high_n_s64(int32x2_t a, int64x2_t b) {
return vqrshrn_high_n_s64(a, b, 19);
}
-// CHECK-LABEL: @test_vqrshrn_high_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VQRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqrshrn.v8i8(<8 x i16> [[VQRSHRN_N]], i32 3)
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VQRSHRN_N1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vqrshrn_high_n_u16(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqrshrn.v8i8(<8 x i16> [[B]], i32 3)
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[VQRSHRN_N1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
uint8x16_t test_vqrshrn_high_n_u16(uint8x8_t a, uint16x8_t b) {
return vqrshrn_high_n_u16(a, b, 3);
}
-// CHECK-LABEL: @test_vqrshrn_high_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqrshrn.v4i16(<4 x i32> [[VQRSHRN_N]], i32 9)
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VQRSHRN_N1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vqrshrn_high_n_u32(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqrshrn.v4i16(<4 x i32> [[B]], i32 9)
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[VQRSHRN_N1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
uint16x8_t test_vqrshrn_high_n_u32(uint16x4_t a, uint32x4_t b) {
return vqrshrn_high_n_u32(a, b, 9);
}
-// CHECK-LABEL: @test_vqrshrn_high_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VQRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqrshrn.v2i32(<2 x i64> [[VQRSHRN_N]], i32 19)
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VQRSHRN_N1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK: ret <4 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vqrshrn_high_n_u64(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqrshrn.v2i32(<2 x i64> [[B]], i32 19)
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[VQRSHRN_N1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]]
+//
uint32x4_t test_vqrshrn_high_n_u64(uint32x2_t a, uint64x2_t b) {
return vqrshrn_high_n_u64(a, b, 19);
}
-// CHECK-LABEL: @test_vshll_n_s8(
-// CHECK: [[TMP0:%.*]] = sext <8 x i8> %a to <8 x i16>
-// CHECK: [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], splat (i16 3)
-// CHECK: ret <8 x i16> [[VSHLL_N]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vshll_n_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = sext <8 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: [[VSHLL_N:%.*]] = shl nsw <8 x i16> [[TMP0]], splat (i16 3)
+// CHECK-NEXT: ret <8 x i16> [[VSHLL_N]]
+//
int16x8_t test_vshll_n_s8(int8x8_t a) {
return vshll_n_s8(a, 3);
}
-// CHECK-LABEL: @test_vshll_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
-// CHECK: [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 9)
-// CHECK: ret <4 x i32> [[VSHLL_N]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vshll_n_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = sext <4 x i16> [[A]] to <4 x i32>
+// CHECK-NEXT: [[VSHLL_N:%.*]] = shl nsw <4 x i32> [[TMP0]], splat (i32 9)
+// CHECK-NEXT: ret <4 x i32> [[VSHLL_N]]
+//
int32x4_t test_vshll_n_s16(int16x4_t a) {
return vshll_n_s16(a, 9);
}
-// CHECK-LABEL: @test_vshll_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
-// CHECK: [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], splat (i64 19)
-// CHECK: ret <2 x i64> [[VSHLL_N]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vshll_n_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = sext <2 x i32> [[A]] to <2 x i64>
+// CHECK-NEXT: [[VSHLL_N:%.*]] = shl nsw <2 x i64> [[TMP0]], splat (i64 19)
+// CHECK-NEXT: ret <2 x i64> [[VSHLL_N]]
+//
int64x2_t test_vshll_n_s32(int32x2_t a) {
return vshll_n_s32(a, 19);
}
-// CHECK-LABEL: @test_vshll_n_u8(
-// CHECK: [[TMP0:%.*]] = zext <8 x i8> %a to <8 x i16>
-// CHECK: [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], splat (i16 3)
-// CHECK: ret <8 x i16> [[VSHLL_N]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vshll_n_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = zext <8 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: [[VSHLL_N:%.*]] = shl nuw nsw <8 x i16> [[TMP0]], splat (i16 3)
+// CHECK-NEXT: ret <8 x i16> [[VSHLL_N]]
+//
uint16x8_t test_vshll_n_u8(uint8x8_t a) {
return vshll_n_u8(a, 3);
}
-// CHECK-LABEL: @test_vshll_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
-// CHECK: [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 9)
-// CHECK: ret <4 x i32> [[VSHLL_N]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vshll_n_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = zext <4 x i16> [[A]] to <4 x i32>
+// CHECK-NEXT: [[VSHLL_N:%.*]] = shl nuw nsw <4 x i32> [[TMP0]], splat (i32 9)
+// CHECK-NEXT: ret <4 x i32> [[VSHLL_N]]
+//
uint32x4_t test_vshll_n_u16(uint16x4_t a) {
return vshll_n_u16(a, 9);
}
-// CHECK-LABEL: @test_vshll_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
-// CHECK: [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], splat (i64 19)
-// CHECK: ret <2 x i64> [[VSHLL_N]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vshll_n_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = zext <2 x i32> [[A]] to <2 x i64>
+// CHECK-NEXT: [[VSHLL_N:%.*]] = shl nuw nsw <2 x i64> [[TMP0]], splat (i64 19)
+// CHECK-NEXT: ret <2 x i64> [[VSHLL_N]]
+//
uint64x2_t test_vshll_n_u32(uint32x2_t a) {
return vshll_n_u32(a, 19);
}
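
Besides the label cleanups, the regenerated vshll_n_* checks pick up nsw (and nuw for the unsigned forms) on the shl: after widening, each lane is small enough that a left shift by these immediates provably cannot overflow the doubled element width, e.g. a sext'd i8 lies in [-128, 127] and 127 << 3 still fits in i16. A scalar model of one signed lane (the helper name is illustrative):

#include <stdint.h>
/* One lane of vshll_n_s8(a, 3): widen first, then shift in the wider type. */
static inline int16_t sshll_lane(int8_t x, int n) {
  return (int16_t)((int16_t)x << n);  /* sext to 16 bits, then shl */
}
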
-// CHECK-LABEL: @test_vshll_high_n_s8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK: [[TMP0:%.*]] = sext <8 x i8> [[SHUFFLE_I]] to <8 x i16>
-// CHECK: [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], splat (i16 3)
-// CHECK: ret <8 x i16> [[VSHLL_N]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vshll_high_n_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT: [[TMP0:%.*]] = sext <8 x i8> [[SHUFFLE_I]] to <8 x i16>
+// CHECK-NEXT: [[VSHLL_N:%.*]] = shl nsw <8 x i16> [[TMP0]], splat (i16 3)
+// CHECK-NEXT: ret <8 x i16> [[VSHLL_N]]
+//
int16x8_t test_vshll_high_n_s8(int8x16_t a) {
return vshll_high_n_s8(a, 3);
}
-// CHECK-LABEL: @test_vshll_high_n_s16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
-// CHECK: [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 9)
-// CHECK: ret <4 x i32> [[VSHLL_N]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vshll_high_n_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[TMP0:%.*]] = sext <4 x i16> [[SHUFFLE_I]] to <4 x i32>
+// CHECK-NEXT: [[VSHLL_N:%.*]] = shl nsw <4 x i32> [[TMP0]], splat (i32 9)
+// CHECK-NEXT: ret <4 x i32> [[VSHLL_N]]
+//
int32x4_t test_vshll_high_n_s16(int16x8_t a) {
return vshll_high_n_s16(a, 9);
}
-// CHECK-LABEL: @test_vshll_high_n_s32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
-// CHECK: [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], splat (i64 19)
-// CHECK: ret <2 x i64> [[VSHLL_N]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vshll_high_n_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[TMP0:%.*]] = sext <2 x i32> [[SHUFFLE_I]] to <2 x i64>
+// CHECK-NEXT: [[VSHLL_N:%.*]] = shl nsw <2 x i64> [[TMP0]], splat (i64 19)
+// CHECK-NEXT: ret <2 x i64> [[VSHLL_N]]
+//
int64x2_t test_vshll_high_n_s32(int32x4_t a) {
return vshll_high_n_s32(a, 19);
}
-// CHECK-LABEL: @test_vshll_high_n_u8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK: [[TMP0:%.*]] = zext <8 x i8> [[SHUFFLE_I]] to <8 x i16>
-// CHECK: [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], splat (i16 3)
-// CHECK: ret <8 x i16> [[VSHLL_N]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vshll_high_n_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT: [[TMP0:%.*]] = zext <8 x i8> [[SHUFFLE_I]] to <8 x i16>
+// CHECK-NEXT: [[VSHLL_N:%.*]] = shl nuw nsw <8 x i16> [[TMP0]], splat (i16 3)
+// CHECK-NEXT: ret <8 x i16> [[VSHLL_N]]
+//
uint16x8_t test_vshll_high_n_u8(uint8x16_t a) {
return vshll_high_n_u8(a, 3);
}
-// CHECK-LABEL: @test_vshll_high_n_u16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
-// CHECK: [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 9)
-// CHECK: ret <4 x i32> [[VSHLL_N]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vshll_high_n_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[TMP0:%.*]] = zext <4 x i16> [[SHUFFLE_I]] to <4 x i32>
+// CHECK-NEXT: [[VSHLL_N:%.*]] = shl nuw nsw <4 x i32> [[TMP0]], splat (i32 9)
+// CHECK-NEXT: ret <4 x i32> [[VSHLL_N]]
+//
uint32x4_t test_vshll_high_n_u16(uint16x8_t a) {
return vshll_high_n_u16(a, 9);
}
-// CHECK-LABEL: @test_vshll_high_n_u32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
-// CHECK: [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], splat (i64 19)
-// CHECK: ret <2 x i64> [[VSHLL_N]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vshll_high_n_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[TMP0:%.*]] = zext <2 x i32> [[SHUFFLE_I]] to <2 x i64>
+// CHECK-NEXT: [[VSHLL_N:%.*]] = shl nuw nsw <2 x i64> [[TMP0]], splat (i64 19)
+// CHECK-NEXT: ret <2 x i64> [[VSHLL_N]]
+//
uint64x2_t test_vshll_high_n_u32(uint32x4_t a) {
return vshll_high_n_u32(a, 19);
}
-// CHECK-LABEL: @test_vmovl_s8(
-// CHECK: [[VMOVL_I:%.*]] = sext <8 x i8> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[VMOVL_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vmovl_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I:%.*]] = sext <8 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[VMOVL_I]]
+//
int16x8_t test_vmovl_s8(int8x8_t a) {
return vmovl_s8(a);
}
-// CHECK-LABEL: @test_vmovl_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VMOVL_I:%.*]] = sext <4 x i16> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[VMOVL_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vmovl_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I:%.*]] = sext <4 x i16> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[VMOVL_I]]
+//
int32x4_t test_vmovl_s16(int16x4_t a) {
return vmovl_s16(a);
}
-// CHECK-LABEL: @test_vmovl_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VMOVL_I:%.*]] = sext <2 x i32> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[VMOVL_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vmovl_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I:%.*]] = sext <2 x i32> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[VMOVL_I]]
+//
int64x2_t test_vmovl_s32(int32x2_t a) {
return vmovl_s32(a);
}
-// CHECK-LABEL: @test_vmovl_u8(
-// CHECK: [[VMOVL_I:%.*]] = zext <8 x i8> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[VMOVL_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vmovl_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I:%.*]] = zext <8 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[VMOVL_I]]
+//
uint16x8_t test_vmovl_u8(uint8x8_t a) {
return vmovl_u8(a);
}
-// CHECK-LABEL: @test_vmovl_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VMOVL_I:%.*]] = zext <4 x i16> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[VMOVL_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vmovl_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I:%.*]] = zext <4 x i16> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[VMOVL_I]]
+//
uint32x4_t test_vmovl_u16(uint16x4_t a) {
return vmovl_u16(a);
}
-// CHECK-LABEL: @test_vmovl_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VMOVL_I:%.*]] = zext <2 x i32> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[VMOVL_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vmovl_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I:%.*]] = zext <2 x i32> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[VMOVL_I]]
+//
uint64x2_t test_vmovl_u32(uint32x2_t a) {
return vmovl_u32(a);
}
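
vmovl_* is a pure widening move, so the IR is a bare sext (signed) or zext (unsigned) with no intrinsic call; the regenerated checks also drop the dead bitcasts the old script matched. The per-lane equivalent in C is just a widening cast (illustrative):

#include <stdint.h>
int16_t  widen_s8(int8_t x)  { return (int16_t)x;  }  /* sext, as in vmovl_s8 */
uint16_t widen_u8(uint8_t x) { return (uint16_t)x; }  /* zext, as in vmovl_u8 */
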
-// CHECK-LABEL: @test_vmovl_high_s8(
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK: [[TMP0:%.*]] = sext <8 x i8> [[SHUFFLE_I_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vmovl_high_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT: [[TMP0:%.*]] = sext <8 x i8> [[SHUFFLE_I_I]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
int16x8_t test_vmovl_high_s8(int8x16_t a) {
return vmovl_high_s8(a);
}
-// CHECK-LABEL: @test_vmovl_high_s16(
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = sext <4 x i16> [[SHUFFLE_I_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP1]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vmovl_high_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[TMP0:%.*]] = sext <4 x i16> [[SHUFFLE_I_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
int32x4_t test_vmovl_high_s16(int16x8_t a) {
return vmovl_high_s16(a);
}
-// CHECK-LABEL: @test_vmovl_high_s32(
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = sext <2 x i32> [[SHUFFLE_I_I]] to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP1]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vmovl_high_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[TMP0:%.*]] = sext <2 x i32> [[SHUFFLE_I_I]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
int64x2_t test_vmovl_high_s32(int32x4_t a) {
return vmovl_high_s32(a);
}
-// CHECK-LABEL: @test_vmovl_high_u8(
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK: [[TMP0:%.*]] = zext <8 x i8> [[SHUFFLE_I_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vmovl_high_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT: [[TMP0:%.*]] = zext <8 x i8> [[SHUFFLE_I_I]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
uint16x8_t test_vmovl_high_u8(uint8x16_t a) {
return vmovl_high_u8(a);
}
-// CHECK-LABEL: @test_vmovl_high_u16(
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = zext <4 x i16> [[SHUFFLE_I_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP1]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vmovl_high_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[TMP0:%.*]] = zext <4 x i16> [[SHUFFLE_I_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
uint32x4_t test_vmovl_high_u16(uint16x8_t a) {
return vmovl_high_u16(a);
}
-// CHECK-LABEL: @test_vmovl_high_u32(
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = zext <2 x i32> [[SHUFFLE_I_I]] to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP1]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vmovl_high_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[TMP0:%.*]] = zext <2 x i32> [[SHUFFLE_I_I]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
uint64x2_t test_vmovl_high_u32(uint32x4_t a) {
return vmovl_high_u32(a);
}
-// CHECK-LABEL: @test_vcvt_n_f32_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VCVT_N1:%.*]] = call <2 x float> @llvm.aarch64.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32> [[VCVT_N]], i32 31)
-// CHECK: ret <2 x float> [[VCVT_N1]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vcvt_n_f32_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVT_N1:%.*]] = call <2 x float> @llvm.aarch64.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32> [[A]], i32 31)
+// CHECK-NEXT: ret <2 x float> [[VCVT_N1]]
+//
float32x2_t test_vcvt_n_f32_s32(int32x2_t a) {
return vcvt_n_f32_s32(a, 31);
}
-// CHECK-LABEL: @test_vcvtq_n_f32_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VCVT_N1:%.*]] = call <4 x float> @llvm.aarch64.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32> [[VCVT_N]], i32 31)
-// CHECK: ret <4 x float> [[VCVT_N1]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vcvtq_n_f32_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVT_N1:%.*]] = call <4 x float> @llvm.aarch64.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32> [[A]], i32 31)
+// CHECK-NEXT: ret <4 x float> [[VCVT_N1]]
+//
float32x4_t test_vcvtq_n_f32_s32(int32x4_t a) {
return vcvtq_n_f32_s32(a, 31);
}
-// CHECK-LABEL: @test_vcvtq_n_f64_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VCVT_N1:%.*]] = call <2 x double> @llvm.aarch64.neon.vcvtfxs2fp.v2f64.v2i64(<2 x i64> [[VCVT_N]], i32 50)
-// CHECK: ret <2 x double> [[VCVT_N1]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vcvtq_n_f64_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVT_N1:%.*]] = call <2 x double> @llvm.aarch64.neon.vcvtfxs2fp.v2f64.v2i64(<2 x i64> [[A]], i32 50)
+// CHECK-NEXT: ret <2 x double> [[VCVT_N1]]
+//
float64x2_t test_vcvtq_n_f64_s64(int64x2_t a) {
return vcvtq_n_f64_s64(a, 50);
}
-// CHECK-LABEL: @test_vcvt_n_f32_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VCVT_N1:%.*]] = call <2 x float> @llvm.aarch64.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32> [[VCVT_N]], i32 31)
-// CHECK: ret <2 x float> [[VCVT_N1]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vcvt_n_f32_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVT_N1:%.*]] = call <2 x float> @llvm.aarch64.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32> [[A]], i32 31)
+// CHECK-NEXT: ret <2 x float> [[VCVT_N1]]
+//
float32x2_t test_vcvt_n_f32_u32(uint32x2_t a) {
return vcvt_n_f32_u32(a, 31);
}
-// CHECK-LABEL: @test_vcvtq_n_f32_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VCVT_N1:%.*]] = call <4 x float> @llvm.aarch64.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32> [[VCVT_N]], i32 31)
-// CHECK: ret <4 x float> [[VCVT_N1]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vcvtq_n_f32_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVT_N1:%.*]] = call <4 x float> @llvm.aarch64.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32> [[A]], i32 31)
+// CHECK-NEXT: ret <4 x float> [[VCVT_N1]]
+//
float32x4_t test_vcvtq_n_f32_u32(uint32x4_t a) {
return vcvtq_n_f32_u32(a, 31);
}
-// CHECK-LABEL: @test_vcvtq_n_f64_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VCVT_N1:%.*]] = call <2 x double> @llvm.aarch64.neon.vcvtfxu2fp.v2f64.v2i64(<2 x i64> [[VCVT_N]], i32 50)
-// CHECK: ret <2 x double> [[VCVT_N1]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vcvtq_n_f64_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVT_N1:%.*]] = call <2 x double> @llvm.aarch64.neon.vcvtfxu2fp.v2f64.v2i64(<2 x i64> [[A]], i32 50)
+// CHECK-NEXT: ret <2 x double> [[VCVT_N1]]
+//
float64x2_t test_vcvtq_n_f64_u64(uint64x2_t a) {
return vcvtq_n_f64_u64(a, 50);
}
-// CHECK-LABEL: @test_vcvt_n_s32_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK: [[VCVT_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float> [[VCVT_N]], i32 31)
-// CHECK: ret <2 x i32> [[VCVT_N1]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vcvt_n_s32_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVT_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float> [[A]], i32 31)
+// CHECK-NEXT: ret <2 x i32> [[VCVT_N1]]
+//
int32x2_t test_vcvt_n_s32_f32(float32x2_t a) {
return vcvt_n_s32_f32(a, 31);
}
-// CHECK-LABEL: @test_vcvtq_n_s32_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK: [[VCVT_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float> [[VCVT_N]], i32 31)
-// CHECK: ret <4 x i32> [[VCVT_N1]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vcvtq_n_s32_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVT_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float> [[A]], i32 31)
+// CHECK-NEXT: ret <4 x i32> [[VCVT_N1]]
+//
int32x4_t test_vcvtq_n_s32_f32(float32x4_t a) {
return vcvtq_n_s32_f32(a, 31);
}
-// CHECK-LABEL: @test_vcvtq_n_s64_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
-// CHECK: [[VCVT_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.vcvtfp2fxs.v2i64.v2f64(<2 x double> [[VCVT_N]], i32 50)
-// CHECK: ret <2 x i64> [[VCVT_N1]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vcvtq_n_s64_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVT_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.vcvtfp2fxs.v2i64.v2f64(<2 x double> [[A]], i32 50)
+// CHECK-NEXT: ret <2 x i64> [[VCVT_N1]]
+//
int64x2_t test_vcvtq_n_s64_f64(float64x2_t a) {
return vcvtq_n_s64_f64(a, 50);
}
-// CHECK-LABEL: @test_vcvt_n_u32_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK: [[VCVT_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.vcvtfp2fxu.v2i32.v2f32(<2 x float> [[VCVT_N]], i32 31)
-// CHECK: ret <2 x i32> [[VCVT_N1]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vcvt_n_u32_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVT_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.vcvtfp2fxu.v2i32.v2f32(<2 x float> [[A]], i32 31)
+// CHECK-NEXT: ret <2 x i32> [[VCVT_N1]]
+//
uint32x2_t test_vcvt_n_u32_f32(float32x2_t a) {
return vcvt_n_u32_f32(a, 31);
}
-// CHECK-LABEL: @test_vcvtq_n_u32_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK: [[VCVT_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float> [[VCVT_N]], i32 31)
-// CHECK: ret <4 x i32> [[VCVT_N1]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vcvtq_n_u32_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVT_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float> [[A]], i32 31)
+// CHECK-NEXT: ret <4 x i32> [[VCVT_N1]]
+//
uint32x4_t test_vcvtq_n_u32_f32(float32x4_t a) {
return vcvtq_n_u32_f32(a, 31);
}
-// CHECK-LABEL: @test_vcvtq_n_u64_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
-// CHECK: [[VCVT_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.vcvtfp2fxu.v2i64.v2f64(<2 x double> [[VCVT_N]], i32 50)
-// CHECK: ret <2 x i64> [[VCVT_N1]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vcvtq_n_u64_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVT_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.vcvtfp2fxu.v2i64.v2f64(<2 x double> [[A]], i32 50)
+// CHECK-NEXT: ret <2 x i64> [[VCVT_N1]]
+//
uint64x2_t test_vcvtq_n_u64_f64(float64x2_t a) {
return vcvtq_n_u64_f64(a, 50);
}
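
The vcvt_n_* conversions treat the immediate as a count of fractional bits, so vcvt_n_f32_s32(a, 31) computes a * 2^-31 per lane, and the fp-to-int direction scales by 2^31 before converting. A scalar sketch of the int-to-float direction using ldexpf; the helper name is made up, and the real lowering is the vcvtfxs2fp intrinsic in the checks:

#include <math.h>
#include <stdint.h>
/* One lane of vcvt_n_f32_s32(a, 31): fixed-point to float. */
static inline float fixed_to_float(int32_t x, int frac_bits) {
  return ldexpf((float)x, -frac_bits);  /* x * 2^-frac_bits, exact scaling */
}
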
-// CHECK-LABEL: @test_vaddl_s8(
-// CHECK: [[VMOVL_I_I:%.*]] = sext <8 x i8> %a to <8 x i16>
-// CHECK: [[VMOVL_I4_I:%.*]] = sext <8 x i8> %b to <8 x i16>
-// CHECK: [[ADD_I:%.*]] = add <8 x i16> [[VMOVL_I_I]], [[VMOVL_I4_I]]
-// CHECK: ret <8 x i16> [[ADD_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vaddl_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I4_I:%.*]] = sext <8 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = sext <8 x i8> [[B]] to <8 x i16>
+// CHECK-NEXT: [[ADD_I:%.*]] = add nsw <8 x i16> [[VMOVL_I4_I]], [[VMOVL_I_I]]
+// CHECK-NEXT: ret <8 x i16> [[ADD_I]]
+//
int16x8_t test_vaddl_s8(int8x8_t a, int8x8_t b) {
return vaddl_s8(a, b);
}
-// CHECK-LABEL: @test_vaddl_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = sext <4 x i16> %a to <4 x i32>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VMOVL_I4_I:%.*]] = sext <4 x i16> %b to <4 x i32>
-// CHECK: [[ADD_I:%.*]] = add <4 x i32> [[VMOVL_I_I]], [[VMOVL_I4_I]]
-// CHECK: ret <4 x i32> [[ADD_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vaddl_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I4_I:%.*]] = sext <4 x i16> [[A]] to <4 x i32>
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = sext <4 x i16> [[B]] to <4 x i32>
+// CHECK-NEXT: [[ADD_I:%.*]] = add nsw <4 x i32> [[VMOVL_I4_I]], [[VMOVL_I_I]]
+// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
+//
int32x4_t test_vaddl_s16(int16x4_t a, int16x4_t b) {
return vaddl_s16(a, b);
}
-// CHECK-LABEL: @test_vaddl_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = sext <2 x i32> %a to <2 x i64>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VMOVL_I4_I:%.*]] = sext <2 x i32> %b to <2 x i64>
-// CHECK: [[ADD_I:%.*]] = add <2 x i64> [[VMOVL_I_I]], [[VMOVL_I4_I]]
-// CHECK: ret <2 x i64> [[ADD_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vaddl_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I4_I:%.*]] = sext <2 x i32> [[A]] to <2 x i64>
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = sext <2 x i32> [[B]] to <2 x i64>
+// CHECK-NEXT: [[ADD_I:%.*]] = add nsw <2 x i64> [[VMOVL_I4_I]], [[VMOVL_I_I]]
+// CHECK-NEXT: ret <2 x i64> [[ADD_I]]
+//
int64x2_t test_vaddl_s32(int32x2_t a, int32x2_t b) {
return vaddl_s32(a, b);
}
-// CHECK-LABEL: @test_vaddl_u8(
-// CHECK: [[VMOVL_I_I:%.*]] = zext <8 x i8> %a to <8 x i16>
-// CHECK: [[VMOVL_I4_I:%.*]] = zext <8 x i8> %b to <8 x i16>
-// CHECK: [[ADD_I:%.*]] = add <8 x i16> [[VMOVL_I_I]], [[VMOVL_I4_I]]
-// CHECK: ret <8 x i16> [[ADD_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vaddl_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I4_I:%.*]] = zext <8 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = zext <8 x i8> [[B]] to <8 x i16>
+// CHECK-NEXT: [[ADD_I:%.*]] = add nuw nsw <8 x i16> [[VMOVL_I4_I]], [[VMOVL_I_I]]
+// CHECK-NEXT: ret <8 x i16> [[ADD_I]]
+//
uint16x8_t test_vaddl_u8(uint8x8_t a, uint8x8_t b) {
return vaddl_u8(a, b);
}
-// CHECK-LABEL: @test_vaddl_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> %a to <4 x i32>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VMOVL_I4_I:%.*]] = zext <4 x i16> %b to <4 x i32>
-// CHECK: [[ADD_I:%.*]] = add <4 x i32> [[VMOVL_I_I]], [[VMOVL_I4_I]]
-// CHECK: ret <4 x i32> [[ADD_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vaddl_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I4_I:%.*]] = zext <4 x i16> [[A]] to <4 x i32>
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = zext <4 x i16> [[B]] to <4 x i32>
+// CHECK-NEXT: [[ADD_I:%.*]] = add nuw nsw <4 x i32> [[VMOVL_I4_I]], [[VMOVL_I_I]]
+// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
+//
uint32x4_t test_vaddl_u16(uint16x4_t a, uint16x4_t b) {
return vaddl_u16(a, b);
}
-// CHECK-LABEL: @test_vaddl_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> %a to <2 x i64>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VMOVL_I4_I:%.*]] = zext <2 x i32> %b to <2 x i64>
-// CHECK: [[ADD_I:%.*]] = add <2 x i64> [[VMOVL_I_I]], [[VMOVL_I4_I]]
-// CHECK: ret <2 x i64> [[ADD_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vaddl_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I4_I:%.*]] = zext <2 x i32> [[A]] to <2 x i64>
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = zext <2 x i32> [[B]] to <2 x i64>
+// CHECK-NEXT: [[ADD_I:%.*]] = add nuw nsw <2 x i64> [[VMOVL_I4_I]], [[VMOVL_I_I]]
+// CHECK-NEXT: ret <2 x i64> [[ADD_I]]
+//
uint64x2_t test_vaddl_u32(uint32x2_t a, uint32x2_t b) {
return vaddl_u32(a, b);
}
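
The vaddl_* adds likewise gain nsw (plus nuw when unsigned): both operands are widened from N to 2N bits before the add, and the sum of two such values always fits, e.g. two sext'd i8 values sum to within [-256, 254] and two zext'd ones to within [0, 510], so wraparound is impossible. One lane in scalar form (illustrative):

#include <stdint.h>
/* One lane of vaddl_s8(a, b): widen both operands, add in the wider type. */
static inline int16_t saddl_lane(int8_t a, int8_t b) {
  return (int16_t)((int16_t)a + (int16_t)b);  /* cannot overflow 16 bits */
}
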
-// CHECK-LABEL: @test_vaddl_high_s8(
-// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK: [[TMP0:%.*]] = sext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16>
-// CHECK: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK: [[TMP1:%.*]] = sext <8 x i8> [[SHUFFLE_I_I10_I]] to <8 x i16>
-// CHECK: [[ADD_I:%.*]] = add <8 x i16> [[TMP0]], [[TMP1]]
-// CHECK: ret <8 x i16> [[ADD_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vaddl_high_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT: [[TMP0:%.*]] = sext <8 x i8> [[SHUFFLE_I_I10_I]] to <8 x i16>
+// CHECK-NEXT: [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16>
+// CHECK-NEXT: [[ADD_I:%.*]] = add nsw <8 x i16> [[TMP0]], [[TMP1]]
+// CHECK-NEXT: ret <8 x i16> [[ADD_I]]
+//
int16x8_t test_vaddl_high_s8(int8x16_t a, int8x16_t b) {
return vaddl_high_s8(a, b);
}
-// CHECK-LABEL: @test_vaddl_high_s16(
-// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = sext <4 x i16> [[SHUFFLE_I_I_I]] to <4 x i32>
-// CHECK: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I10_I]] to <8 x i8>
-// CHECK: [[TMP3:%.*]] = sext <4 x i16> [[SHUFFLE_I_I10_I]] to <4 x i32>
-// CHECK: [[ADD_I:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]]
-// CHECK: ret <4 x i32> [[ADD_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vaddl_high_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[TMP0:%.*]] = sext <4 x i16> [[SHUFFLE_I_I10_I]] to <4 x i32>
+// CHECK-NEXT: [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[TMP1:%.*]] = sext <4 x i16> [[SHUFFLE_I_I_I]] to <4 x i32>
+// CHECK-NEXT: [[ADD_I:%.*]] = add nsw <4 x i32> [[TMP0]], [[TMP1]]
+// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
+//
int32x4_t test_vaddl_high_s16(int16x8_t a, int16x8_t b) {
return vaddl_high_s16(a, b);
}
-// CHECK-LABEL: @test_vaddl_high_s32(
-// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = sext <2 x i32> [[SHUFFLE_I_I_I]] to <2 x i64>
-// CHECK: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I10_I]] to <8 x i8>
-// CHECK: [[TMP3:%.*]] = sext <2 x i32> [[SHUFFLE_I_I10_I]] to <2 x i64>
-// CHECK: [[ADD_I:%.*]] = add <2 x i64> [[TMP1]], [[TMP3]]
-// CHECK: ret <2 x i64> [[ADD_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vaddl_high_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[TMP0:%.*]] = sext <2 x i32> [[SHUFFLE_I_I10_I]] to <2 x i64>
+// CHECK-NEXT: [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[TMP1:%.*]] = sext <2 x i32> [[SHUFFLE_I_I_I]] to <2 x i64>
+// CHECK-NEXT: [[ADD_I:%.*]] = add nsw <2 x i64> [[TMP0]], [[TMP1]]
+// CHECK-NEXT: ret <2 x i64> [[ADD_I]]
+//
int64x2_t test_vaddl_high_s32(int32x4_t a, int32x4_t b) {
return vaddl_high_s32(a, b);
}
-// CHECK-LABEL: @test_vaddl_high_u8(
-// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK: [[TMP0:%.*]] = zext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16>
-// CHECK: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK: [[TMP1:%.*]] = zext <8 x i8> [[SHUFFLE_I_I10_I]] to <8 x i16>
-// CHECK: [[ADD_I:%.*]] = add <8 x i16> [[TMP0]], [[TMP1]]
-// CHECK: ret <8 x i16> [[ADD_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vaddl_high_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT: [[TMP0:%.*]] = zext <8 x i8> [[SHUFFLE_I_I10_I]] to <8 x i16>
+// CHECK-NEXT: [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT: [[TMP1:%.*]] = zext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16>
+// CHECK-NEXT: [[ADD_I:%.*]] = add nuw nsw <8 x i16> [[TMP0]], [[TMP1]]
+// CHECK-NEXT: ret <8 x i16> [[ADD_I]]
+//
uint16x8_t test_vaddl_high_u8(uint8x16_t a, uint8x16_t b) {
return vaddl_high_u8(a, b);
}
-// CHECK-LABEL: @test_vaddl_high_u16(
-// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = zext <4 x i16> [[SHUFFLE_I_I_I]] to <4 x i32>
-// CHECK: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I10_I]] to <8 x i8>
-// CHECK: [[TMP3:%.*]] = zext <4 x i16> [[SHUFFLE_I_I10_I]] to <4 x i32>
-// CHECK: [[ADD_I:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]]
-// CHECK: ret <4 x i32> [[ADD_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vaddl_high_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[TMP0:%.*]] = zext <4 x i16> [[SHUFFLE_I_I10_I]] to <4 x i32>
+// CHECK-NEXT: [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[TMP1:%.*]] = zext <4 x i16> [[SHUFFLE_I_I_I]] to <4 x i32>
+// CHECK-NEXT: [[ADD_I:%.*]] = add nuw nsw <4 x i32> [[TMP0]], [[TMP1]]
+// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
+//
uint32x4_t test_vaddl_high_u16(uint16x8_t a, uint16x8_t b) {
return vaddl_high_u16(a, b);
}
-// CHECK-LABEL: @test_vaddl_high_u32(
-// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = zext <2 x i32> [[SHUFFLE_I_I_I]] to <2 x i64>
-// CHECK: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I10_I]] to <8 x i8>
-// CHECK: [[TMP3:%.*]] = zext <2 x i32> [[SHUFFLE_I_I10_I]] to <2 x i64>
-// CHECK: [[ADD_I:%.*]] = add <2 x i64> [[TMP1]], [[TMP3]]
-// CHECK: ret <2 x i64> [[ADD_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vaddl_high_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[TMP0:%.*]] = zext <2 x i32> [[SHUFFLE_I_I10_I]] to <2 x i64>
+// CHECK-NEXT: [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[TMP1:%.*]] = zext <2 x i32> [[SHUFFLE_I_I_I]] to <2 x i64>
+// CHECK-NEXT: [[ADD_I:%.*]] = add nuw nsw <2 x i64> [[TMP0]], [[TMP1]]
+// CHECK-NEXT: ret <2 x i64> [[ADD_I]]
+//
uint64x2_t test_vaddl_high_u32(uint32x4_t a, uint32x4_t b) {
return vaddl_high_u32(a, b);
}
-// CHECK-LABEL: @test_vaddw_s8(
-// CHECK: [[VMOVL_I_I:%.*]] = sext <8 x i8> %b to <8 x i16>
-// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I]]
-// CHECK: ret <8 x i16> [[ADD_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vaddw_s8(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = sext <8 x i8> [[B]] to <8 x i16>
+// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[A]], [[VMOVL_I_I]]
+// CHECK-NEXT: ret <8 x i16> [[ADD_I]]
+//
int16x8_t test_vaddw_s8(int16x8_t a, int8x8_t b) {
return vaddw_s8(a, b);
}
-// CHECK-LABEL: @test_vaddw_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = sext <4 x i16> %b to <4 x i32>
-// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I]]
-// CHECK: ret <4 x i32> [[ADD_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vaddw_s16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = sext <4 x i16> [[B]] to <4 x i32>
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A]], [[VMOVL_I_I]]
+// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
+//
int32x4_t test_vaddw_s16(int32x4_t a, int16x4_t b) {
return vaddw_s16(a, b);
}
-// CHECK-LABEL: @test_vaddw_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = sext <2 x i32> %b to <2 x i64>
-// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I]]
-// CHECK: ret <2 x i64> [[ADD_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vaddw_s32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = sext <2 x i32> [[B]] to <2 x i64>
+// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A]], [[VMOVL_I_I]]
+// CHECK-NEXT: ret <2 x i64> [[ADD_I]]
+//
int64x2_t test_vaddw_s32(int64x2_t a, int32x2_t b) {
return vaddw_s32(a, b);
}
-// CHECK-LABEL: @test_vaddw_u8(
-// CHECK: [[VMOVL_I_I:%.*]] = zext <8 x i8> %b to <8 x i16>
-// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I]]
-// CHECK: ret <8 x i16> [[ADD_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vaddw_u8(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = zext <8 x i8> [[B]] to <8 x i16>
+// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[A]], [[VMOVL_I_I]]
+// CHECK-NEXT: ret <8 x i16> [[ADD_I]]
+//
uint16x8_t test_vaddw_u8(uint16x8_t a, uint8x8_t b) {
return vaddw_u8(a, b);
}
-// CHECK-LABEL: @test_vaddw_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> %b to <4 x i32>
-// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I]]
-// CHECK: ret <4 x i32> [[ADD_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vaddw_u16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = zext <4 x i16> [[B]] to <4 x i32>
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A]], [[VMOVL_I_I]]
+// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
+//
uint32x4_t test_vaddw_u16(uint32x4_t a, uint16x4_t b) {
return vaddw_u16(a, b);
}
-// CHECK-LABEL: @test_vaddw_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> %b to <2 x i64>
-// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I]]
-// CHECK: ret <2 x i64> [[ADD_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vaddw_u32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = zext <2 x i32> [[B]] to <2 x i64>
+// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A]], [[VMOVL_I_I]]
+// CHECK-NEXT: ret <2 x i64> [[ADD_I]]
+//
uint64x2_t test_vaddw_u32(uint64x2_t a, uint32x2_t b) {
return vaddw_u32(a, b);
}
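
By contrast, the vaddw_* adds above stay flag-free: only the second operand is widened, the first is already a full-width accumulator, so the sum genuinely can wrap. One lane in scalar form (illustrative):

#include <stdint.h>
/* One lane of vaddw_s8(a, b): the accumulator is already 16 bits wide. */
static inline int16_t saddw_lane(int16_t acc, int8_t b) {
  return (int16_t)(acc + (int16_t)b);  /* may wrap, hence no nsw in the IR */
}
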
-// CHECK-LABEL: @test_vaddw_high_s8(
-// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK: [[TMP0:%.*]] = sext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16>
-// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[TMP0]]
-// CHECK: ret <8 x i16> [[ADD_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vaddw_high_s8(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT: [[TMP0:%.*]] = sext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16>
+// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[A]], [[TMP0]]
+// CHECK-NEXT: ret <8 x i16> [[ADD_I]]
+//
int16x8_t test_vaddw_high_s8(int16x8_t a, int8x16_t b) {
return vaddw_high_s8(a, b);
}
-// CHECK-LABEL: @test_vaddw_high_s16(
-// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = sext <4 x i16> [[SHUFFLE_I_I_I]] to <4 x i32>
-// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[TMP1]]
-// CHECK: ret <4 x i32> [[ADD_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vaddw_high_s16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[TMP0:%.*]] = sext <4 x i16> [[SHUFFLE_I_I_I]] to <4 x i32>
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A]], [[TMP0]]
+// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
+//
int32x4_t test_vaddw_high_s16(int32x4_t a, int16x8_t b) {
return vaddw_high_s16(a, b);
}
-// CHECK-LABEL: @test_vaddw_high_s32(
-// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = sext <2 x i32> [[SHUFFLE_I_I_I]] to <2 x i64>
-// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[TMP1]]
-// CHECK: ret <2 x i64> [[ADD_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vaddw_high_s32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[TMP0:%.*]] = sext <2 x i32> [[SHUFFLE_I_I_I]] to <2 x i64>
+// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A]], [[TMP0]]
+// CHECK-NEXT: ret <2 x i64> [[ADD_I]]
+//
int64x2_t test_vaddw_high_s32(int64x2_t a, int32x4_t b) {
return vaddw_high_s32(a, b);
}
-// CHECK-LABEL: @test_vaddw_high_u8(
-// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK: [[TMP0:%.*]] = zext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16>
-// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[TMP0]]
-// CHECK: ret <8 x i16> [[ADD_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vaddw_high_u8(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT: [[TMP0:%.*]] = zext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16>
+// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[A]], [[TMP0]]
+// CHECK-NEXT: ret <8 x i16> [[ADD_I]]
+//
uint16x8_t test_vaddw_high_u8(uint16x8_t a, uint8x16_t b) {
return vaddw_high_u8(a, b);
}
-// CHECK-LABEL: @test_vaddw_high_u16(
-// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = zext <4 x i16> [[SHUFFLE_I_I_I]] to <4 x i32>
-// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[TMP1]]
-// CHECK: ret <4 x i32> [[ADD_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vaddw_high_u16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[TMP0:%.*]] = zext <4 x i16> [[SHUFFLE_I_I_I]] to <4 x i32>
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A]], [[TMP0]]
+// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
+//
uint32x4_t test_vaddw_high_u16(uint32x4_t a, uint16x8_t b) {
return vaddw_high_u16(a, b);
}
-// CHECK-LABEL: @test_vaddw_high_u32(
-// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = zext <2 x i32> [[SHUFFLE_I_I_I]] to <2 x i64>
-// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[TMP1]]
-// CHECK: ret <2 x i64> [[ADD_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vaddw_high_u32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[TMP0:%.*]] = zext <2 x i32> [[SHUFFLE_I_I_I]] to <2 x i64>
+// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A]], [[TMP0]]
+// CHECK-NEXT: ret <2 x i64> [[ADD_I]]
+//
uint64x2_t test_vaddw_high_u32(uint64x2_t a, uint32x4_t b) {
return vaddw_high_u32(a, b);
}
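// Note on the _high variants above: the shufflevector extracts lanes N/2..N-1
// of the 128-bit operand before widening. The regenerated checks use poison
// for the unused second shuffle operand (the canonical form for a
// single-input shuffle), where the old checks repeated the input vector.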
-// CHECK-LABEL: @test_vsubl_s8(
-// CHECK: [[VMOVL_I_I:%.*]] = sext <8 x i8> %a to <8 x i16>
-// CHECK: [[VMOVL_I4_I:%.*]] = sext <8 x i8> %b to <8 x i16>
-// CHECK: [[SUB_I:%.*]] = sub <8 x i16> [[VMOVL_I_I]], [[VMOVL_I4_I]]
-// CHECK: ret <8 x i16> [[SUB_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vsubl_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I4_I:%.*]] = sext <8 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = sext <8 x i8> [[B]] to <8 x i16>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub nsw <8 x i16> [[VMOVL_I4_I]], [[VMOVL_I_I]]
+// CHECK-NEXT: ret <8 x i16> [[SUB_I]]
+//
int16x8_t test_vsubl_s8(int8x8_t a, int8x8_t b) {
return vsubl_s8(a, b);
}
-// CHECK-LABEL: @test_vsubl_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = sext <4 x i16> %a to <4 x i32>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VMOVL_I4_I:%.*]] = sext <4 x i16> %b to <4 x i32>
-// CHECK: [[SUB_I:%.*]] = sub <4 x i32> [[VMOVL_I_I]], [[VMOVL_I4_I]]
-// CHECK: ret <4 x i32> [[SUB_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vsubl_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I4_I:%.*]] = sext <4 x i16> [[A]] to <4 x i32>
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = sext <4 x i16> [[B]] to <4 x i32>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub nsw <4 x i32> [[VMOVL_I4_I]], [[VMOVL_I_I]]
+// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
+//
int32x4_t test_vsubl_s16(int16x4_t a, int16x4_t b) {
return vsubl_s16(a, b);
}
-// CHECK-LABEL: @test_vsubl_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = sext <2 x i32> %a to <2 x i64>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VMOVL_I4_I:%.*]] = sext <2 x i32> %b to <2 x i64>
-// CHECK: [[SUB_I:%.*]] = sub <2 x i64> [[VMOVL_I_I]], [[VMOVL_I4_I]]
-// CHECK: ret <2 x i64> [[SUB_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vsubl_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I4_I:%.*]] = sext <2 x i32> [[A]] to <2 x i64>
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = sext <2 x i32> [[B]] to <2 x i64>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub nsw <2 x i64> [[VMOVL_I4_I]], [[VMOVL_I_I]]
+// CHECK-NEXT: ret <2 x i64> [[SUB_I]]
+//
int64x2_t test_vsubl_s32(int32x2_t a, int32x2_t b) {
return vsubl_s32(a, b);
}
-// CHECK-LABEL: @test_vsubl_u8(
-// CHECK: [[VMOVL_I_I:%.*]] = zext <8 x i8> %a to <8 x i16>
-// CHECK: [[VMOVL_I4_I:%.*]] = zext <8 x i8> %b to <8 x i16>
-// CHECK: [[SUB_I:%.*]] = sub <8 x i16> [[VMOVL_I_I]], [[VMOVL_I4_I]]
-// CHECK: ret <8 x i16> [[SUB_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vsubl_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I4_I:%.*]] = zext <8 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = zext <8 x i8> [[B]] to <8 x i16>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub nsw <8 x i16> [[VMOVL_I4_I]], [[VMOVL_I_I]]
+// CHECK-NEXT: ret <8 x i16> [[SUB_I]]
+//
uint16x8_t test_vsubl_u8(uint8x8_t a, uint8x8_t b) {
return vsubl_u8(a, b);
}
-// CHECK-LABEL: @test_vsubl_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> %a to <4 x i32>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VMOVL_I4_I:%.*]] = zext <4 x i16> %b to <4 x i32>
-// CHECK: [[SUB_I:%.*]] = sub <4 x i32> [[VMOVL_I_I]], [[VMOVL_I4_I]]
-// CHECK: ret <4 x i32> [[SUB_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vsubl_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I4_I:%.*]] = zext <4 x i16> [[A]] to <4 x i32>
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = zext <4 x i16> [[B]] to <4 x i32>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub nsw <4 x i32> [[VMOVL_I4_I]], [[VMOVL_I_I]]
+// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
+//
uint32x4_t test_vsubl_u16(uint16x4_t a, uint16x4_t b) {
return vsubl_u16(a, b);
}
-// CHECK-LABEL: @test_vsubl_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> %a to <2 x i64>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VMOVL_I4_I:%.*]] = zext <2 x i32> %b to <2 x i64>
-// CHECK: [[SUB_I:%.*]] = sub <2 x i64> [[VMOVL_I_I]], [[VMOVL_I4_I]]
-// CHECK: ret <2 x i64> [[SUB_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vsubl_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I4_I:%.*]] = zext <2 x i32> [[A]] to <2 x i64>
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = zext <2 x i32> [[B]] to <2 x i64>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub nsw <2 x i64> [[VMOVL_I4_I]], [[VMOVL_I_I]]
+// CHECK-NEXT: ret <2 x i64> [[SUB_I]]
+//
uint64x2_t test_vsubl_u32(uint32x2_t a, uint32x2_t b) {
return vsubl_u32(a, b);
}
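// Note: vsubl_* widens both operands before subtracting, and the regenerated
// checks add the nsw flag even on the unsigned variants. That is sound: two
// values extended from N bits always differ by less than 2^N, so the 2N-bit
// subtraction cannot wrap in the signed sense. Hypothetical per-lane
// reference (not from the patch) for the unsigned 8 -> 16-bit case:
static inline uint16_t vsubl_u8_ref_lane(uint8_t a, uint8_t b) {
  return (uint16_t)((uint16_t)a - (uint16_t)b); // zext both; the difference fits the signed i16 range
}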
-// CHECK-LABEL: @test_vsubl_high_s8(
-// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK: [[TMP0:%.*]] = sext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16>
-// CHECK: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK: [[TMP1:%.*]] = sext <8 x i8> [[SHUFFLE_I_I10_I]] to <8 x i16>
-// CHECK: [[SUB_I:%.*]] = sub <8 x i16> [[TMP0]], [[TMP1]]
-// CHECK: ret <8 x i16> [[SUB_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vsubl_high_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT: [[TMP0:%.*]] = sext <8 x i8> [[SHUFFLE_I_I10_I]] to <8 x i16>
+// CHECK-NEXT: [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub nsw <8 x i16> [[TMP0]], [[TMP1]]
+// CHECK-NEXT: ret <8 x i16> [[SUB_I]]
+//
int16x8_t test_vsubl_high_s8(int8x16_t a, int8x16_t b) {
return vsubl_high_s8(a, b);
}
-// CHECK-LABEL: @test_vsubl_high_s16(
-// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = sext <4 x i16> [[SHUFFLE_I_I_I]] to <4 x i32>
-// CHECK: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I10_I]] to <8 x i8>
-// CHECK: [[TMP3:%.*]] = sext <4 x i16> [[SHUFFLE_I_I10_I]] to <4 x i32>
-// CHECK: [[SUB_I:%.*]] = sub <4 x i32> [[TMP1]], [[TMP3]]
-// CHECK: ret <4 x i32> [[SUB_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vsubl_high_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[TMP0:%.*]] = sext <4 x i16> [[SHUFFLE_I_I10_I]] to <4 x i32>
+// CHECK-NEXT: [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[TMP1:%.*]] = sext <4 x i16> [[SHUFFLE_I_I_I]] to <4 x i32>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub nsw <4 x i32> [[TMP0]], [[TMP1]]
+// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
+//
int32x4_t test_vsubl_high_s16(int16x8_t a, int16x8_t b) {
return vsubl_high_s16(a, b);
}
-// CHECK-LABEL: @test_vsubl_high_s32(
-// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = sext <2 x i32> [[SHUFFLE_I_I_I]] to <2 x i64>
-// CHECK: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I10_I]] to <8 x i8>
-// CHECK: [[TMP3:%.*]] = sext <2 x i32> [[SHUFFLE_I_I10_I]] to <2 x i64>
-// CHECK: [[SUB_I:%.*]] = sub <2 x i64> [[TMP1]], [[TMP3]]
-// CHECK: ret <2 x i64> [[SUB_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vsubl_high_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[TMP0:%.*]] = sext <2 x i32> [[SHUFFLE_I_I10_I]] to <2 x i64>
+// CHECK-NEXT: [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[TMP1:%.*]] = sext <2 x i32> [[SHUFFLE_I_I_I]] to <2 x i64>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub nsw <2 x i64> [[TMP0]], [[TMP1]]
+// CHECK-NEXT: ret <2 x i64> [[SUB_I]]
+//
int64x2_t test_vsubl_high_s32(int32x4_t a, int32x4_t b) {
return vsubl_high_s32(a, b);
}
-// CHECK-LABEL: @test_vsubl_high_u8(
-// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK: [[TMP0:%.*]] = zext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16>
-// CHECK: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK: [[TMP1:%.*]] = zext <8 x i8> [[SHUFFLE_I_I10_I]] to <8 x i16>
-// CHECK: [[SUB_I:%.*]] = sub <8 x i16> [[TMP0]], [[TMP1]]
-// CHECK: ret <8 x i16> [[SUB_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vsubl_high_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT: [[TMP0:%.*]] = zext <8 x i8> [[SHUFFLE_I_I10_I]] to <8 x i16>
+// CHECK-NEXT: [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT: [[TMP1:%.*]] = zext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub nsw <8 x i16> [[TMP0]], [[TMP1]]
+// CHECK-NEXT: ret <8 x i16> [[SUB_I]]
+//
uint16x8_t test_vsubl_high_u8(uint8x16_t a, uint8x16_t b) {
return vsubl_high_u8(a, b);
}
-// CHECK-LABEL: @test_vsubl_high_u16(
-// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = zext <4 x i16> [[SHUFFLE_I_I_I]] to <4 x i32>
-// CHECK: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I10_I]] to <8 x i8>
-// CHECK: [[TMP3:%.*]] = zext <4 x i16> [[SHUFFLE_I_I10_I]] to <4 x i32>
-// CHECK: [[SUB_I:%.*]] = sub <4 x i32> [[TMP1]], [[TMP3]]
-// CHECK: ret <4 x i32> [[SUB_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vsubl_high_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[TMP0:%.*]] = zext <4 x i16> [[SHUFFLE_I_I10_I]] to <4 x i32>
+// CHECK-NEXT: [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[TMP1:%.*]] = zext <4 x i16> [[SHUFFLE_I_I_I]] to <4 x i32>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub nsw <4 x i32> [[TMP0]], [[TMP1]]
+// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
+//
uint32x4_t test_vsubl_high_u16(uint16x8_t a, uint16x8_t b) {
return vsubl_high_u16(a, b);
}
-// CHECK-LABEL: @test_vsubl_high_u32(
-// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = zext <2 x i32> [[SHUFFLE_I_I_I]] to <2 x i64>
-// CHECK: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I10_I]] to <8 x i8>
-// CHECK: [[TMP3:%.*]] = zext <2 x i32> [[SHUFFLE_I_I10_I]] to <2 x i64>
-// CHECK: [[SUB_I:%.*]] = sub <2 x i64> [[TMP1]], [[TMP3]]
-// CHECK: ret <2 x i64> [[SUB_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vsubl_high_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[TMP0:%.*]] = zext <2 x i32> [[SHUFFLE_I_I10_I]] to <2 x i64>
+// CHECK-NEXT: [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[TMP1:%.*]] = zext <2 x i32> [[SHUFFLE_I_I_I]] to <2 x i64>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub nsw <2 x i64> [[TMP0]], [[TMP1]]
+// CHECK-NEXT: ret <2 x i64> [[SUB_I]]
+//
uint64x2_t test_vsubl_high_u32(uint32x4_t a, uint32x4_t b) {
return vsubl_high_u32(a, b);
}
-// CHECK-LABEL: @test_vsubw_s8(
-// CHECK: [[VMOVL_I_I:%.*]] = sext <8 x i8> %b to <8 x i16>
-// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[VMOVL_I_I]]
-// CHECK: ret <8 x i16> [[SUB_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vsubw_s8(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = sext <8 x i8> [[B]] to <8 x i16>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> [[A]], [[VMOVL_I_I]]
+// CHECK-NEXT: ret <8 x i16> [[SUB_I]]
+//
int16x8_t test_vsubw_s8(int16x8_t a, int8x8_t b) {
return vsubw_s8(a, b);
}
-// CHECK-LABEL: @test_vsubw_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = sext <4 x i16> %b to <4 x i32>
-// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMOVL_I_I]]
-// CHECK: ret <4 x i32> [[SUB_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vsubw_s16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = sext <4 x i16> [[B]] to <4 x i32>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A]], [[VMOVL_I_I]]
+// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
+//
int32x4_t test_vsubw_s16(int32x4_t a, int16x4_t b) {
return vsubw_s16(a, b);
}
-// CHECK-LABEL: @test_vsubw_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = sext <2 x i32> %b to <2 x i64>
-// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMOVL_I_I]]
-// CHECK: ret <2 x i64> [[SUB_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vsubw_s32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = sext <2 x i32> [[B]] to <2 x i64>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[A]], [[VMOVL_I_I]]
+// CHECK-NEXT: ret <2 x i64> [[SUB_I]]
+//
int64x2_t test_vsubw_s32(int64x2_t a, int32x2_t b) {
return vsubw_s32(a, b);
}
-// CHECK-LABEL: @test_vsubw_u8(
-// CHECK: [[VMOVL_I_I:%.*]] = zext <8 x i8> %b to <8 x i16>
-// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[VMOVL_I_I]]
-// CHECK: ret <8 x i16> [[SUB_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vsubw_u8(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = zext <8 x i8> [[B]] to <8 x i16>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> [[A]], [[VMOVL_I_I]]
+// CHECK-NEXT: ret <8 x i16> [[SUB_I]]
+//
uint16x8_t test_vsubw_u8(uint16x8_t a, uint8x8_t b) {
return vsubw_u8(a, b);
}
-// CHECK-LABEL: @test_vsubw_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> %b to <4 x i32>
-// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMOVL_I_I]]
-// CHECK: ret <4 x i32> [[SUB_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vsubw_u16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = zext <4 x i16> [[B]] to <4 x i32>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A]], [[VMOVL_I_I]]
+// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
+//
uint32x4_t test_vsubw_u16(uint32x4_t a, uint16x4_t b) {
return vsubw_u16(a, b);
}
-// CHECK-LABEL: @test_vsubw_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> %b to <2 x i64>
-// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMOVL_I_I]]
-// CHECK: ret <2 x i64> [[SUB_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vsubw_u32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = zext <2 x i32> [[B]] to <2 x i64>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[A]], [[VMOVL_I_I]]
+// CHECK-NEXT: ret <2 x i64> [[SUB_I]]
+//
uint64x2_t test_vsubw_u32(uint64x2_t a, uint32x2_t b) {
return vsubw_u32(a, b);
}
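// Note: vsubw_* widens only the second operand, so unlike vsubl_* the
// regenerated checks keep a plain sub with no nsw flag; the wide operand
// already spans the full 2N-bit range, and the difference may legitimately
// wrap.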
-// CHECK-LABEL: @test_vsubw_high_s8(
-// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK: [[TMP0:%.*]] = sext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16>
-// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[TMP0]]
-// CHECK: ret <8 x i16> [[SUB_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vsubw_high_s8(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT: [[TMP0:%.*]] = sext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> [[A]], [[TMP0]]
+// CHECK-NEXT: ret <8 x i16> [[SUB_I]]
+//
int16x8_t test_vsubw_high_s8(int16x8_t a, int8x16_t b) {
return vsubw_high_s8(a, b);
}
-// CHECK-LABEL: @test_vsubw_high_s16(
-// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = sext <4 x i16> [[SHUFFLE_I_I_I]] to <4 x i32>
-// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[TMP1]]
-// CHECK: ret <4 x i32> [[SUB_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vsubw_high_s16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[TMP0:%.*]] = sext <4 x i16> [[SHUFFLE_I_I_I]] to <4 x i32>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A]], [[TMP0]]
+// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
+//
int32x4_t test_vsubw_high_s16(int32x4_t a, int16x8_t b) {
return vsubw_high_s16(a, b);
}
-// CHECK-LABEL: @test_vsubw_high_s32(
-// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = sext <2 x i32> [[SHUFFLE_I_I_I]] to <2 x i64>
-// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[TMP1]]
-// CHECK: ret <2 x i64> [[SUB_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vsubw_high_s32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[TMP0:%.*]] = sext <2 x i32> [[SHUFFLE_I_I_I]] to <2 x i64>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[A]], [[TMP0]]
+// CHECK-NEXT: ret <2 x i64> [[SUB_I]]
+//
int64x2_t test_vsubw_high_s32(int64x2_t a, int32x4_t b) {
return vsubw_high_s32(a, b);
}
-// CHECK-LABEL: @test_vsubw_high_u8(
-// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK: [[TMP0:%.*]] = zext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16>
-// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[TMP0]]
-// CHECK: ret <8 x i16> [[SUB_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vsubw_high_u8(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT: [[TMP0:%.*]] = zext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> [[A]], [[TMP0]]
+// CHECK-NEXT: ret <8 x i16> [[SUB_I]]
+//
uint16x8_t test_vsubw_high_u8(uint16x8_t a, uint8x16_t b) {
return vsubw_high_u8(a, b);
}
-// CHECK-LABEL: @test_vsubw_high_u16(
-// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = zext <4 x i16> [[SHUFFLE_I_I_I]] to <4 x i32>
-// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[TMP1]]
-// CHECK: ret <4 x i32> [[SUB_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vsubw_high_u16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[TMP0:%.*]] = zext <4 x i16> [[SHUFFLE_I_I_I]] to <4 x i32>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A]], [[TMP0]]
+// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
+//
uint32x4_t test_vsubw_high_u16(uint32x4_t a, uint16x8_t b) {
return vsubw_high_u16(a, b);
}
-// CHECK-LABEL: @test_vsubw_high_u32(
-// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = zext <2 x i32> [[SHUFFLE_I_I_I]] to <2 x i64>
-// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[TMP1]]
-// CHECK: ret <2 x i64> [[SUB_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vsubw_high_u32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[TMP0:%.*]] = zext <2 x i32> [[SHUFFLE_I_I_I]] to <2 x i64>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[A]], [[TMP0]]
+// CHECK-NEXT: ret <2 x i64> [[SUB_I]]
+//
uint64x2_t test_vsubw_high_u32(uint64x2_t a, uint32x4_t b) {
return vsubw_high_u32(a, b);
}
-// CHECK-LABEL: @test_vaddhn_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VADDHN_I:%.*]] = add <8 x i16> %a, %b
-// CHECK: [[VADDHN1_I:%.*]] = lshr <8 x i16> [[VADDHN_I]], splat (i16 8)
-// CHECK: [[VADDHN2_I:%.*]] = trunc <8 x i16> [[VADDHN1_I]] to <8 x i8>
-// CHECK: ret <8 x i8> [[VADDHN2_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vaddhn_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VADDHN_I:%.*]] = add <8 x i16> [[A]], [[B]]
+// CHECK-NEXT: [[VADDHN1_I:%.*]] = lshr <8 x i16> [[VADDHN_I]], splat (i16 8)
+// CHECK-NEXT: [[VADDHN2_I:%.*]] = trunc nuw <8 x i16> [[VADDHN1_I]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[VADDHN2_I]]
+//
int8x8_t test_vaddhn_s16(int16x8_t a, int16x8_t b) {
return vaddhn_s16(a, b);
}
-// CHECK-LABEL: @test_vaddhn_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VADDHN_I:%.*]] = add <4 x i32> %a, %b
-// CHECK: [[VADDHN1_I:%.*]] = lshr <4 x i32> [[VADDHN_I]], splat (i32 16)
-// CHECK: [[VADDHN2_I:%.*]] = trunc <4 x i32> [[VADDHN1_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[VADDHN2_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vaddhn_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VADDHN_I:%.*]] = add <4 x i32> [[A]], [[B]]
+// CHECK-NEXT: [[VADDHN1_I:%.*]] = lshr <4 x i32> [[VADDHN_I]], splat (i32 16)
+// CHECK-NEXT: [[VADDHN2_I:%.*]] = trunc nuw <4 x i32> [[VADDHN1_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[VADDHN2_I]]
+//
int16x4_t test_vaddhn_s32(int32x4_t a, int32x4_t b) {
return vaddhn_s32(a, b);
}
-// CHECK-LABEL: @test_vaddhn_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VADDHN_I:%.*]] = add <2 x i64> %a, %b
-// CHECK: [[VADDHN1_I:%.*]] = lshr <2 x i64> [[VADDHN_I]], splat (i64 32)
-// CHECK: [[VADDHN2_I:%.*]] = trunc <2 x i64> [[VADDHN1_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[VADDHN2_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vaddhn_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VADDHN_I:%.*]] = add <2 x i64> [[A]], [[B]]
+// CHECK-NEXT: [[VADDHN1_I:%.*]] = lshr <2 x i64> [[VADDHN_I]], splat (i64 32)
+// CHECK-NEXT: [[VADDHN2_I:%.*]] = trunc nuw <2 x i64> [[VADDHN1_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[VADDHN2_I]]
+//
int32x2_t test_vaddhn_s64(int64x2_t a, int64x2_t b) {
return vaddhn_s64(a, b);
}
-// CHECK-LABEL: @test_vaddhn_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VADDHN_I:%.*]] = add <8 x i16> %a, %b
-// CHECK: [[VADDHN1_I:%.*]] = lshr <8 x i16> [[VADDHN_I]], splat (i16 8)
-// CHECK: [[VADDHN2_I:%.*]] = trunc <8 x i16> [[VADDHN1_I]] to <8 x i8>
-// CHECK: ret <8 x i8> [[VADDHN2_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vaddhn_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VADDHN_I:%.*]] = add <8 x i16> [[A]], [[B]]
+// CHECK-NEXT: [[VADDHN1_I:%.*]] = lshr <8 x i16> [[VADDHN_I]], splat (i16 8)
+// CHECK-NEXT: [[VADDHN2_I:%.*]] = trunc nuw <8 x i16> [[VADDHN1_I]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[VADDHN2_I]]
+//
uint8x8_t test_vaddhn_u16(uint16x8_t a, uint16x8_t b) {
return vaddhn_u16(a, b);
}
-// CHECK-LABEL: @test_vaddhn_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VADDHN_I:%.*]] = add <4 x i32> %a, %b
-// CHECK: [[VADDHN1_I:%.*]] = lshr <4 x i32> [[VADDHN_I]], splat (i32 16)
-// CHECK: [[VADDHN2_I:%.*]] = trunc <4 x i32> [[VADDHN1_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[VADDHN2_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vaddhn_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VADDHN_I:%.*]] = add <4 x i32> [[A]], [[B]]
+// CHECK-NEXT: [[VADDHN1_I:%.*]] = lshr <4 x i32> [[VADDHN_I]], splat (i32 16)
+// CHECK-NEXT: [[VADDHN2_I:%.*]] = trunc nuw <4 x i32> [[VADDHN1_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[VADDHN2_I]]
+//
uint16x4_t test_vaddhn_u32(uint32x4_t a, uint32x4_t b) {
return vaddhn_u32(a, b);
}
-// CHECK-LABEL: @test_vaddhn_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VADDHN_I:%.*]] = add <2 x i64> %a, %b
-// CHECK: [[VADDHN1_I:%.*]] = lshr <2 x i64> [[VADDHN_I]], splat (i64 32)
-// CHECK: [[VADDHN2_I:%.*]] = trunc <2 x i64> [[VADDHN1_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[VADDHN2_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vaddhn_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VADDHN_I:%.*]] = add <2 x i64> [[A]], [[B]]
+// CHECK-NEXT: [[VADDHN1_I:%.*]] = lshr <2 x i64> [[VADDHN_I]], splat (i64 32)
+// CHECK-NEXT: [[VADDHN2_I:%.*]] = trunc nuw <2 x i64> [[VADDHN1_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[VADDHN2_I]]
+//
uint32x2_t test_vaddhn_u64(uint64x2_t a, uint64x2_t b) {
return vaddhn_u64(a, b);
}
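// Illustrative sketch: vaddhn_* returns the high half of each lane of the
// sum, expressed in IR as add, lshr by half the lane width, then trunc. The
// trunc now carries nuw because the lshr has already cleared every bit being
// dropped. Hypothetical per-lane reference (not from the patch) for the
// 16 -> 8-bit case:
static inline uint8_t vaddhn_u16_ref_lane(uint16_t a, uint16_t b) {
  return (uint8_t)((uint16_t)(a + b) >> 8); // high byte of the wrapped sum
}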
-// CHECK-LABEL: @test_vaddhn_high_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VADDHN_I_I:%.*]] = add <8 x i16> %a, %b
-// CHECK: [[VADDHN1_I_I:%.*]] = lshr <8 x i16> [[VADDHN_I_I]], splat (i16 8)
-// CHECK: [[VADDHN2_I_I:%.*]] = trunc <8 x i16> [[VADDHN1_I_I]] to <8 x i8>
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %r, <8 x i8> [[VADDHN2_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK: ret <16 x i8> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vaddhn_high_s16(
+// CHECK-SAME: <8 x i8> noundef [[R:%.*]], <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VADDHN_I_I:%.*]] = add <8 x i16> [[A]], [[B]]
+// CHECK-NEXT: [[VADDHN1_I_I:%.*]] = lshr <8 x i16> [[VADDHN_I_I]], splat (i16 8)
+// CHECK-NEXT: [[VADDHN2_I_I:%.*]] = trunc nuw <8 x i16> [[VADDHN1_I_I]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> [[R]], <8 x i8> [[VADDHN2_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I_I]]
+//
int8x16_t test_vaddhn_high_s16(int8x8_t r, int16x8_t a, int16x8_t b) {
return vaddhn_high_s16(r, a, b);
}
-// CHECK-LABEL: @test_vaddhn_high_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VADDHN_I_I:%.*]] = add <4 x i32> %a, %b
-// CHECK: [[VADDHN1_I_I:%.*]] = lshr <4 x i32> [[VADDHN_I_I]], splat (i32 16)
-// CHECK: [[VADDHN2_I_I:%.*]] = trunc <4 x i32> [[VADDHN1_I_I]] to <4 x i16>
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %r, <4 x i16> [[VADDHN2_I_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-// CHECK: ret <8 x i16> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vaddhn_high_s32(
+// CHECK-SAME: <4 x i16> noundef [[R:%.*]], <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VADDHN_I_I:%.*]] = add <4 x i32> [[A]], [[B]]
+// CHECK-NEXT: [[VADDHN1_I_I:%.*]] = lshr <4 x i32> [[VADDHN_I_I]], splat (i32 16)
+// CHECK-NEXT: [[VADDHN2_I_I:%.*]] = trunc nuw <4 x i32> [[VADDHN1_I_I]] to <4 x i16>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> [[R]], <4 x i16> [[VADDHN2_I_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I_I]]
+//
int16x8_t test_vaddhn_high_s32(int16x4_t r, int32x4_t a, int32x4_t b) {
return vaddhn_high_s32(r, a, b);
}
-// CHECK-LABEL: @test_vaddhn_high_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VADDHN_I_I:%.*]] = add <2 x i64> %a, %b
-// CHECK: [[VADDHN1_I_I:%.*]] = lshr <2 x i64> [[VADDHN_I_I]], splat (i64 32)
-// CHECK: [[VADDHN2_I_I:%.*]] = trunc <2 x i64> [[VADDHN1_I_I]] to <2 x i32>
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %r, <2 x i32> [[VADDHN2_I_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK: ret <4 x i32> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vaddhn_high_s64(
+// CHECK-SAME: <2 x i32> noundef [[R:%.*]], <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VADDHN_I_I:%.*]] = add <2 x i64> [[A]], [[B]]
+// CHECK-NEXT: [[VADDHN1_I_I:%.*]] = lshr <2 x i64> [[VADDHN_I_I]], splat (i64 32)
+// CHECK-NEXT: [[VADDHN2_I_I:%.*]] = trunc nuw <2 x i64> [[VADDHN1_I_I]] to <2 x i32>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> [[R]], <2 x i32> [[VADDHN2_I_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I_I]]
+//
int32x4_t test_vaddhn_high_s64(int32x2_t r, int64x2_t a, int64x2_t b) {
return vaddhn_high_s64(r, a, b);
}
-// CHECK-LABEL: @test_vaddhn_high_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VADDHN_I_I:%.*]] = add <8 x i16> %a, %b
-// CHECK: [[VADDHN1_I_I:%.*]] = lshr <8 x i16> [[VADDHN_I_I]], splat (i16 8)
-// CHECK: [[VADDHN2_I_I:%.*]] = trunc <8 x i16> [[VADDHN1_I_I]] to <8 x i8>
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %r, <8 x i8> [[VADDHN2_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK: ret <16 x i8> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vaddhn_high_u16(
+// CHECK-SAME: <8 x i8> noundef [[R:%.*]], <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VADDHN_I_I:%.*]] = add <8 x i16> [[A]], [[B]]
+// CHECK-NEXT: [[VADDHN1_I_I:%.*]] = lshr <8 x i16> [[VADDHN_I_I]], splat (i16 8)
+// CHECK-NEXT: [[VADDHN2_I_I:%.*]] = trunc nuw <8 x i16> [[VADDHN1_I_I]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> [[R]], <8 x i8> [[VADDHN2_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I_I]]
+//
uint8x16_t test_vaddhn_high_u16(uint8x8_t r, uint16x8_t a, uint16x8_t b) {
return vaddhn_high_u16(r, a, b);
}
-// CHECK-LABEL: @test_vaddhn_high_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VADDHN_I_I:%.*]] = add <4 x i32> %a, %b
-// CHECK: [[VADDHN1_I_I:%.*]] = lshr <4 x i32> [[VADDHN_I_I]], splat (i32 16)
-// CHECK: [[VADDHN2_I_I:%.*]] = trunc <4 x i32> [[VADDHN1_I_I]] to <4 x i16>
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %r, <4 x i16> [[VADDHN2_I_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-// CHECK: ret <8 x i16> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vaddhn_high_u32(
+// CHECK-SAME: <4 x i16> noundef [[R:%.*]], <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VADDHN_I_I:%.*]] = add <4 x i32> [[A]], [[B]]
+// CHECK-NEXT: [[VADDHN1_I_I:%.*]] = lshr <4 x i32> [[VADDHN_I_I]], splat (i32 16)
+// CHECK-NEXT: [[VADDHN2_I_I:%.*]] = trunc nuw <4 x i32> [[VADDHN1_I_I]] to <4 x i16>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> [[R]], <4 x i16> [[VADDHN2_I_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I_I]]
+//
uint16x8_t test_vaddhn_high_u32(uint16x4_t r, uint32x4_t a, uint32x4_t b) {
return vaddhn_high_u32(r, a, b);
}
-// CHECK-LABEL: @test_vaddhn_high_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VADDHN_I_I:%.*]] = add <2 x i64> %a, %b
-// CHECK: [[VADDHN1_I_I:%.*]] = lshr <2 x i64> [[VADDHN_I_I]], splat (i64 32)
-// CHECK: [[VADDHN2_I_I:%.*]] = trunc <2 x i64> [[VADDHN1_I_I]] to <2 x i32>
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %r, <2 x i32> [[VADDHN2_I_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK: ret <4 x i32> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vaddhn_high_u64(
+// CHECK-SAME: <2 x i32> noundef [[R:%.*]], <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VADDHN_I_I:%.*]] = add <2 x i64> [[A]], [[B]]
+// CHECK-NEXT: [[VADDHN1_I_I:%.*]] = lshr <2 x i64> [[VADDHN_I_I]], splat (i64 32)
+// CHECK-NEXT: [[VADDHN2_I_I:%.*]] = trunc nuw <2 x i64> [[VADDHN1_I_I]] to <2 x i32>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> [[R]], <2 x i32> [[VADDHN2_I_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I_I]]
+//
uint32x4_t test_vaddhn_high_u64(uint32x2_t r, uint64x2_t a, uint64x2_t b) {
return vaddhn_high_u64(r, a, b);
}
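// Note on the _high variants above: they compute the same narrowed high
// halves and concatenate them after r, which the IR expresses as a two-input
// shufflevector selecting all lanes of r followed by all lanes of the
// narrowed sum.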
-// CHECK-LABEL: @test_vraddhn_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VRADDHN_V2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b)
-// CHECK: ret <8 x i8> [[VRADDHN_V2_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vraddhn_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRADDHN_V2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VRADDHN_V2_I]]
+//
int8x8_t test_vraddhn_s16(int16x8_t a, int16x8_t b) {
return vraddhn_s16(a, b);
}
-// CHECK-LABEL: @test_vraddhn_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VRADDHN_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VRADDHN_V3_I:%.*]] = bitcast <4 x i16> [[VRADDHN_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VRADDHN_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vraddhn_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRADDHN_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: ret <4 x i16> [[VRADDHN_V2_I]]
+//
int16x4_t test_vraddhn_s32(int32x4_t a, int32x4_t b) {
return vraddhn_s32(a, b);
}
-// CHECK-LABEL: @test_vraddhn_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VRADDHN_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b)
-// CHECK: [[VRADDHN_V3_I:%.*]] = bitcast <2 x i32> [[VRADDHN_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VRADDHN_V2_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vraddhn_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRADDHN_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> [[A]], <2 x i64> [[B]])
+// CHECK-NEXT: ret <2 x i32> [[VRADDHN_V2_I]]
+//
int32x2_t test_vraddhn_s64(int64x2_t a, int64x2_t b) {
return vraddhn_s64(a, b);
}
-// CHECK-LABEL: @test_vraddhn_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VRADDHN_V2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b)
-// CHECK: ret <8 x i8> [[VRADDHN_V2_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vraddhn_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRADDHN_V2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VRADDHN_V2_I]]
+//
uint8x8_t test_vraddhn_u16(uint16x8_t a, uint16x8_t b) {
return vraddhn_u16(a, b);
}
-// CHECK-LABEL: @test_vraddhn_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VRADDHN_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VRADDHN_V3_I:%.*]] = bitcast <4 x i16> [[VRADDHN_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VRADDHN_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vraddhn_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRADDHN_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: ret <4 x i16> [[VRADDHN_V2_I]]
+//
uint16x4_t test_vraddhn_u32(uint32x4_t a, uint32x4_t b) {
return vraddhn_u32(a, b);
}
-// CHECK-LABEL: @test_vraddhn_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VRADDHN_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b)
-// CHECK: [[VRADDHN_V3_I:%.*]] = bitcast <2 x i32> [[VRADDHN_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VRADDHN_V2_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vraddhn_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRADDHN_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> [[A]], <2 x i64> [[B]])
+// CHECK-NEXT: ret <2 x i32> [[VRADDHN_V2_I]]
+//
uint32x2_t test_vraddhn_u64(uint64x2_t a, uint64x2_t b) {
return vraddhn_u64(a, b);
}
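// Illustrative sketch: vraddhn_* is the rounding form of vaddhn_* and has no
// plain IR expansion, so it lowers to the @llvm.aarch64.neon.raddhn target
// intrinsic; the regenerated checks also drop the dead bitcast the old checks
// captured. Rounding adds half of the discarded range before taking the high
// half. Hypothetical per-lane reference (not from the patch) for the
// 16 -> 8-bit case:
static inline uint8_t vraddhn_u16_ref_lane(uint16_t a, uint16_t b) {
  return (uint8_t)((uint16_t)(a + b + 0x80) >> 8); // round, then take the high byte
}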
-// CHECK-LABEL: @test_vraddhn_high_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VRADDHN_V2_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b)
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %r, <8 x i8> [[VRADDHN_V2_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK: ret <16 x i8> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vraddhn_high_s16(
+// CHECK-SAME: <8 x i8> noundef [[R:%.*]], <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRADDHN_V2_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> [[R]], <8 x i8> [[VRADDHN_V2_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I_I]]
+//
int8x16_t test_vraddhn_high_s16(int8x8_t r, int16x8_t a, int16x8_t b) {
return vraddhn_high_s16(r, a, b);
}
-// CHECK-LABEL: @test_vraddhn_high_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VRADDHN_V2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VRADDHN_V3_I_I:%.*]] = bitcast <4 x i16> [[VRADDHN_V2_I_I]] to <8 x i8>
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %r, <4 x i16> [[VRADDHN_V2_I_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-// CHECK: ret <8 x i16> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vraddhn_high_s32(
+// CHECK-SAME: <4 x i16> noundef [[R:%.*]], <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRADDHN_V2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> [[R]], <4 x i16> [[VRADDHN_V2_I_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I_I]]
+//
int16x8_t test_vraddhn_high_s32(int16x4_t r, int32x4_t a, int32x4_t b) {
return vraddhn_high_s32(r, a, b);
}
-// CHECK-LABEL: @test_vraddhn_high_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VRADDHN_V2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b)
-// CHECK: [[VRADDHN_V3_I_I:%.*]] = bitcast <2 x i32> [[VRADDHN_V2_I_I]] to <8 x i8>
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %r, <2 x i32> [[VRADDHN_V2_I_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK: ret <4 x i32> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vraddhn_high_s64(
+// CHECK-SAME: <2 x i32> noundef [[R:%.*]], <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRADDHN_V2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> [[A]], <2 x i64> [[B]])
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> [[R]], <2 x i32> [[VRADDHN_V2_I_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I_I]]
+//
int32x4_t test_vraddhn_high_s64(int32x2_t r, int64x2_t a, int64x2_t b) {
return vraddhn_high_s64(r, a, b);
}
-// CHECK-LABEL: @test_vraddhn_high_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VRADDHN_V2_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b)
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %r, <8 x i8> [[VRADDHN_V2_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK: ret <16 x i8> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vraddhn_high_u16(
+// CHECK-SAME: <8 x i8> noundef [[R:%.*]], <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRADDHN_V2_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> [[R]], <8 x i8> [[VRADDHN_V2_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I_I]]
+//
uint8x16_t test_vraddhn_high_u16(uint8x8_t r, uint16x8_t a, uint16x8_t b) {
return vraddhn_high_u16(r, a, b);
}
-// CHECK-LABEL: @test_vraddhn_high_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VRADDHN_V2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VRADDHN_V3_I_I:%.*]] = bitcast <4 x i16> [[VRADDHN_V2_I_I]] to <8 x i8>
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %r, <4 x i16> [[VRADDHN_V2_I_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-// CHECK: ret <8 x i16> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vraddhn_high_u32(
+// CHECK-SAME: <4 x i16> noundef [[R:%.*]], <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRADDHN_V2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> [[R]], <4 x i16> [[VRADDHN_V2_I_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I_I]]
+//
uint16x8_t test_vraddhn_high_u32(uint16x4_t r, uint32x4_t a, uint32x4_t b) {
return vraddhn_high_u32(r, a, b);
}
-// CHECK-LABEL: @test_vraddhn_high_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VRADDHN_V2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b)
-// CHECK: [[VRADDHN_V3_I_I:%.*]] = bitcast <2 x i32> [[VRADDHN_V2_I_I]] to <8 x i8>
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %r, <2 x i32> [[VRADDHN_V2_I_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK: ret <4 x i32> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vraddhn_high_u64(
+// CHECK-SAME: <2 x i32> noundef [[R:%.*]], <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRADDHN_V2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> [[A]], <2 x i64> [[B]])
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> [[R]], <2 x i32> [[VRADDHN_V2_I_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I_I]]
+//
uint32x4_t test_vraddhn_high_u64(uint32x2_t r, uint64x2_t a, uint64x2_t b) {
return vraddhn_high_u64(r, a, b);
}
-// CHECK-LABEL: @test_vsubhn_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VSUBHN_I:%.*]] = sub <8 x i16> %a, %b
-// CHECK: [[VSUBHN1_I:%.*]] = lshr <8 x i16> [[VSUBHN_I]], splat (i16 8)
-// CHECK: [[VSUBHN2_I:%.*]] = trunc <8 x i16> [[VSUBHN1_I]] to <8 x i8>
-// CHECK: ret <8 x i8> [[VSUBHN2_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vsubhn_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSUBHN_I:%.*]] = sub <8 x i16> [[A]], [[B]]
+// CHECK-NEXT: [[VSUBHN1_I:%.*]] = lshr <8 x i16> [[VSUBHN_I]], splat (i16 8)
+// CHECK-NEXT: [[VSUBHN2_I:%.*]] = trunc nuw <8 x i16> [[VSUBHN1_I]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[VSUBHN2_I]]
+//
int8x8_t test_vsubhn_s16(int16x8_t a, int16x8_t b) {
return vsubhn_s16(a, b);
}
-// CHECK-LABEL: @test_vsubhn_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VSUBHN_I:%.*]] = sub <4 x i32> %a, %b
-// CHECK: [[VSUBHN1_I:%.*]] = lshr <4 x i32> [[VSUBHN_I]], splat (i32 16)
-// CHECK: [[VSUBHN2_I:%.*]] = trunc <4 x i32> [[VSUBHN1_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[VSUBHN2_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vsubhn_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSUBHN_I:%.*]] = sub <4 x i32> [[A]], [[B]]
+// CHECK-NEXT: [[VSUBHN1_I:%.*]] = lshr <4 x i32> [[VSUBHN_I]], splat (i32 16)
+// CHECK-NEXT: [[VSUBHN2_I:%.*]] = trunc nuw <4 x i32> [[VSUBHN1_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[VSUBHN2_I]]
+//
int16x4_t test_vsubhn_s32(int32x4_t a, int32x4_t b) {
return vsubhn_s32(a, b);
}
-// CHECK-LABEL: @test_vsubhn_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VSUBHN_I:%.*]] = sub <2 x i64> %a, %b
-// CHECK: [[VSUBHN1_I:%.*]] = lshr <2 x i64> [[VSUBHN_I]], splat (i64 32)
-// CHECK: [[VSUBHN2_I:%.*]] = trunc <2 x i64> [[VSUBHN1_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[VSUBHN2_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vsubhn_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSUBHN_I:%.*]] = sub <2 x i64> [[A]], [[B]]
+// CHECK-NEXT: [[VSUBHN1_I:%.*]] = lshr <2 x i64> [[VSUBHN_I]], splat (i64 32)
+// CHECK-NEXT: [[VSUBHN2_I:%.*]] = trunc nuw <2 x i64> [[VSUBHN1_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[VSUBHN2_I]]
+//
int32x2_t test_vsubhn_s64(int64x2_t a, int64x2_t b) {
return vsubhn_s64(a, b);
}
-// CHECK-LABEL: @test_vsubhn_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VSUBHN_I:%.*]] = sub <8 x i16> %a, %b
-// CHECK: [[VSUBHN1_I:%.*]] = lshr <8 x i16> [[VSUBHN_I]], splat (i16 8)
-// CHECK: [[VSUBHN2_I:%.*]] = trunc <8 x i16> [[VSUBHN1_I]] to <8 x i8>
-// CHECK: ret <8 x i8> [[VSUBHN2_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vsubhn_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSUBHN_I:%.*]] = sub <8 x i16> [[A]], [[B]]
+// CHECK-NEXT: [[VSUBHN1_I:%.*]] = lshr <8 x i16> [[VSUBHN_I]], splat (i16 8)
+// CHECK-NEXT: [[VSUBHN2_I:%.*]] = trunc nuw <8 x i16> [[VSUBHN1_I]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[VSUBHN2_I]]
+//
uint8x8_t test_vsubhn_u16(uint16x8_t a, uint16x8_t b) {
return vsubhn_u16(a, b);
}
-// CHECK-LABEL: @test_vsubhn_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VSUBHN_I:%.*]] = sub <4 x i32> %a, %b
-// CHECK: [[VSUBHN1_I:%.*]] = lshr <4 x i32> [[VSUBHN_I]], splat (i32 16)
-// CHECK: [[VSUBHN2_I:%.*]] = trunc <4 x i32> [[VSUBHN1_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[VSUBHN2_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vsubhn_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSUBHN_I:%.*]] = sub <4 x i32> [[A]], [[B]]
+// CHECK-NEXT: [[VSUBHN1_I:%.*]] = lshr <4 x i32> [[VSUBHN_I]], splat (i32 16)
+// CHECK-NEXT: [[VSUBHN2_I:%.*]] = trunc nuw <4 x i32> [[VSUBHN1_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[VSUBHN2_I]]
+//
uint16x4_t test_vsubhn_u32(uint32x4_t a, uint32x4_t b) {
return vsubhn_u32(a, b);
}
-// CHECK-LABEL: @test_vsubhn_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VSUBHN_I:%.*]] = sub <2 x i64> %a, %b
-// CHECK: [[VSUBHN1_I:%.*]] = lshr <2 x i64> [[VSUBHN_I]], splat (i64 32)
-// CHECK: [[VSUBHN2_I:%.*]] = trunc <2 x i64> [[VSUBHN1_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[VSUBHN2_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vsubhn_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSUBHN_I:%.*]] = sub <2 x i64> [[A]], [[B]]
+// CHECK-NEXT: [[VSUBHN1_I:%.*]] = lshr <2 x i64> [[VSUBHN_I]], splat (i64 32)
+// CHECK-NEXT: [[VSUBHN2_I:%.*]] = trunc nuw <2 x i64> [[VSUBHN1_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[VSUBHN2_I]]
+//
uint32x2_t test_vsubhn_u64(uint64x2_t a, uint64x2_t b) {
return vsubhn_u64(a, b);
}
-// CHECK-LABEL: @test_vsubhn_high_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VSUBHN_I_I:%.*]] = sub <8 x i16> %a, %b
-// CHECK: [[VSUBHN1_I_I:%.*]] = lshr <8 x i16> [[VSUBHN_I_I]], splat (i16 8)
-// CHECK: [[VSUBHN2_I_I:%.*]] = trunc <8 x i16> [[VSUBHN1_I_I]] to <8 x i8>
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %r, <8 x i8> [[VSUBHN2_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK: ret <16 x i8> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vsubhn_high_s16(
+// CHECK-SAME: <8 x i8> noundef [[R:%.*]], <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSUBHN_I_I:%.*]] = sub <8 x i16> [[A]], [[B]]
+// CHECK-NEXT: [[VSUBHN1_I_I:%.*]] = lshr <8 x i16> [[VSUBHN_I_I]], splat (i16 8)
+// CHECK-NEXT: [[VSUBHN2_I_I:%.*]] = trunc nuw <8 x i16> [[VSUBHN1_I_I]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> [[R]], <8 x i8> [[VSUBHN2_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I_I]]
+//
int8x16_t test_vsubhn_high_s16(int8x8_t r, int16x8_t a, int16x8_t b) {
return vsubhn_high_s16(r, a, b);
}
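// vsubhn_high_s16 performs the same narrowing and then concatenates: r fills
// lanes 0..7 and the narrowed difference fills lanes 8..15, which is exactly
// the 16-lane shufflevector in the checks above. Array-based sketch
// (illustrative only, <stdint.h> types as before):
static inline void subhn_high_s16_model(int8_t out[16], const int8_t r[8],
                                        const int16_t a[8], const int16_t b[8]) {
  for (int i = 0; i < 8; ++i) {
    out[i] = r[i];                                                 // low half: r
    out[i + 8] = (int8_t)(((uint16_t)a[i] - (uint16_t)b[i]) >> 8); // high half
  }
}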
-// CHECK-LABEL: @test_vsubhn_high_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VSUBHN_I_I:%.*]] = sub <4 x i32> %a, %b
-// CHECK: [[VSUBHN1_I_I:%.*]] = lshr <4 x i32> [[VSUBHN_I_I]], splat (i32 16)
-// CHECK: [[VSUBHN2_I_I:%.*]] = trunc <4 x i32> [[VSUBHN1_I_I]] to <4 x i16>
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %r, <4 x i16> [[VSUBHN2_I_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-// CHECK: ret <8 x i16> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vsubhn_high_s32(
+// CHECK-SAME: <4 x i16> noundef [[R:%.*]], <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSUBHN_I_I:%.*]] = sub <4 x i32> [[A]], [[B]]
+// CHECK-NEXT: [[VSUBHN1_I_I:%.*]] = lshr <4 x i32> [[VSUBHN_I_I]], splat (i32 16)
+// CHECK-NEXT: [[VSUBHN2_I_I:%.*]] = trunc nuw <4 x i32> [[VSUBHN1_I_I]] to <4 x i16>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> [[R]], <4 x i16> [[VSUBHN2_I_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I_I]]
+//
int16x8_t test_vsubhn_high_s32(int16x4_t r, int32x4_t a, int32x4_t b) {
return vsubhn_high_s32(r, a, b);
}
-// CHECK-LABEL: @test_vsubhn_high_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VSUBHN_I_I:%.*]] = sub <2 x i64> %a, %b
-// CHECK: [[VSUBHN1_I_I:%.*]] = lshr <2 x i64> [[VSUBHN_I_I]], splat (i64 32)
-// CHECK: [[VSUBHN2_I_I:%.*]] = trunc <2 x i64> [[VSUBHN1_I_I]] to <2 x i32>
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %r, <2 x i32> [[VSUBHN2_I_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK: ret <4 x i32> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vsubhn_high_s64(
+// CHECK-SAME: <2 x i32> noundef [[R:%.*]], <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSUBHN_I_I:%.*]] = sub <2 x i64> [[A]], [[B]]
+// CHECK-NEXT: [[VSUBHN1_I_I:%.*]] = lshr <2 x i64> [[VSUBHN_I_I]], splat (i64 32)
+// CHECK-NEXT: [[VSUBHN2_I_I:%.*]] = trunc nuw <2 x i64> [[VSUBHN1_I_I]] to <2 x i32>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> [[R]], <2 x i32> [[VSUBHN2_I_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I_I]]
+//
int32x4_t test_vsubhn_high_s64(int32x2_t r, int64x2_t a, int64x2_t b) {
return vsubhn_high_s64(r, a, b);
}
-// CHECK-LABEL: @test_vsubhn_high_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VSUBHN_I_I:%.*]] = sub <8 x i16> %a, %b
-// CHECK: [[VSUBHN1_I_I:%.*]] = lshr <8 x i16> [[VSUBHN_I_I]], splat (i16 8)
-// CHECK: [[VSUBHN2_I_I:%.*]] = trunc <8 x i16> [[VSUBHN1_I_I]] to <8 x i8>
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %r, <8 x i8> [[VSUBHN2_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK: ret <16 x i8> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vsubhn_high_u16(
+// CHECK-SAME: <8 x i8> noundef [[R:%.*]], <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSUBHN_I_I:%.*]] = sub <8 x i16> [[A]], [[B]]
+// CHECK-NEXT: [[VSUBHN1_I_I:%.*]] = lshr <8 x i16> [[VSUBHN_I_I]], splat (i16 8)
+// CHECK-NEXT: [[VSUBHN2_I_I:%.*]] = trunc nuw <8 x i16> [[VSUBHN1_I_I]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> [[R]], <8 x i8> [[VSUBHN2_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I_I]]
+//
uint8x16_t test_vsubhn_high_u16(uint8x8_t r, uint16x8_t a, uint16x8_t b) {
return vsubhn_high_u16(r, a, b);
}
-// CHECK-LABEL: @test_vsubhn_high_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VSUBHN_I_I:%.*]] = sub <4 x i32> %a, %b
-// CHECK: [[VSUBHN1_I_I:%.*]] = lshr <4 x i32> [[VSUBHN_I_I]], splat (i32 16)
-// CHECK: [[VSUBHN2_I_I:%.*]] = trunc <4 x i32> [[VSUBHN1_I_I]] to <4 x i16>
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %r, <4 x i16> [[VSUBHN2_I_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-// CHECK: ret <8 x i16> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vsubhn_high_u32(
+// CHECK-SAME: <4 x i16> noundef [[R:%.*]], <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSUBHN_I_I:%.*]] = sub <4 x i32> [[A]], [[B]]
+// CHECK-NEXT: [[VSUBHN1_I_I:%.*]] = lshr <4 x i32> [[VSUBHN_I_I]], splat (i32 16)
+// CHECK-NEXT: [[VSUBHN2_I_I:%.*]] = trunc nuw <4 x i32> [[VSUBHN1_I_I]] to <4 x i16>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> [[R]], <4 x i16> [[VSUBHN2_I_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I_I]]
+//
uint16x8_t test_vsubhn_high_u32(uint16x4_t r, uint32x4_t a, uint32x4_t b) {
return vsubhn_high_u32(r, a, b);
}
-// CHECK-LABEL: @test_vsubhn_high_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VSUBHN_I_I:%.*]] = sub <2 x i64> %a, %b
-// CHECK: [[VSUBHN1_I_I:%.*]] = lshr <2 x i64> [[VSUBHN_I_I]], splat (i64 32)
-// CHECK: [[VSUBHN2_I_I:%.*]] = trunc <2 x i64> [[VSUBHN1_I_I]] to <2 x i32>
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %r, <2 x i32> [[VSUBHN2_I_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK: ret <4 x i32> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vsubhn_high_u64(
+// CHECK-SAME: <2 x i32> noundef [[R:%.*]], <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSUBHN_I_I:%.*]] = sub <2 x i64> [[A]], [[B]]
+// CHECK-NEXT: [[VSUBHN1_I_I:%.*]] = lshr <2 x i64> [[VSUBHN_I_I]], splat (i64 32)
+// CHECK-NEXT: [[VSUBHN2_I_I:%.*]] = trunc nuw <2 x i64> [[VSUBHN1_I_I]] to <2 x i32>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> [[R]], <2 x i32> [[VSUBHN2_I_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I_I]]
+//
uint32x4_t test_vsubhn_high_u64(uint32x2_t r, uint64x2_t a, uint64x2_t b) {
return vsubhn_high_u64(r, a, b);
}
-// CHECK-LABEL: @test_vrsubhn_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VRSUBHN_V2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> %a, <8 x i16> %b)
-// CHECK: ret <8 x i8> [[VRSUBHN_V2_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vrsubhn_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSUBHN_V2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VRSUBHN_V2_I]]
+//
int8x8_t test_vrsubhn_s16(int16x8_t a, int16x8_t b) {
return vrsubhn_s16(a, b);
}
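// The rounding variant adds half of the discarded range before narrowing,
// so Clang keeps it as a single @llvm.aarch64.neon.rsubhn call instead of
// open-coding sub/lshr/trunc. One-lane sketch of the semantics (assuming
// the usual round-to-nearest-up definition):
static inline int8_t rsubhn_lane_s16(int16_t a, int16_t b) {
  uint16_t d = (uint16_t)a - (uint16_t)b;
  return (int8_t)((uint16_t)(d + 0x80) >> 8);  // +(1 << 7), then keep high half
}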
-// CHECK-LABEL: @test_vrsubhn_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VRSUBHN_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VRSUBHN_V3_I:%.*]] = bitcast <4 x i16> [[VRSUBHN_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VRSUBHN_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vrsubhn_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSUBHN_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: ret <4 x i16> [[VRSUBHN_V2_I]]
+//
int16x4_t test_vrsubhn_s32(int32x4_t a, int32x4_t b) {
return vrsubhn_s32(a, b);
}
-// CHECK-LABEL: @test_vrsubhn_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VRSUBHN_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> %a, <2 x i64> %b)
-// CHECK: [[VRSUBHN_V3_I:%.*]] = bitcast <2 x i32> [[VRSUBHN_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VRSUBHN_V2_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vrsubhn_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSUBHN_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> [[A]], <2 x i64> [[B]])
+// CHECK-NEXT: ret <2 x i32> [[VRSUBHN_V2_I]]
+//
int32x2_t test_vrsubhn_s64(int64x2_t a, int64x2_t b) {
return vrsubhn_s64(a, b);
}
-// CHECK-LABEL: @test_vrsubhn_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VRSUBHN_V2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> %a, <8 x i16> %b)
-// CHECK: ret <8 x i8> [[VRSUBHN_V2_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vrsubhn_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSUBHN_V2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VRSUBHN_V2_I]]
+//
uint8x8_t test_vrsubhn_u16(uint16x8_t a, uint16x8_t b) {
return vrsubhn_u16(a, b);
}
-// CHECK-LABEL: @test_vrsubhn_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VRSUBHN_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VRSUBHN_V3_I:%.*]] = bitcast <4 x i16> [[VRSUBHN_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VRSUBHN_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vrsubhn_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSUBHN_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: ret <4 x i16> [[VRSUBHN_V2_I]]
+//
uint16x4_t test_vrsubhn_u32(uint32x4_t a, uint32x4_t b) {
return vrsubhn_u32(a, b);
}
-// CHECK-LABEL: @test_vrsubhn_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VRSUBHN_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> %a, <2 x i64> %b)
-// CHECK: [[VRSUBHN_V3_I:%.*]] = bitcast <2 x i32> [[VRSUBHN_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VRSUBHN_V2_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vrsubhn_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSUBHN_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> [[A]], <2 x i64> [[B]])
+// CHECK-NEXT: ret <2 x i32> [[VRSUBHN_V2_I]]
+//
uint32x2_t test_vrsubhn_u64(uint64x2_t a, uint64x2_t b) {
return vrsubhn_u64(a, b);
}
-// CHECK-LABEL: @test_vrsubhn_high_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VRSUBHN_V2_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> %a, <8 x i16> %b)
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %r, <8 x i8> [[VRSUBHN_V2_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK: ret <16 x i8> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vrsubhn_high_s16(
+// CHECK-SAME: <8 x i8> noundef [[R:%.*]], <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSUBHN_V2_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> [[R]], <8 x i8> [[VRSUBHN_V2_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I_I]]
+//
int8x16_t test_vrsubhn_high_s16(int8x8_t r, int16x8_t a, int16x8_t b) {
return vrsubhn_high_s16(r, a, b);
}
-// CHECK-LABEL: @test_vrsubhn_high_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VRSUBHN_V2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VRSUBHN_V3_I_I:%.*]] = bitcast <4 x i16> [[VRSUBHN_V2_I_I]] to <8 x i8>
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %r, <4 x i16> [[VRSUBHN_V2_I_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-// CHECK: ret <8 x i16> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vrsubhn_high_s32(
+// CHECK-SAME: <4 x i16> noundef [[R:%.*]], <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSUBHN_V2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> [[R]], <4 x i16> [[VRSUBHN_V2_I_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I_I]]
+//
int16x8_t test_vrsubhn_high_s32(int16x4_t r, int32x4_t a, int32x4_t b) {
return vrsubhn_high_s32(r, a, b);
}
-// CHECK-LABEL: @test_vrsubhn_high_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VRSUBHN_V2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> %a, <2 x i64> %b)
-// CHECK: [[VRSUBHN_V3_I_I:%.*]] = bitcast <2 x i32> [[VRSUBHN_V2_I_I]] to <8 x i8>
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %r, <2 x i32> [[VRSUBHN_V2_I_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK: ret <4 x i32> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vrsubhn_high_s64(
+// CHECK-SAME: <2 x i32> noundef [[R:%.*]], <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSUBHN_V2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> [[A]], <2 x i64> [[B]])
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> [[R]], <2 x i32> [[VRSUBHN_V2_I_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I_I]]
+//
int32x4_t test_vrsubhn_high_s64(int32x2_t r, int64x2_t a, int64x2_t b) {
return vrsubhn_high_s64(r, a, b);
}
-// CHECK-LABEL: @test_vrsubhn_high_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VRSUBHN_V2_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> %a, <8 x i16> %b)
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %r, <8 x i8> [[VRSUBHN_V2_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK: ret <16 x i8> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vrsubhn_high_u16(
+// CHECK-SAME: <8 x i8> noundef [[R:%.*]], <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSUBHN_V2_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> [[R]], <8 x i8> [[VRSUBHN_V2_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I_I]]
+//
uint8x16_t test_vrsubhn_high_u16(uint8x8_t r, uint16x8_t a, uint16x8_t b) {
return vrsubhn_high_u16(r, a, b);
}
-// CHECK-LABEL: @test_vrsubhn_high_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VRSUBHN_V2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VRSUBHN_V3_I_I:%.*]] = bitcast <4 x i16> [[VRSUBHN_V2_I_I]] to <8 x i8>
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %r, <4 x i16> [[VRSUBHN_V2_I_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-// CHECK: ret <8 x i16> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vrsubhn_high_u32(
+// CHECK-SAME: <4 x i16> noundef [[R:%.*]], <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSUBHN_V2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> [[R]], <4 x i16> [[VRSUBHN_V2_I_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I_I]]
+//
uint16x8_t test_vrsubhn_high_u32(uint16x4_t r, uint32x4_t a, uint32x4_t b) {
return vrsubhn_high_u32(r, a, b);
}
-// CHECK-LABEL: @test_vrsubhn_high_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VRSUBHN_V2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> %a, <2 x i64> %b)
-// CHECK: [[VRSUBHN_V3_I_I:%.*]] = bitcast <2 x i32> [[VRSUBHN_V2_I_I]] to <8 x i8>
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %r, <2 x i32> [[VRSUBHN_V2_I_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK: ret <4 x i32> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vrsubhn_high_u64(
+// CHECK-SAME: <2 x i32> noundef [[R:%.*]], <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSUBHN_V2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> [[A]], <2 x i64> [[B]])
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> [[R]], <2 x i32> [[VRSUBHN_V2_I_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I_I]]
+//
uint32x4_t test_vrsubhn_high_u64(uint32x2_t r, uint64x2_t a, uint64x2_t b) {
return vrsubhn_high_u64(r, a, b);
}
-// CHECK-LABEL: @test_vabdl_s8(
-// CHECK: [[VABD_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: [[VMOVL_I_I:%.*]] = zext <8 x i8> [[VABD_I_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[VMOVL_I_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vabdl_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = zext <8 x i8> [[VABD_I_I]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[VMOVL_I_I]]
+//
int16x8_t test_vabdl_s8(int8x8_t a, int8x8_t b) {
return vabdl_s8(a, b);
}
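// vabdl widens an absolute difference. |a - b| of two i8 values always fits
// in 8 unsigned bits, which is why the IR zero-extends even for the signed
// sabd intrinsic. One-lane C sketch (illustrative only):
static inline int16_t abdl_lane_s8(int8_t a, int8_t b) {
  int16_t d = (int16_t)a - (int16_t)b;  // exact in 16 bits
  return d < 0 ? (int16_t)-d : d;       // 0..255, matching the zext
}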
-// CHECK-LABEL: @test_vabdl_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VABD2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I]] to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> [[VABD2_I_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[VMOVL_I_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vabdl_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = zext <4 x i16> [[VABD2_I_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[VMOVL_I_I]]
+//
int32x4_t test_vabdl_s16(int16x4_t a, int16x4_t b) {
return vabdl_s16(a, b);
}
-// CHECK-LABEL: @test_vabdl_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VABD2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I]] to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> [[VABD2_I_I]] to <2 x i64>
-// CHECK: ret <2 x i64> [[VMOVL_I_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vabdl_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = zext <2 x i32> [[VABD2_I_I]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[VMOVL_I_I]]
+//
int64x2_t test_vabdl_s32(int32x2_t a, int32x2_t b) {
return vabdl_s32(a, b);
}
-// CHECK-LABEL: @test_vabdl_u8(
-// CHECK: [[VABD_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: [[VMOVL_I_I:%.*]] = zext <8 x i8> [[VABD_I_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[VMOVL_I_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vabdl_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = zext <8 x i8> [[VABD_I_I]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[VMOVL_I_I]]
+//
uint16x8_t test_vabdl_u8(uint8x8_t a, uint8x8_t b) {
return vabdl_u8(a, b);
}
-// CHECK-LABEL: @test_vabdl_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VABD2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I]] to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> [[VABD2_I_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[VMOVL_I_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vabdl_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = zext <4 x i16> [[VABD2_I_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[VMOVL_I_I]]
+//
uint32x4_t test_vabdl_u16(uint16x4_t a, uint16x4_t b) {
return vabdl_u16(a, b);
}
-// CHECK-LABEL: @test_vabdl_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VABD2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I]] to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> [[VABD2_I_I]] to <2 x i64>
-// CHECK: ret <2 x i64> [[VMOVL_I_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vabdl_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = zext <2 x i32> [[VABD2_I_I]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[VMOVL_I_I]]
+//
uint64x2_t test_vabdl_u32(uint32x2_t a, uint32x2_t b) {
return vabdl_u32(a, b);
}
-// CHECK-LABEL: @test_vabal_s8(
-// CHECK: [[VABD_I_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %b, <8 x i8> %c)
-// CHECK: [[VMOVL_I_I_I:%.*]] = zext <8 x i8> [[VABD_I_I_I]] to <8 x i16>
-// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I_I]]
-// CHECK: ret <8 x i16> [[ADD_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vabal_s8(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD_I_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> [[B]], <8 x i8> [[C]])
+// CHECK-NEXT: [[VMOVL_I_I_I:%.*]] = zext <8 x i8> [[VABD_I_I_I]] to <8 x i16>
+// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[A]], [[VMOVL_I_I_I]]
+// CHECK-NEXT: ret <8 x i16> [[ADD_I]]
+//
int16x8_t test_vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c) {
return vabal_s8(a, b, c);
}
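// vabal is vabdl plus an accumulate, so the checks reuse the abd/zext
// pattern and append a plain `add`. One-lane sketch (illustrative only):
static inline int16_t abal_lane_s8(int16_t acc, int8_t b, int8_t c) {
  int16_t d = (int16_t)b - (int16_t)c;
  return (int16_t)((uint16_t)acc + (uint16_t)(d < 0 ? -d : d));  // wrapping add
}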
-// CHECK-LABEL: @test_vabal_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
-// CHECK: [[VABD2_I_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %b, <4 x i16> %c)
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I_I]] to <8 x i8>
-// CHECK: [[VMOVL_I_I_I:%.*]] = zext <4 x i16> [[VABD2_I_I_I]] to <4 x i32>
-// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I_I]]
-// CHECK: ret <4 x i32> [[ADD_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vabal_s16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD2_I_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> [[B]], <4 x i16> [[C]])
+// CHECK-NEXT: [[VMOVL_I_I_I:%.*]] = zext <4 x i16> [[VABD2_I_I_I]] to <4 x i32>
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A]], [[VMOVL_I_I_I]]
+// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
+//
int32x4_t test_vabal_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
return vabal_s16(a, b, c);
}
-// CHECK-LABEL: @test_vabal_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
-// CHECK: [[VABD2_I_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %b, <2 x i32> %c)
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I_I]] to <8 x i8>
-// CHECK: [[VMOVL_I_I_I:%.*]] = zext <2 x i32> [[VABD2_I_I_I]] to <2 x i64>
-// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I_I]]
-// CHECK: ret <2 x i64> [[ADD_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vabal_s32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD2_I_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> [[B]], <2 x i32> [[C]])
+// CHECK-NEXT: [[VMOVL_I_I_I:%.*]] = zext <2 x i32> [[VABD2_I_I_I]] to <2 x i64>
+// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A]], [[VMOVL_I_I_I]]
+// CHECK-NEXT: ret <2 x i64> [[ADD_I]]
+//
int64x2_t test_vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
return vabal_s32(a, b, c);
}
-// CHECK-LABEL: @test_vabal_u8(
-// CHECK: [[VABD_I_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %b, <8 x i8> %c)
-// CHECK: [[VMOVL_I_I_I:%.*]] = zext <8 x i8> [[VABD_I_I_I]] to <8 x i16>
-// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I_I]]
-// CHECK: ret <8 x i16> [[ADD_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vabal_u8(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD_I_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> [[B]], <8 x i8> [[C]])
+// CHECK-NEXT: [[VMOVL_I_I_I:%.*]] = zext <8 x i8> [[VABD_I_I_I]] to <8 x i16>
+// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[A]], [[VMOVL_I_I_I]]
+// CHECK-NEXT: ret <8 x i16> [[ADD_I]]
+//
uint16x8_t test_vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) {
return vabal_u8(a, b, c);
}
-// CHECK-LABEL: @test_vabal_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
-// CHECK: [[VABD2_I_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %b, <4 x i16> %c)
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I_I]] to <8 x i8>
-// CHECK: [[VMOVL_I_I_I:%.*]] = zext <4 x i16> [[VABD2_I_I_I]] to <4 x i32>
-// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I_I]]
-// CHECK: ret <4 x i32> [[ADD_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vabal_u16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD2_I_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> [[B]], <4 x i16> [[C]])
+// CHECK-NEXT: [[VMOVL_I_I_I:%.*]] = zext <4 x i16> [[VABD2_I_I_I]] to <4 x i32>
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A]], [[VMOVL_I_I_I]]
+// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
+//
uint32x4_t test_vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) {
return vabal_u16(a, b, c);
}
-// CHECK-LABEL: @test_vabal_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
-// CHECK: [[VABD2_I_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %b, <2 x i32> %c)
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I_I]] to <8 x i8>
-// CHECK: [[VMOVL_I_I_I:%.*]] = zext <2 x i32> [[VABD2_I_I_I]] to <2 x i64>
-// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I_I]]
-// CHECK: ret <2 x i64> [[ADD_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vabal_u32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD2_I_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> [[B]], <2 x i32> [[C]])
+// CHECK-NEXT: [[VMOVL_I_I_I:%.*]] = zext <2 x i32> [[VABD2_I_I_I]] to <2 x i64>
+// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A]], [[VMOVL_I_I_I]]
+// CHECK-NEXT: ret <2 x i64> [[ADD_I]]
+//
uint64x2_t test_vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) {
return vabal_u32(a, b, c);
}
-// CHECK-LABEL: @test_vabdl_high_s8(
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK: [[VABD_I_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]])
-// CHECK: [[VMOVL_I_I_I:%.*]] = zext <8 x i8> [[VABD_I_I_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[VMOVL_I_I_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vabdl_high_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT: [[VABD_I_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> [[SHUFFLE_I5_I]], <8 x i8> [[SHUFFLE_I_I]])
+// CHECK-NEXT: [[VMOVL_I_I_I:%.*]] = zext <8 x i8> [[VABD_I_I_I]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[VMOVL_I_I_I]]
+//
int16x8_t test_vabdl_high_s8(int8x16_t a, int8x16_t b) {
return vabdl_high_s8(a, b);
}
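// The _high forms first extract lanes 8..15 of each operand; note that the
// regenerated checks use `poison` as the unused shufflevector operand where
// the old ones repeated the input vector. Equivalent lane extraction in C
// (sketch only):
static inline void high_half_s8x16(int8_t hi[8], const int8_t v[16]) {
  for (int i = 0; i < 8; ++i)
    hi[i] = v[i + 8];  // lanes 8..15 feed the 8x8 abd above
}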
-// CHECK-LABEL: @test_vabdl_high_s16(
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8>
-// CHECK: [[VABD2_I_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[SHUFFLE_I7_I]])
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I_I]] to <8 x i8>
-// CHECK: [[VMOVL_I_I_I:%.*]] = zext <4 x i16> [[VABD2_I_I_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[VMOVL_I_I_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vabdl_high_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[VABD2_I_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> [[SHUFFLE_I5_I]], <4 x i16> [[SHUFFLE_I_I]])
+// CHECK-NEXT: [[VMOVL_I_I_I:%.*]] = zext <4 x i16> [[VABD2_I_I_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[VMOVL_I_I_I]]
+//
int32x4_t test_vabdl_high_s16(int16x8_t a, int16x8_t b) {
return vabdl_high_s16(a, b);
}
-// CHECK-LABEL: @test_vabdl_high_s32(
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
-// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8>
-// CHECK: [[VABD2_I_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[SHUFFLE_I7_I]])
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I_I]] to <8 x i8>
-// CHECK: [[VMOVL_I_I_I:%.*]] = zext <2 x i32> [[VABD2_I_I_I]] to <2 x i64>
-// CHECK: ret <2 x i64> [[VMOVL_I_I_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vabdl_high_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[VABD2_I_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> [[SHUFFLE_I5_I]], <2 x i32> [[SHUFFLE_I_I]])
+// CHECK-NEXT: [[VMOVL_I_I_I:%.*]] = zext <2 x i32> [[VABD2_I_I_I]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[VMOVL_I_I_I]]
+//
int64x2_t test_vabdl_high_s32(int32x4_t a, int32x4_t b) {
return vabdl_high_s32(a, b);
}
-// CHECK-LABEL: @test_vabdl_high_u8(
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK: [[VABD_I_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]])
-// CHECK: [[VMOVL_I_I_I:%.*]] = zext <8 x i8> [[VABD_I_I_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[VMOVL_I_I_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vabdl_high_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT: [[VABD_I_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> [[SHUFFLE_I5_I]], <8 x i8> [[SHUFFLE_I_I]])
+// CHECK-NEXT: [[VMOVL_I_I_I:%.*]] = zext <8 x i8> [[VABD_I_I_I]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[VMOVL_I_I_I]]
+//
uint16x8_t test_vabdl_high_u8(uint8x16_t a, uint8x16_t b) {
return vabdl_high_u8(a, b);
}
-// CHECK-LABEL: @test_vabdl_high_u16(
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8>
-// CHECK: [[VABD2_I_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[SHUFFLE_I7_I]])
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I_I]] to <8 x i8>
-// CHECK: [[VMOVL_I_I_I:%.*]] = zext <4 x i16> [[VABD2_I_I_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[VMOVL_I_I_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vabdl_high_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[VABD2_I_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> [[SHUFFLE_I5_I]], <4 x i16> [[SHUFFLE_I_I]])
+// CHECK-NEXT: [[VMOVL_I_I_I:%.*]] = zext <4 x i16> [[VABD2_I_I_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[VMOVL_I_I_I]]
+//
uint32x4_t test_vabdl_high_u16(uint16x8_t a, uint16x8_t b) {
return vabdl_high_u16(a, b);
}
-// CHECK-LABEL: @test_vabdl_high_u32(
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
-// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8>
-// CHECK: [[VABD2_I_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[SHUFFLE_I7_I]])
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I_I]] to <8 x i8>
-// CHECK: [[VMOVL_I_I_I:%.*]] = zext <2 x i32> [[VABD2_I_I_I]] to <2 x i64>
-// CHECK: ret <2 x i64> [[VMOVL_I_I_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vabdl_high_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[VABD2_I_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> [[SHUFFLE_I5_I]], <2 x i32> [[SHUFFLE_I_I]])
+// CHECK-NEXT: [[VMOVL_I_I_I:%.*]] = zext <2 x i32> [[VABD2_I_I_I]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[VMOVL_I_I_I]]
+//
uint64x2_t test_vabdl_high_u32(uint32x4_t a, uint32x4_t b) {
return vabdl_high_u32(a, b);
}
-// CHECK-LABEL: @test_vabal_high_s8(
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %c, <16 x i8> %c, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK: [[VABD_I_I_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]])
-// CHECK: [[VMOVL_I_I_I_I:%.*]] = zext <8 x i8> [[VABD_I_I_I_I]] to <8 x i16>
-// CHECK: [[ADD_I_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I_I_I]]
-// CHECK: ret <8 x i16> [[ADD_I_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vabal_high_s8(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]], <16 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> [[C]], <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT: [[VABD_I_I_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> [[SHUFFLE_I5_I]], <8 x i8> [[SHUFFLE_I_I]])
+// CHECK-NEXT: [[VMOVL_I_I_I_I:%.*]] = zext <8 x i8> [[VABD_I_I_I_I]] to <8 x i16>
+// CHECK-NEXT: [[ADD_I_I:%.*]] = add <8 x i16> [[A]], [[VMOVL_I_I_I_I]]
+// CHECK-NEXT: ret <8 x i16> [[ADD_I_I]]
+//
int16x8_t test_vabal_high_s8(int16x8_t a, int8x16_t b, int8x16_t c) {
return vabal_high_s8(a, b, c);
}
-// CHECK-LABEL: @test_vabal_high_s16(
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %c, <8 x i16> %c, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8>
-// CHECK: [[VABD2_I_I_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[SHUFFLE_I7_I]])
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I_I_I]] to <8 x i8>
-// CHECK: [[VMOVL_I_I_I_I:%.*]] = zext <4 x i16> [[VABD2_I_I_I_I]] to <4 x i32>
-// CHECK: [[ADD_I_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I_I_I]]
-// CHECK: ret <4 x i32> [[ADD_I_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vabal_high_s16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <8 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[C]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[VABD2_I_I_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> [[SHUFFLE_I5_I]], <4 x i16> [[SHUFFLE_I_I]])
+// CHECK-NEXT: [[VMOVL_I_I_I_I:%.*]] = zext <4 x i16> [[VABD2_I_I_I_I]] to <4 x i32>
+// CHECK-NEXT: [[ADD_I_I:%.*]] = add <4 x i32> [[A]], [[VMOVL_I_I_I_I]]
+// CHECK-NEXT: ret <4 x i32> [[ADD_I_I]]
+//
int32x4_t test_vabal_high_s16(int32x4_t a, int16x8_t b, int16x8_t c) {
return vabal_high_s16(a, b, c);
}
-// CHECK-LABEL: @test_vabal_high_s32(
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
-// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %c, <4 x i32> %c, <2 x i32> <i32 2, i32 3>
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8>
-// CHECK: [[VABD2_I_I_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[SHUFFLE_I7_I]])
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I_I_I]] to <8 x i8>
-// CHECK: [[VMOVL_I_I_I_I:%.*]] = zext <2 x i32> [[VABD2_I_I_I_I]] to <2 x i64>
-// CHECK: [[ADD_I_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I_I_I]]
-// CHECK: ret <2 x i64> [[ADD_I_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vabal_high_s32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[C]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[VABD2_I_I_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> [[SHUFFLE_I5_I]], <2 x i32> [[SHUFFLE_I_I]])
+// CHECK-NEXT: [[VMOVL_I_I_I_I:%.*]] = zext <2 x i32> [[VABD2_I_I_I_I]] to <2 x i64>
+// CHECK-NEXT: [[ADD_I_I:%.*]] = add <2 x i64> [[A]], [[VMOVL_I_I_I_I]]
+// CHECK-NEXT: ret <2 x i64> [[ADD_I_I]]
+//
int64x2_t test_vabal_high_s32(int64x2_t a, int32x4_t b, int32x4_t c) {
return vabal_high_s32(a, b, c);
}
-// CHECK-LABEL: @test_vabal_high_u8(
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %c, <16 x i8> %c, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK: [[VABD_I_I_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]])
-// CHECK: [[VMOVL_I_I_I_I:%.*]] = zext <8 x i8> [[VABD_I_I_I_I]] to <8 x i16>
-// CHECK: [[ADD_I_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I_I_I]]
-// CHECK: ret <8 x i16> [[ADD_I_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vabal_high_u8(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]], <16 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> [[C]], <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT: [[VABD_I_I_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> [[SHUFFLE_I5_I]], <8 x i8> [[SHUFFLE_I_I]])
+// CHECK-NEXT: [[VMOVL_I_I_I_I:%.*]] = zext <8 x i8> [[VABD_I_I_I_I]] to <8 x i16>
+// CHECK-NEXT: [[ADD_I_I:%.*]] = add <8 x i16> [[A]], [[VMOVL_I_I_I_I]]
+// CHECK-NEXT: ret <8 x i16> [[ADD_I_I]]
+//
uint16x8_t test_vabal_high_u8(uint16x8_t a, uint8x16_t b, uint8x16_t c) {
return vabal_high_u8(a, b, c);
}
-// CHECK-LABEL: @test_vabal_high_u16(
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %c, <8 x i16> %c, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8>
-// CHECK: [[VABD2_I_I_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[SHUFFLE_I7_I]])
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I_I_I]] to <8 x i8>
-// CHECK: [[VMOVL_I_I_I_I:%.*]] = zext <4 x i16> [[VABD2_I_I_I_I]] to <4 x i32>
-// CHECK: [[ADD_I_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I_I_I]]
-// CHECK: ret <4 x i32> [[ADD_I_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vabal_high_u16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <8 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[C]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[VABD2_I_I_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> [[SHUFFLE_I5_I]], <4 x i16> [[SHUFFLE_I_I]])
+// CHECK-NEXT: [[VMOVL_I_I_I_I:%.*]] = zext <4 x i16> [[VABD2_I_I_I_I]] to <4 x i32>
+// CHECK-NEXT: [[ADD_I_I:%.*]] = add <4 x i32> [[A]], [[VMOVL_I_I_I_I]]
+// CHECK-NEXT: ret <4 x i32> [[ADD_I_I]]
+//
uint32x4_t test_vabal_high_u16(uint32x4_t a, uint16x8_t b, uint16x8_t c) {
return vabal_high_u16(a, b, c);
}
-// CHECK-LABEL: @test_vabal_high_u32(
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
-// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %c, <4 x i32> %c, <2 x i32> <i32 2, i32 3>
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8>
-// CHECK: [[VABD2_I_I_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[SHUFFLE_I7_I]])
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I_I_I]] to <8 x i8>
-// CHECK: [[VMOVL_I_I_I_I:%.*]] = zext <2 x i32> [[VABD2_I_I_I_I]] to <2 x i64>
-// CHECK: [[ADD_I_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I_I_I]]
-// CHECK: ret <2 x i64> [[ADD_I_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vabal_high_u32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[C]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[VABD2_I_I_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> [[SHUFFLE_I5_I]], <2 x i32> [[SHUFFLE_I_I]])
+// CHECK-NEXT: [[VMOVL_I_I_I_I:%.*]] = zext <2 x i32> [[VABD2_I_I_I_I]] to <2 x i64>
+// CHECK-NEXT: [[ADD_I_I:%.*]] = add <2 x i64> [[A]], [[VMOVL_I_I_I_I]]
+// CHECK-NEXT: ret <2 x i64> [[ADD_I_I]]
+//
uint64x2_t test_vabal_high_u32(uint64x2_t a, uint32x4_t b, uint32x4_t c) {
return vabal_high_u32(a, b, c);
}
-// CHECK-LABEL: @test_vmull_s8(
-// CHECK: [[VMULL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i16> [[VMULL_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vmull_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMULL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i16> [[VMULL_I]]
+//
int16x8_t test_vmull_s8(int8x8_t a, int8x8_t b) {
return vmull_s8(a, b);
}
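// vmull is a plain widening multiply: every i8*i8 product fits in i16
// (the extreme, -128 * -128 = 16384, still fits), so a single smull/umull
// intrinsic suffices. One-lane sketch (illustrative only):
static inline int16_t mull_lane_s8(int8_t a, int8_t b) {
  return (int16_t)((int16_t)a * (int16_t)b);  // exact widening product
}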
-// CHECK-LABEL: @test_vmull_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %b)
-// CHECK: ret <4 x i32> [[VMULL2_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vmull_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
+//
int32x4_t test_vmull_s16(int16x4_t a, int16x4_t b) {
return vmull_s16(a, b);
}
-// CHECK-LABEL: @test_vmull_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %b)
-// CHECK: ret <2 x i64> [[VMULL2_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vmull_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
+//
int64x2_t test_vmull_s32(int32x2_t a, int32x2_t b) {
return vmull_s32(a, b);
}
-// CHECK-LABEL: @test_vmull_u8(
-// CHECK: [[VMULL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i16> [[VMULL_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vmull_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMULL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i16> [[VMULL_I]]
+//
uint16x8_t test_vmull_u8(uint8x8_t a, uint8x8_t b) {
return vmull_u8(a, b);
}
-// CHECK-LABEL: @test_vmull_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %b)
-// CHECK: ret <4 x i32> [[VMULL2_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vmull_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
+//
uint32x4_t test_vmull_u16(uint16x4_t a, uint16x4_t b) {
return vmull_u16(a, b);
}
-// CHECK-LABEL: @test_vmull_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %b)
-// CHECK: ret <2 x i64> [[VMULL2_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vmull_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
+//
uint64x2_t test_vmull_u32(uint32x2_t a, uint32x2_t b) {
return vmull_u32(a, b);
}
-// CHECK-LABEL: @test_vmull_high_s8(
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]])
-// CHECK: ret <8 x i16> [[VMULL_I_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vmull_high_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> [[SHUFFLE_I5_I]], <8 x i8> [[SHUFFLE_I_I]])
+// CHECK-NEXT: ret <8 x i16> [[VMULL_I_I]]
+//
int16x8_t test_vmull_high_s8(int8x16_t a, int8x16_t b) {
return vmull_high_s8(a, b);
}
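// Note: the _high variants first extract the upper half of each 128-bit input
// (lanes 8..15 here) with a shufflevector whose unused second operand is now
// poison rather than the input itself, then apply the same widening multiply.
// Roughly: vmull_high_s8(a, b) == vmull_s8(vget_high_s8(a), vget_high_s8(b)).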
-// CHECK-LABEL: @test_vmull_high_s16(
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8>
-// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[SHUFFLE_I7_I]])
-// CHECK: ret <4 x i32> [[VMULL2_I_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vmull_high_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I5_I]], <4 x i16> [[SHUFFLE_I_I]])
+// CHECK-NEXT: ret <4 x i32> [[VMULL2_I_I]]
+//
int32x4_t test_vmull_high_s16(int16x8_t a, int16x8_t b) {
return vmull_high_s16(a, b);
}
-// CHECK-LABEL: @test_vmull_high_s32(
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
-// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8>
-// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[SHUFFLE_I7_I]])
-// CHECK: ret <2 x i64> [[VMULL2_I_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vmull_high_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I5_I]], <2 x i32> [[SHUFFLE_I_I]])
+// CHECK-NEXT: ret <2 x i64> [[VMULL2_I_I]]
+//
int64x2_t test_vmull_high_s32(int32x4_t a, int32x4_t b) {
return vmull_high_s32(a, b);
}
-// CHECK-LABEL: @test_vmull_high_u8(
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]])
-// CHECK: ret <8 x i16> [[VMULL_I_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vmull_high_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> [[SHUFFLE_I5_I]], <8 x i8> [[SHUFFLE_I_I]])
+// CHECK-NEXT: ret <8 x i16> [[VMULL_I_I]]
+//
uint16x8_t test_vmull_high_u8(uint8x16_t a, uint8x16_t b) {
return vmull_high_u8(a, b);
}
-// CHECK-LABEL: @test_vmull_high_u16(
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8>
-// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[SHUFFLE_I7_I]])
-// CHECK: ret <4 x i32> [[VMULL2_I_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vmull_high_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I5_I]], <4 x i16> [[SHUFFLE_I_I]])
+// CHECK-NEXT: ret <4 x i32> [[VMULL2_I_I]]
+//
uint32x4_t test_vmull_high_u16(uint16x8_t a, uint16x8_t b) {
return vmull_high_u16(a, b);
}
-// CHECK-LABEL: @test_vmull_high_u32(
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
-// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8>
-// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[SHUFFLE_I7_I]])
-// CHECK: ret <2 x i64> [[VMULL2_I_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vmull_high_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I5_I]], <2 x i32> [[SHUFFLE_I_I]])
+// CHECK-NEXT: ret <2 x i64> [[VMULL2_I_I]]
+//
uint64x2_t test_vmull_high_u32(uint32x4_t a, uint32x4_t b) {
return vmull_high_u32(a, b);
}
-// CHECK-LABEL: @test_vmlal_s8(
-// CHECK: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %b, <8 x i8> %c)
-// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VMULL_I_I]]
-// CHECK: ret <8 x i16> [[ADD_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vmlal_s8(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> [[B]], <8 x i8> [[C]])
+// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[A]], [[VMULL_I_I]]
+// CHECK-NEXT: ret <8 x i16> [[ADD_I]]
+//
int16x8_t test_vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c) {
return vmlal_s8(a, b, c);
}
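// Note: vmlal_* is the widening multiply-accumulate, a + vmull(b, c); the
// checks show the smull/umull intrinsic followed by a plain vector add.
// Illustrative equivalence: vmlal_s8(a, b, c) == vaddq_s16(a, vmull_s8(b, c)).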
-// CHECK-LABEL: @test_vmlal_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
-// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %c)
-// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]]
-// CHECK: ret <4 x i32> [[ADD_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vmlal_s16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[C]])
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A]], [[VMULL2_I_I]]
+// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
+//
int32x4_t test_vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
return vmlal_s16(a, b, c);
}
-// CHECK-LABEL: @test_vmlal_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
-// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %c)
-// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]]
-// CHECK: ret <2 x i64> [[ADD_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vmlal_s32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[C]])
+// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A]], [[VMULL2_I_I]]
+// CHECK-NEXT: ret <2 x i64> [[ADD_I]]
+//
int64x2_t test_vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
return vmlal_s32(a, b, c);
}
-// CHECK-LABEL: @test_vmlal_u8(
-// CHECK: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %b, <8 x i8> %c)
-// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VMULL_I_I]]
-// CHECK: ret <8 x i16> [[ADD_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vmlal_u8(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> [[B]], <8 x i8> [[C]])
+// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[A]], [[VMULL_I_I]]
+// CHECK-NEXT: ret <8 x i16> [[ADD_I]]
+//
uint16x8_t test_vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) {
return vmlal_u8(a, b, c);
}
-// CHECK-LABEL: @test_vmlal_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
-// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %c)
-// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]]
-// CHECK: ret <4 x i32> [[ADD_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vmlal_u16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[C]])
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A]], [[VMULL2_I_I]]
+// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
+//
uint32x4_t test_vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) {
return vmlal_u16(a, b, c);
}
-// CHECK-LABEL: @test_vmlal_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
-// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %c)
-// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]]
-// CHECK: ret <2 x i64> [[ADD_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vmlal_u32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[C]])
+// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A]], [[VMULL2_I_I]]
+// CHECK-NEXT: ret <2 x i64> [[ADD_I]]
+//
uint64x2_t test_vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) {
return vmlal_u32(a, b, c);
}
-// CHECK-LABEL: @test_vmlal_high_s8(
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %c, <16 x i8> %c, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK: [[VMULL_I_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]])
-// CHECK: [[ADD_I_I:%.*]] = add <8 x i16> %a, [[VMULL_I_I_I]]
-// CHECK: ret <8 x i16> [[ADD_I_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vmlal_high_s8(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]], <16 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> [[C]], <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT: [[VMULL_I_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> [[SHUFFLE_I5_I]], <8 x i8> [[SHUFFLE_I_I]])
+// CHECK-NEXT: [[ADD_I_I:%.*]] = add <8 x i16> [[A]], [[VMULL_I_I_I]]
+// CHECK-NEXT: ret <8 x i16> [[ADD_I_I]]
+//
int16x8_t test_vmlal_high_s8(int16x8_t a, int8x16_t b, int8x16_t c) {
return vmlal_high_s8(a, b, c);
}
-// CHECK-LABEL: @test_vmlal_high_s16(
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %c, <8 x i16> %c, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8>
-// CHECK: [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[SHUFFLE_I7_I]])
-// CHECK: [[ADD_I_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I_I]]
-// CHECK: ret <4 x i32> [[ADD_I_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vmlal_high_s16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <8 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[C]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I5_I]], <4 x i16> [[SHUFFLE_I_I]])
+// CHECK-NEXT: [[ADD_I_I:%.*]] = add <4 x i32> [[A]], [[VMULL2_I_I_I]]
+// CHECK-NEXT: ret <4 x i32> [[ADD_I_I]]
+//
int32x4_t test_vmlal_high_s16(int32x4_t a, int16x8_t b, int16x8_t c) {
return vmlal_high_s16(a, b, c);
}
-// CHECK-LABEL: @test_vmlal_high_s32(
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
-// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %c, <4 x i32> %c, <2 x i32> <i32 2, i32 3>
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8>
-// CHECK: [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[SHUFFLE_I7_I]])
-// CHECK: [[ADD_I_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I_I]]
-// CHECK: ret <2 x i64> [[ADD_I_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vmlal_high_s32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[C]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I5_I]], <2 x i32> [[SHUFFLE_I_I]])
+// CHECK-NEXT: [[ADD_I_I:%.*]] = add <2 x i64> [[A]], [[VMULL2_I_I_I]]
+// CHECK-NEXT: ret <2 x i64> [[ADD_I_I]]
+//
int64x2_t test_vmlal_high_s32(int64x2_t a, int32x4_t b, int32x4_t c) {
return vmlal_high_s32(a, b, c);
}
-// CHECK-LABEL: @test_vmlal_high_u8(
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %c, <16 x i8> %c, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK: [[VMULL_I_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]])
-// CHECK: [[ADD_I_I:%.*]] = add <8 x i16> %a, [[VMULL_I_I_I]]
-// CHECK: ret <8 x i16> [[ADD_I_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vmlal_high_u8(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]], <16 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> [[C]], <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT: [[VMULL_I_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> [[SHUFFLE_I5_I]], <8 x i8> [[SHUFFLE_I_I]])
+// CHECK-NEXT: [[ADD_I_I:%.*]] = add <8 x i16> [[A]], [[VMULL_I_I_I]]
+// CHECK-NEXT: ret <8 x i16> [[ADD_I_I]]
+//
uint16x8_t test_vmlal_high_u8(uint16x8_t a, uint8x16_t b, uint8x16_t c) {
return vmlal_high_u8(a, b, c);
}
-// CHECK-LABEL: @test_vmlal_high_u16(
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %c, <8 x i16> %c, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8>
-// CHECK: [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[SHUFFLE_I7_I]])
-// CHECK: [[ADD_I_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I_I]]
-// CHECK: ret <4 x i32> [[ADD_I_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vmlal_high_u16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <8 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[C]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I5_I]], <4 x i16> [[SHUFFLE_I_I]])
+// CHECK-NEXT: [[ADD_I_I:%.*]] = add <4 x i32> [[A]], [[VMULL2_I_I_I]]
+// CHECK-NEXT: ret <4 x i32> [[ADD_I_I]]
+//
uint32x4_t test_vmlal_high_u16(uint32x4_t a, uint16x8_t b, uint16x8_t c) {
return vmlal_high_u16(a, b, c);
}
-// CHECK-LABEL: @test_vmlal_high_u32(
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
-// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %c, <4 x i32> %c, <2 x i32> <i32 2, i32 3>
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8>
-// CHECK: [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[SHUFFLE_I7_I]])
-// CHECK: [[ADD_I_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I_I]]
-// CHECK: ret <2 x i64> [[ADD_I_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vmlal_high_u32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[C]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I5_I]], <2 x i32> [[SHUFFLE_I_I]])
+// CHECK-NEXT: [[ADD_I_I:%.*]] = add <2 x i64> [[A]], [[VMULL2_I_I_I]]
+// CHECK-NEXT: ret <2 x i64> [[ADD_I_I]]
+//
uint64x2_t test_vmlal_high_u32(uint64x2_t a, uint32x4_t b, uint32x4_t c) {
return vmlal_high_u32(a, b, c);
}
-// CHECK-LABEL: @test_vmlsl_s8(
-// CHECK: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %b, <8 x i8> %c)
-// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[VMULL_I_I]]
-// CHECK: ret <8 x i16> [[SUB_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vmlsl_s8(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> [[B]], <8 x i8> [[C]])
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> [[A]], [[VMULL_I_I]]
+// CHECK-NEXT: ret <8 x i16> [[SUB_I]]
+//
int16x8_t test_vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c) {
return vmlsl_s8(a, b, c);
}
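// Note: vmlsl_* mirrors vmlal_* with subtraction, a - vmull(b, c), so the
// widened product is subtracted from the accumulator instead of added.
// Illustrative equivalence: vmlsl_s8(a, b, c) == vsubq_s16(a, vmull_s8(b, c)).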
-// CHECK-LABEL: @test_vmlsl_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
-// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %c)
-// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]]
-// CHECK: ret <4 x i32> [[SUB_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vmlsl_s16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[C]])
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A]], [[VMULL2_I_I]]
+// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
+//
int32x4_t test_vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
return vmlsl_s16(a, b, c);
}
-// CHECK-LABEL: @test_vmlsl_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
-// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %c)
-// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]]
-// CHECK: ret <2 x i64> [[SUB_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vmlsl_s32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[C]])
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[A]], [[VMULL2_I_I]]
+// CHECK-NEXT: ret <2 x i64> [[SUB_I]]
+//
int64x2_t test_vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
return vmlsl_s32(a, b, c);
}
-// CHECK-LABEL: @test_vmlsl_u8(
-// CHECK: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %b, <8 x i8> %c)
-// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[VMULL_I_I]]
-// CHECK: ret <8 x i16> [[SUB_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vmlsl_u8(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> [[B]], <8 x i8> [[C]])
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> [[A]], [[VMULL_I_I]]
+// CHECK-NEXT: ret <8 x i16> [[SUB_I]]
+//
uint16x8_t test_vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) {
return vmlsl_u8(a, b, c);
}
-// CHECK-LABEL: @test_vmlsl_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
-// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %c)
-// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]]
-// CHECK: ret <4 x i32> [[SUB_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vmlsl_u16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[C]])
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A]], [[VMULL2_I_I]]
+// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
+//
uint32x4_t test_vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) {
return vmlsl_u16(a, b, c);
}
-// CHECK-LABEL: @test_vmlsl_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
-// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %c)
-// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]]
-// CHECK: ret <2 x i64> [[SUB_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vmlsl_u32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[C]])
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[A]], [[VMULL2_I_I]]
+// CHECK-NEXT: ret <2 x i64> [[SUB_I]]
+//
uint64x2_t test_vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) {
return vmlsl_u32(a, b, c);
}
-// CHECK-LABEL: @test_vmlsl_high_s8(
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %c, <16 x i8> %c, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK: [[VMULL_I_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]])
-// CHECK: [[SUB_I_I:%.*]] = sub <8 x i16> %a, [[VMULL_I_I_I]]
-// CHECK: ret <8 x i16> [[SUB_I_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vmlsl_high_s8(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]], <16 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> [[C]], <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT: [[VMULL_I_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> [[SHUFFLE_I5_I]], <8 x i8> [[SHUFFLE_I_I]])
+// CHECK-NEXT: [[SUB_I_I:%.*]] = sub <8 x i16> [[A]], [[VMULL_I_I_I]]
+// CHECK-NEXT: ret <8 x i16> [[SUB_I_I]]
+//
int16x8_t test_vmlsl_high_s8(int16x8_t a, int8x16_t b, int8x16_t c) {
return vmlsl_high_s8(a, b, c);
}
-// CHECK-LABEL: @test_vmlsl_high_s16(
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %c, <8 x i16> %c, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8>
-// CHECK: [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[SHUFFLE_I7_I]])
-// CHECK: [[SUB_I_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I_I]]
-// CHECK: ret <4 x i32> [[SUB_I_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vmlsl_high_s16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <8 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[C]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I5_I]], <4 x i16> [[SHUFFLE_I_I]])
+// CHECK-NEXT: [[SUB_I_I:%.*]] = sub <4 x i32> [[A]], [[VMULL2_I_I_I]]
+// CHECK-NEXT: ret <4 x i32> [[SUB_I_I]]
+//
int32x4_t test_vmlsl_high_s16(int32x4_t a, int16x8_t b, int16x8_t c) {
return vmlsl_high_s16(a, b, c);
}
-// CHECK-LABEL: @test_vmlsl_high_s32(
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
-// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %c, <4 x i32> %c, <2 x i32> <i32 2, i32 3>
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8>
-// CHECK: [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[SHUFFLE_I7_I]])
-// CHECK: [[SUB_I_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I_I]]
-// CHECK: ret <2 x i64> [[SUB_I_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vmlsl_high_s32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[C]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I5_I]], <2 x i32> [[SHUFFLE_I_I]])
+// CHECK-NEXT: [[SUB_I_I:%.*]] = sub <2 x i64> [[A]], [[VMULL2_I_I_I]]
+// CHECK-NEXT: ret <2 x i64> [[SUB_I_I]]
+//
int64x2_t test_vmlsl_high_s32(int64x2_t a, int32x4_t b, int32x4_t c) {
return vmlsl_high_s32(a, b, c);
}
-// CHECK-LABEL: @test_vmlsl_high_u8(
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %c, <16 x i8> %c, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK: [[VMULL_I_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]])
-// CHECK: [[SUB_I_I:%.*]] = sub <8 x i16> %a, [[VMULL_I_I_I]]
-// CHECK: ret <8 x i16> [[SUB_I_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vmlsl_high_u8(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]], <16 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> [[C]], <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT: [[VMULL_I_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> [[SHUFFLE_I5_I]], <8 x i8> [[SHUFFLE_I_I]])
+// CHECK-NEXT: [[SUB_I_I:%.*]] = sub <8 x i16> [[A]], [[VMULL_I_I_I]]
+// CHECK-NEXT: ret <8 x i16> [[SUB_I_I]]
+//
uint16x8_t test_vmlsl_high_u8(uint16x8_t a, uint8x16_t b, uint8x16_t c) {
return vmlsl_high_u8(a, b, c);
}
-// CHECK-LABEL: @test_vmlsl_high_u16(
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %c, <8 x i16> %c, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8>
-// CHECK: [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[SHUFFLE_I7_I]])
-// CHECK: [[SUB_I_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I_I]]
-// CHECK: ret <4 x i32> [[SUB_I_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vmlsl_high_u16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <8 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[C]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I5_I]], <4 x i16> [[SHUFFLE_I_I]])
+// CHECK-NEXT: [[SUB_I_I:%.*]] = sub <4 x i32> [[A]], [[VMULL2_I_I_I]]
+// CHECK-NEXT: ret <4 x i32> [[SUB_I_I]]
+//
uint32x4_t test_vmlsl_high_u16(uint32x4_t a, uint16x8_t b, uint16x8_t c) {
return vmlsl_high_u16(a, b, c);
}
-// CHECK-LABEL: @test_vmlsl_high_u32(
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
-// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %c, <4 x i32> %c, <2 x i32> <i32 2, i32 3>
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8>
-// CHECK: [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[SHUFFLE_I7_I]])
-// CHECK: [[SUB_I_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I_I]]
-// CHECK: ret <2 x i64> [[SUB_I_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vmlsl_high_u32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[C]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I5_I]], <2 x i32> [[SHUFFLE_I_I]])
+// CHECK-NEXT: [[SUB_I_I:%.*]] = sub <2 x i64> [[A]], [[VMULL2_I_I_I]]
+// CHECK-NEXT: ret <2 x i64> [[SUB_I_I]]
+//
uint64x2_t test_vmlsl_high_u32(uint64x2_t a, uint32x4_t b, uint32x4_t c) {
return vmlsl_high_u32(a, b, c);
}
-// CHECK-LABEL: @test_vqdmull_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VQDMULL_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vqdmull_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]]
+//
int32x4_t test_vqdmull_s16(int16x4_t a, int16x4_t b) {
return vqdmull_s16(a, b);
}
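// Note: vqdmull_* is the saturating doubling widening multiply, roughly
// res[i] = sat(2 * a[i] * b[i]). For 16-bit lanes the doubling overflows only
// when a[i] == b[i] == INT16_MIN (2 * 0x40000000 == 2^31), which saturates to
// INT32_MAX; this is why sqdmull is used rather than a plain smull.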
-// CHECK-LABEL: @test_vqdmull_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
-// CHECK: ret <2 x i64> [[VQDMULL_V2_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vqdmull_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]]
+//
int64x2_t test_vqdmull_s32(int32x2_t a, int32x2_t b) {
return vqdmull_s32(a, b);
}
-// CHECK-LABEL: @test_vqdmlal_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %c to <8 x i8>
-// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %c)
-// CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]])
-// CHECK: ret <4 x i32> [[VQDMLAL_V3_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vqdmlal_s16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[C]])
+// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
+// CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]]
+//
int32x4_t test_vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
return vqdmlal_s16(a, b, c);
}
-// CHECK-LABEL: @test_vqdmlal_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %c to <8 x i8>
-// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %c)
-// CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]])
-// CHECK: ret <2 x i64> [[VQDMLAL_V3_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vqdmlal_s32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[C]])
+// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
+// CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]]
+//
int64x2_t test_vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
return vqdmlal_s32(a, b, c);
}
-// CHECK-LABEL: @test_vqdmlsl_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %c to <8 x i8>
-// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %c)
-// CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]])
-// CHECK: ret <4 x i32> [[VQDMLSL_V3_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vqdmlsl_s16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[C]])
+// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
+// CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]]
+//
int32x4_t test_vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
return vqdmlsl_s16(a, b, c);
}
-// CHECK-LABEL: @test_vqdmlsl_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %c to <8 x i8>
-// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %c)
-// CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]])
-// CHECK: ret <2 x i64> [[VQDMLSL_V3_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vqdmlsl_s32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[C]])
+// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
+// CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]]
+//
int64x2_t test_vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
return vqdmlsl_s32(a, b, c);
}
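// Note: vqdmlal_*/vqdmlsl_* chain the saturating doubling multiply with a
// saturating accumulate: sqadd(a, sqdmull(b, c)) and sqsub(a, sqdmull(b, c)),
// so saturation may occur in either stage.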
-// CHECK-LABEL: @test_vqdmull_high_s16(
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8>
-// CHECK: [[VQDMULL_V2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[SHUFFLE_I7_I]])
-// CHECK: [[VQDMULL_V3_I_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VQDMULL_V2_I_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vqdmull_high_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[VQDMULL_V2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I5_I]], <4 x i16> [[SHUFFLE_I_I]])
+// CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I_I]]
+//
int32x4_t test_vqdmull_high_s16(int16x8_t a, int16x8_t b) {
return vqdmull_high_s16(a, b);
}
-// CHECK-LABEL: @test_vqdmull_high_s32(
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
-// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8>
-// CHECK: [[VQDMULL_V2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[SHUFFLE_I7_I]])
-// CHECK: [[VQDMULL_V3_I_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I_I]] to <16 x i8>
-// CHECK: ret <2 x i64> [[VQDMULL_V2_I_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vqdmull_high_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[VQDMULL_V2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I5_I]], <2 x i32> [[SHUFFLE_I_I]])
+// CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I_I]]
+//
int64x2_t test_vqdmull_high_s32(int32x4_t a, int32x4_t b) {
return vqdmull_high_s32(a, b);
}
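// Note: the _high forms of the saturating multiplies compose the same
// upper-half extraction (shuffle with a poison second operand) with sqdmull,
// exactly as for vmull_high_* above.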
-// CHECK-LABEL: @test_vqdmlal_high_s16(
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %c, <8 x i16> %c, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8>
-// CHECK: [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[SHUFFLE_I7_I]])
-// CHECK: [[VQDMLAL_V3_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I_I]])
-// CHECK: ret <4 x i32> [[VQDMLAL_V3_I_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vqdmlal_high_s16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <8 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[C]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I5_I]], <4 x i16> [[SHUFFLE_I_I]])
+// CHECK-NEXT: [[VQDMLAL_V3_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I_I]])
+// CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I_I]]
+//
int32x4_t test_vqdmlal_high_s16(int32x4_t a, int16x8_t b, int16x8_t c) {
return vqdmlal_high_s16(a, b, c);
}
-// CHECK-LABEL: @test_vqdmlal_high_s32(
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
-// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %c, <4 x i32> %c, <2 x i32> <i32 2, i32 3>
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8>
-// CHECK: [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[SHUFFLE_I7_I]])
-// CHECK: [[VQDMLAL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I_I]])
-// CHECK: ret <2 x i64> [[VQDMLAL_V3_I_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vqdmlal_high_s32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[C]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I5_I]], <2 x i32> [[SHUFFLE_I_I]])
+// CHECK-NEXT: [[VQDMLAL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I_I]])
+// CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I_I]]
+//
int64x2_t test_vqdmlal_high_s32(int64x2_t a, int32x4_t b, int32x4_t c) {
return vqdmlal_high_s32(a, b, c);
}
-// CHECK-LABEL: @test_vqdmlsl_high_s16(
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %c, <8 x i16> %c, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8>
-// CHECK: [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[SHUFFLE_I7_I]])
-// CHECK: [[VQDMLSL_V3_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I_I]])
-// CHECK: ret <4 x i32> [[VQDMLSL_V3_I_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vqdmlsl_high_s16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <8 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[C]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I5_I]], <4 x i16> [[SHUFFLE_I_I]])
+// CHECK-NEXT: [[VQDMLSL_V3_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I_I]])
+// CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I_I]]
+//
int32x4_t test_vqdmlsl_high_s16(int32x4_t a, int16x8_t b, int16x8_t c) {
return vqdmlsl_high_s16(a, b, c);
}
-// CHECK-LABEL: @test_vqdmlsl_high_s32(
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
-// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %c, <4 x i32> %c, <2 x i32> <i32 2, i32 3>
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8>
-// CHECK: [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[SHUFFLE_I7_I]])
-// CHECK: [[VQDMLSL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I_I]])
-// CHECK: ret <2 x i64> [[VQDMLSL_V3_I_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vqdmlsl_high_s32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[C]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I5_I]], <2 x i32> [[SHUFFLE_I_I]])
+// CHECK-NEXT: [[VQDMLSL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I_I]])
+// CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I_I]]
+//
int64x2_t test_vqdmlsl_high_s32(int64x2_t a, int32x4_t b, int32x4_t c) {
return vqdmlsl_high_s32(a, b, c);
}
-// CHECK-LABEL: @test_vmull_p8(
-// CHECK: [[VMULL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i16> [[VMULL_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vmull_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMULL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i16> [[VMULL_I]]
+//
poly16x8_t test_vmull_p8(poly8x8_t a, poly8x8_t b) {
return vmull_p8(a, b);
}
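// Note: vmull_p8 is the polynomial (carry-less) widening multiply over GF(2):
// partial products are combined with XOR instead of addition. One-lane
// example: 0x03 * 0x03 yields 0x05, since (x+1)*(x+1) = x^2 + 1 with no carries.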
-// CHECK-LABEL: @test_vmull_high_p8(
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]])
-// CHECK: ret <8 x i16> [[VMULL_I_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vmull_high_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I5:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> [[SHUFFLE_I5]], <8 x i8> [[SHUFFLE_I]])
+// CHECK-NEXT: ret <8 x i16> [[VMULL_I_I]]
+//
poly16x8_t test_vmull_high_p8(poly8x16_t a, poly8x16_t b) {
return vmull_high_p8(a, b);
}
-// CHECK-LABEL: @test_vaddd_s64(
-// CHECK: [[VADDD_I:%.*]] = add i64 %a, %b
-// CHECK: ret i64 [[VADDD_I]]
+// CHECK-LABEL: define dso_local i64 @test_vaddd_s64(
+// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VADDD_I:%.*]] = add i64 [[A]], [[B]]
+// CHECK-NEXT: ret i64 [[VADDD_I]]
+//
int64_t test_vaddd_s64(int64_t a, int64_t b) {
return vaddd_s64(a, b);
}
-// CHECK-LABEL: @test_vaddd_u64(
-// CHECK: [[VADDD_I:%.*]] = add i64 %a, %b
-// CHECK: ret i64 [[VADDD_I]]
+// CHECK-LABEL: define dso_local i64 @test_vaddd_u64(
+// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VADDD_I:%.*]] = add i64 [[A]], [[B]]
+// CHECK-NEXT: ret i64 [[VADDD_I]]
+//
uint64_t test_vaddd_u64(uint64_t a, uint64_t b) {
return vaddd_u64(a, b);
}
-// CHECK-LABEL: @test_vsubd_s64(
-// CHECK: [[VSUBD_I:%.*]] = sub i64 %a, %b
-// CHECK: ret i64 [[VSUBD_I]]
+// CHECK-LABEL: define dso_local i64 @test_vsubd_s64(
+// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSUBD_I:%.*]] = sub i64 [[A]], [[B]]
+// CHECK-NEXT: ret i64 [[VSUBD_I]]
+//
int64_t test_vsubd_s64(int64_t a, int64_t b) {
return vsubd_s64(a, b);
}
-// CHECK-LABEL: @test_vsubd_u64(
-// CHECK: [[VSUBD_I:%.*]] = sub i64 %a, %b
-// CHECK: ret i64 [[VSUBD_I]]
+// CHECK-LABEL: define dso_local i64 @test_vsubd_u64(
+// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSUBD_I:%.*]] = sub i64 [[A]], [[B]]
+// CHECK-NEXT: ret i64 [[VSUBD_I]]
+//
uint64_t test_vsubd_u64(uint64_t a, uint64_t b) {
return vsubd_u64(a, b);
}
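// Note: the scalar vaddd_*/vsubd_* forms need no target intrinsic and lower
// to plain i64 add/sub.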
-// CHECK-LABEL: @test_vqaddb_s8(
-// CHECK: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 %a, i64 0
-// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 %b, i64 0
-// CHECK: [[VQADDB_S8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqadd.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
-// CHECK: [[TMP2:%.*]] = extractelement <8 x i8> [[VQADDB_S8_I]], i64 0
-// CHECK: ret i8 [[TMP2]]
+// CHECK-LABEL: define dso_local i8 @test_vqaddb_s8(
+// CHECK-SAME: i8 noundef [[A:%.*]], i8 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 [[A]], i64 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[B]], i64 0
+// CHECK-NEXT: [[VQADDB_S8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqadd.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i8> [[VQADDB_S8_I]], i64 0
+// CHECK-NEXT: ret i8 [[TMP2]]
+//
int8_t test_vqaddb_s8(int8_t a, int8_t b) {
return vqaddb_s8(a, b);
}
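// Note: there is no i8/i16 scalar saturating-add intrinsic, so the byte and
// halfword forms insert the operands into lane 0 of a poison vector, call the
// vector sqadd/uqadd, and extract lane 0 of the result; the 32/64-bit forms
// call llvm.aarch64.neon.{s,u}qadd.{i32,i64} directly. Worked example:
// vqaddb_s8(100, 100) saturates to 127 (INT8_MAX).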
-// CHECK-LABEL: @test_vqaddh_s16(
-// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 %a, i64 0
-// CHECK: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 %b, i64 0
-// CHECK: [[VQADDH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
-// CHECK: [[TMP2:%.*]] = extractelement <4 x i16> [[VQADDH_S16_I]], i64 0
-// CHECK: ret i16 [[TMP2]]
+// CHECK-LABEL: define dso_local i16 @test_vqaddh_s16(
+// CHECK-SAME: i16 noundef [[A:%.*]], i16 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i64 0
+// CHECK-NEXT: [[VQADDH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[VQADDH_S16_I]], i64 0
+// CHECK-NEXT: ret i16 [[TMP2]]
+//
int16_t test_vqaddh_s16(int16_t a, int16_t b) {
return vqaddh_s16(a, b);
}
-// CHECK-LABEL: @test_vqadds_s32(
-// CHECK: [[VQADDS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %a, i32 %b)
-// CHECK: ret i32 [[VQADDS_S32_I]]
+// CHECK-LABEL: define dso_local i32 @test_vqadds_s32(
+// CHECK-SAME: i32 noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQADDS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqadd.i32(i32 [[A]], i32 [[B]])
+// CHECK-NEXT: ret i32 [[VQADDS_S32_I]]
+//
int32_t test_vqadds_s32(int32_t a, int32_t b) {
return vqadds_s32(a, b);
}
-// CHECK-LABEL: @test_vqaddd_s64(
-// CHECK: [[VQADDD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.sqadd.i64(i64 %a, i64 %b)
-// CHECK: ret i64 [[VQADDD_S64_I]]
+// CHECK-LABEL: define dso_local i64 @test_vqaddd_s64(
+// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQADDD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.sqadd.i64(i64 [[A]], i64 [[B]])
+// CHECK-NEXT: ret i64 [[VQADDD_S64_I]]
+//
int64_t test_vqaddd_s64(int64_t a, int64_t b) {
return vqaddd_s64(a, b);
}
-// CHECK-LABEL: @test_vqaddb_u8(
-// CHECK: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 %a, i64 0
-// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 %b, i64 0
-// CHECK: [[VQADDB_U8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqadd.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
-// CHECK: [[TMP2:%.*]] = extractelement <8 x i8> [[VQADDB_U8_I]], i64 0
-// CHECK: ret i8 [[TMP2]]
+// CHECK-LABEL: define dso_local i8 @test_vqaddb_u8(
+// CHECK-SAME: i8 noundef [[A:%.*]], i8 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 [[A]], i64 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[B]], i64 0
+// CHECK-NEXT: [[VQADDB_U8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqadd.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i8> [[VQADDB_U8_I]], i64 0
+// CHECK-NEXT: ret i8 [[TMP2]]
+//
uint8_t test_vqaddb_u8(uint8_t a, uint8_t b) {
return vqaddb_u8(a, b);
}
-// CHECK-LABEL: @test_vqaddh_u16(
-// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 %a, i64 0
-// CHECK: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 %b, i64 0
-// CHECK: [[VQADDH_U16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqadd.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
-// CHECK: [[TMP2:%.*]] = extractelement <4 x i16> [[VQADDH_U16_I]], i64 0
-// CHECK: ret i16 [[TMP2]]
+// CHECK-LABEL: define dso_local i16 @test_vqaddh_u16(
+// CHECK-SAME: i16 noundef [[A:%.*]], i16 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i64 0
+// CHECK-NEXT: [[VQADDH_U16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqadd.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[VQADDH_U16_I]], i64 0
+// CHECK-NEXT: ret i16 [[TMP2]]
+//
uint16_t test_vqaddh_u16(uint16_t a, uint16_t b) {
return vqaddh_u16(a, b);
}
-// CHECK-LABEL: @test_vqadds_u32(
-// CHECK: [[VQADDS_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uqadd.i32(i32 %a, i32 %b)
-// CHECK: ret i32 [[VQADDS_U32_I]]
+// CHECK-LABEL: define dso_local i32 @test_vqadds_u32(
+// CHECK-SAME: i32 noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQADDS_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uqadd.i32(i32 [[A]], i32 [[B]])
+// CHECK-NEXT: ret i32 [[VQADDS_U32_I]]
+//
uint32_t test_vqadds_u32(uint32_t a, uint32_t b) {
return vqadds_u32(a, b);
}
-// CHECK-LABEL: @test_vqaddd_u64(
-// CHECK: [[VQADDD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.uqadd.i64(i64 %a, i64 %b)
-// CHECK: ret i64 [[VQADDD_U64_I]]
+// CHECK-LABEL: define dso_local i64 @test_vqaddd_u64(
+// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQADDD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.uqadd.i64(i64 [[A]], i64 [[B]])
+// CHECK-NEXT: ret i64 [[VQADDD_U64_I]]
+//
uint64_t test_vqaddd_u64(uint64_t a, uint64_t b) {
return vqaddd_u64(a, b);
}
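// Illustrative sketch (not part of the patch): the scalar saturating adds
// tested above clamp at the type bounds instead of wrapping. Assumes
// <arm_neon.h> is in scope, as it is at the top of this test file.
static uint8_t demo_vqaddb_u8_saturates(void) {
  // 250 + 10 wraps to 4 in plain uint8_t arithmetic; vqaddb_u8 yields 255.
  return vqaddb_u8((uint8_t)250, (uint8_t)10);
}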
-// CHECK-LABEL: @test_vqsubb_s8(
-// CHECK: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 %a, i64 0
-// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 %b, i64 0
-// CHECK: [[VQSUBB_S8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqsub.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
-// CHECK: [[TMP2:%.*]] = extractelement <8 x i8> [[VQSUBB_S8_I]], i64 0
-// CHECK: ret i8 [[TMP2]]
+// CHECK-LABEL: define dso_local i8 @test_vqsubb_s8(
+// CHECK-SAME: i8 noundef [[A:%.*]], i8 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 [[A]], i64 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[B]], i64 0
+// CHECK-NEXT: [[VQSUBB_S8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqsub.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i8> [[VQSUBB_S8_I]], i64 0
+// CHECK-NEXT: ret i8 [[TMP2]]
+//
int8_t test_vqsubb_s8(int8_t a, int8_t b) {
return vqsubb_s8(a, b);
}
-// CHECK-LABEL: @test_vqsubh_s16(
-// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 %a, i64 0
-// CHECK: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 %b, i64 0
-// CHECK: [[VQSUBH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
-// CHECK: [[TMP2:%.*]] = extractelement <4 x i16> [[VQSUBH_S16_I]], i64 0
-// CHECK: ret i16 [[TMP2]]
+// CHECK-LABEL: define dso_local i16 @test_vqsubh_s16(
+// CHECK-SAME: i16 noundef [[A:%.*]], i16 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i64 0
+// CHECK-NEXT: [[VQSUBH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[VQSUBH_S16_I]], i64 0
+// CHECK-NEXT: ret i16 [[TMP2]]
+//
int16_t test_vqsubh_s16(int16_t a, int16_t b) {
return vqsubh_s16(a, b);
}
-// CHECK-LABEL: @test_vqsubs_s32(
-// CHECK: [[VQSUBS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %a, i32 %b)
-// CHECK: ret i32 [[VQSUBS_S32_I]]
+// CHECK-LABEL: define dso_local i32 @test_vqsubs_s32(
+// CHECK-SAME: i32 noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSUBS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqsub.i32(i32 [[A]], i32 [[B]])
+// CHECK-NEXT: ret i32 [[VQSUBS_S32_I]]
+//
int32_t test_vqsubs_s32(int32_t a, int32_t b) {
return vqsubs_s32(a, b);
}
-// CHECK-LABEL: @test_vqsubd_s64(
-// CHECK: [[VQSUBD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.sqsub.i64(i64 %a, i64 %b)
-// CHECK: ret i64 [[VQSUBD_S64_I]]
+// CHECK-LABEL: define dso_local i64 @test_vqsubd_s64(
+// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSUBD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.sqsub.i64(i64 [[A]], i64 [[B]])
+// CHECK-NEXT: ret i64 [[VQSUBD_S64_I]]
+//
int64_t test_vqsubd_s64(int64_t a, int64_t b) {
return vqsubd_s64(a, b);
}
-// CHECK-LABEL: @test_vqsubb_u8(
-// CHECK: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 %a, i64 0
-// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 %b, i64 0
-// CHECK: [[VQSUBB_U8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqsub.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
-// CHECK: [[TMP2:%.*]] = extractelement <8 x i8> [[VQSUBB_U8_I]], i64 0
-// CHECK: ret i8 [[TMP2]]
+// CHECK-LABEL: define dso_local i8 @test_vqsubb_u8(
+// CHECK-SAME: i8 noundef [[A:%.*]], i8 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 [[A]], i64 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[B]], i64 0
+// CHECK-NEXT: [[VQSUBB_U8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqsub.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i8> [[VQSUBB_U8_I]], i64 0
+// CHECK-NEXT: ret i8 [[TMP2]]
+//
uint8_t test_vqsubb_u8(uint8_t a, uint8_t b) {
return vqsubb_u8(a, b);
}
-// CHECK-LABEL: @test_vqsubh_u16(
-// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 %a, i64 0
-// CHECK: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 %b, i64 0
-// CHECK: [[VQSUBH_U16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqsub.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
-// CHECK: [[TMP2:%.*]] = extractelement <4 x i16> [[VQSUBH_U16_I]], i64 0
-// CHECK: ret i16 [[TMP2]]
+// CHECK-LABEL: define dso_local i16 @test_vqsubh_u16(
+// CHECK-SAME: i16 noundef [[A:%.*]], i16 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i64 0
+// CHECK-NEXT: [[VQSUBH_U16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqsub.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[VQSUBH_U16_I]], i64 0
+// CHECK-NEXT: ret i16 [[TMP2]]
+//
uint16_t test_vqsubh_u16(uint16_t a, uint16_t b) {
return vqsubh_u16(a, b);
}
-// CHECK-LABEL: @test_vqsubs_u32(
-// CHECK: [[VQSUBS_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uqsub.i32(i32 %a, i32 %b)
-// CHECK: ret i32 [[VQSUBS_U32_I]]
+// CHECK-LABEL: define dso_local i32 @test_vqsubs_u32(
+// CHECK-SAME: i32 noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSUBS_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uqsub.i32(i32 [[A]], i32 [[B]])
+// CHECK-NEXT: ret i32 [[VQSUBS_U32_I]]
+//
uint32_t test_vqsubs_u32(uint32_t a, uint32_t b) {
return vqsubs_u32(a, b);
}
-// CHECK-LABEL: @test_vqsubd_u64(
-// CHECK: [[VQSUBD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.uqsub.i64(i64 %a, i64 %b)
-// CHECK: ret i64 [[VQSUBD_U64_I]]
+// CHECK-LABEL: define dso_local i64 @test_vqsubd_u64(
+// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSUBD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.uqsub.i64(i64 [[A]], i64 [[B]])
+// CHECK-NEXT: ret i64 [[VQSUBD_U64_I]]
+//
uint64_t test_vqsubd_u64(uint64_t a, uint64_t b) {
return vqsubd_u64(a, b);
}
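// Illustrative sketch (not part of the patch): the unsigned saturating
// subtracts above clamp at zero rather than wrapping around.
static uint8_t demo_vqsubb_u8_saturates(void) {
  // 5 - 10 wraps to 251 in plain uint8_t arithmetic; vqsubb_u8 yields 0.
  return vqsubb_u8((uint8_t)5, (uint8_t)10);
}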
-// CHECK-LABEL: @test_vshld_s64(
-// CHECK: [[VSHLD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.sshl.i64(i64 %a, i64 %b)
-// CHECK: ret i64 [[VSHLD_S64_I]]
+// CHECK-LABEL: define dso_local i64 @test_vshld_s64(
+// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHLD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.sshl.i64(i64 [[A]], i64 [[B]])
+// CHECK-NEXT: ret i64 [[VSHLD_S64_I]]
+//
int64_t test_vshld_s64(int64_t a, int64_t b) {
return vshld_s64(a, b);
}
-// CHECK-LABEL: @test_vshld_u64(
-// CHECK: [[VSHLD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.ushl.i64(i64 %a, i64 %b)
-// CHECK: ret i64 [[VSHLD_U64_I]]
+// CHECK-LABEL: define dso_local i64 @test_vshld_u64(
+// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHLD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.ushl.i64(i64 [[A]], i64 [[B]])
+// CHECK-NEXT: ret i64 [[VSHLD_U64_I]]
+//
uint64_t test_vshld_u64(uint64_t a, int64_t b) {
return vshld_u64(a, b);
}
-// CHECK-LABEL: @test_vqshlb_s8(
-// CHECK: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 %a, i64 0
-// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 %b, i64 0
-// CHECK: [[VQSHLB_S8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
-// CHECK: [[TMP2:%.*]] = extractelement <8 x i8> [[VQSHLB_S8_I]], i64 0
-// CHECK: ret i8 [[TMP2]]
+// CHECK-LABEL: define dso_local i8 @test_vqshlb_s8(
+// CHECK-SAME: i8 noundef [[A:%.*]], i8 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 [[A]], i64 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[B]], i64 0
+// CHECK-NEXT: [[VQSHLB_S8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i8> [[VQSHLB_S8_I]], i64 0
+// CHECK-NEXT: ret i8 [[TMP2]]
+//
int8_t test_vqshlb_s8(int8_t a, int8_t b) {
return vqshlb_s8(a, b);
}
-// CHECK-LABEL: @test_vqshlh_s16(
-// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 %a, i64 0
-// CHECK: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 %b, i64 0
-// CHECK: [[VQSHLH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshl.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
-// CHECK: [[TMP2:%.*]] = extractelement <4 x i16> [[VQSHLH_S16_I]], i64 0
-// CHECK: ret i16 [[TMP2]]
+// CHECK-LABEL: define dso_local i16 @test_vqshlh_s16(
+// CHECK-SAME: i16 noundef [[A:%.*]], i16 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i64 0
+// CHECK-NEXT: [[VQSHLH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshl.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[VQSHLH_S16_I]], i64 0
+// CHECK-NEXT: ret i16 [[TMP2]]
+//
int16_t test_vqshlh_s16(int16_t a, int16_t b) {
return vqshlh_s16(a, b);
}
-// CHECK-LABEL: @test_vqshls_s32(
-// CHECK: [[VQSHLS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqshl.i32(i32 %a, i32 %b)
-// CHECK: ret i32 [[VQSHLS_S32_I]]
+// CHECK-LABEL: define dso_local i32 @test_vqshls_s32(
+// CHECK-SAME: i32 noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHLS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqshl.i32(i32 [[A]], i32 [[B]])
+// CHECK-NEXT: ret i32 [[VQSHLS_S32_I]]
+//
int32_t test_vqshls_s32(int32_t a, int32_t b) {
return vqshls_s32(a, b);
}
-// CHECK-LABEL: @test_vqshld_s64(
-// CHECK: [[VQSHLD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.sqshl.i64(i64 %a, i64 %b)
-// CHECK: ret i64 [[VQSHLD_S64_I]]
+// CHECK-LABEL: define dso_local i64 @test_vqshld_s64(
+// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHLD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.sqshl.i64(i64 [[A]], i64 [[B]])
+// CHECK-NEXT: ret i64 [[VQSHLD_S64_I]]
+//
int64_t test_vqshld_s64(int64_t a, int64_t b) {
return vqshld_s64(a, b);
}
-// CHECK-LABEL: @test_vqshlb_u8(
-// CHECK: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 %a, i64 0
-// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 %b, i64 0
-// CHECK: [[VQSHLB_U8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
-// CHECK: [[TMP2:%.*]] = extractelement <8 x i8> [[VQSHLB_U8_I]], i64 0
-// CHECK: ret i8 [[TMP2]]
+// CHECK-LABEL: define dso_local i8 @test_vqshlb_u8(
+// CHECK-SAME: i8 noundef [[A:%.*]], i8 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 [[A]], i64 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[B]], i64 0
+// CHECK-NEXT: [[VQSHLB_U8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i8> [[VQSHLB_U8_I]], i64 0
+// CHECK-NEXT: ret i8 [[TMP2]]
+//
uint8_t test_vqshlb_u8(uint8_t a, int8_t b) {
return vqshlb_u8(a, b);
}
-// CHECK-LABEL: @test_vqshlh_u16(
-// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 %a, i64 0
-// CHECK: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 %b, i64 0
-// CHECK: [[VQSHLH_U16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshl.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
-// CHECK: [[TMP2:%.*]] = extractelement <4 x i16> [[VQSHLH_U16_I]], i64 0
-// CHECK: ret i16 [[TMP2]]
+// CHECK-LABEL: define dso_local i16 @test_vqshlh_u16(
+// CHECK-SAME: i16 noundef [[A:%.*]], i16 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i64 0
+// CHECK-NEXT: [[VQSHLH_U16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshl.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[VQSHLH_U16_I]], i64 0
+// CHECK-NEXT: ret i16 [[TMP2]]
+//
uint16_t test_vqshlh_u16(uint16_t a, int16_t b) {
return vqshlh_u16(a, b);
}
-// CHECK-LABEL: @test_vqshls_u32(
-// CHECK: [[VQSHLS_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uqshl.i32(i32 %a, i32 %b)
-// CHECK: ret i32 [[VQSHLS_U32_I]]
+// CHECK-LABEL: define dso_local i32 @test_vqshls_u32(
+// CHECK-SAME: i32 noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHLS_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uqshl.i32(i32 [[A]], i32 [[B]])
+// CHECK-NEXT: ret i32 [[VQSHLS_U32_I]]
+//
uint32_t test_vqshls_u32(uint32_t a, int32_t b) {
return vqshls_u32(a, b);
}
-// CHECK-LABEL: @test_vqshld_u64(
-// CHECK: [[VQSHLD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.uqshl.i64(i64 %a, i64 %b)
-// CHECK: ret i64 [[VQSHLD_U64_I]]
+// CHECK-LABEL: define dso_local i64 @test_vqshld_u64(
+// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHLD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.uqshl.i64(i64 [[A]], i64 [[B]])
+// CHECK-NEXT: ret i64 [[VQSHLD_U64_I]]
+//
uint64_t test_vqshld_u64(uint64_t a, int64_t b) {
return vqshld_u64(a, b);
}
-// CHECK-LABEL: @test_vrshld_s64(
-// CHECK: [[VRSHLD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.srshl.i64(i64 %a, i64 %b)
-// CHECK: ret i64 [[VRSHLD_S64_I]]
+// CHECK-LABEL: define dso_local i64 @test_vrshld_s64(
+// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHLD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.srshl.i64(i64 [[A]], i64 [[B]])
+// CHECK-NEXT: ret i64 [[VRSHLD_S64_I]]
+//
int64_t test_vrshld_s64(int64_t a, int64_t b) {
return vrshld_s64(a, b);
}
-// CHECK-LABEL: @test_vrshld_u64(
-// CHECK: [[VRSHLD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.urshl.i64(i64 %a, i64 %b)
-// CHECK: ret i64 [[VRSHLD_U64_I]]
+// CHECK-LABEL: define dso_local i64 @test_vrshld_u64(
+// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHLD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.urshl.i64(i64 [[A]], i64 [[B]])
+// CHECK-NEXT: ret i64 [[VRSHLD_U64_I]]
+//
uint64_t test_vrshld_u64(uint64_t a, int64_t b) {
return vrshld_u64(a, b);
}
-// CHECK-LABEL: @test_vqrshlb_s8(
-// CHECK: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 %a, i64 0
-// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 %b, i64 0
-// CHECK: [[VQRSHLB_S8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
-// CHECK: [[TMP2:%.*]] = extractelement <8 x i8> [[VQRSHLB_S8_I]], i64 0
-// CHECK: ret i8 [[TMP2]]
+// CHECK-LABEL: define dso_local i8 @test_vqrshlb_s8(
+// CHECK-SAME: i8 noundef [[A:%.*]], i8 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 [[A]], i64 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[B]], i64 0
+// CHECK-NEXT: [[VQRSHLB_S8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i8> [[VQRSHLB_S8_I]], i64 0
+// CHECK-NEXT: ret i8 [[TMP2]]
+//
int8_t test_vqrshlb_s8(int8_t a, int8_t b) {
return vqrshlb_s8(a, b);
}
-// CHECK-LABEL: @test_vqrshlh_s16(
-// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 %a, i64 0
-// CHECK: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 %b, i64 0
-// CHECK: [[VQRSHLH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshl.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
-// CHECK: [[TMP2:%.*]] = extractelement <4 x i16> [[VQRSHLH_S16_I]], i64 0
-// CHECK: ret i16 [[TMP2]]
+// CHECK-LABEL: define dso_local i16 @test_vqrshlh_s16(
+// CHECK-SAME: i16 noundef [[A:%.*]], i16 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i64 0
+// CHECK-NEXT: [[VQRSHLH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshl.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[VQRSHLH_S16_I]], i64 0
+// CHECK-NEXT: ret i16 [[TMP2]]
+//
int16_t test_vqrshlh_s16(int16_t a, int16_t b) {
return vqrshlh_s16(a, b);
}
-// CHECK-LABEL: @test_vqrshls_s32(
-// CHECK: [[VQRSHLS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrshl.i32(i32 %a, i32 %b)
-// CHECK: ret i32 [[VQRSHLS_S32_I]]
+// CHECK-LABEL: define dso_local i32 @test_vqrshls_s32(
+// CHECK-SAME: i32 noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHLS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrshl.i32(i32 [[A]], i32 [[B]])
+// CHECK-NEXT: ret i32 [[VQRSHLS_S32_I]]
+//
int32_t test_vqrshls_s32(int32_t a, int32_t b) {
return vqrshls_s32(a, b);
}
-// CHECK-LABEL: @test_vqrshld_s64(
-// CHECK: [[VQRSHLD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.sqrshl.i64(i64 %a, i64 %b)
-// CHECK: ret i64 [[VQRSHLD_S64_I]]
+// CHECK-LABEL: define dso_local i64 @test_vqrshld_s64(
+// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHLD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.sqrshl.i64(i64 [[A]], i64 [[B]])
+// CHECK-NEXT: ret i64 [[VQRSHLD_S64_I]]
+//
int64_t test_vqrshld_s64(int64_t a, int64_t b) {
return vqrshld_s64(a, b);
}
-// CHECK-LABEL: @test_vqrshlb_u8(
-// CHECK: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 %a, i64 0
-// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 %b, i64 0
-// CHECK: [[VQRSHLB_U8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqrshl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
-// CHECK: [[TMP2:%.*]] = extractelement <8 x i8> [[VQRSHLB_U8_I]], i64 0
-// CHECK: ret i8 [[TMP2]]
+// CHECK-LABEL: define dso_local i8 @test_vqrshlb_u8(
+// CHECK-SAME: i8 noundef [[A:%.*]], i8 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 [[A]], i64 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[B]], i64 0
+// CHECK-NEXT: [[VQRSHLB_U8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqrshl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i8> [[VQRSHLB_U8_I]], i64 0
+// CHECK-NEXT: ret i8 [[TMP2]]
+//
uint8_t test_vqrshlb_u8(uint8_t a, int8_t b) {
return vqrshlb_u8(a, b);
}
-// CHECK-LABEL: @test_vqrshlh_u16(
-// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 %a, i64 0
-// CHECK: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 %b, i64 0
-// CHECK: [[VQRSHLH_U16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqrshl.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
-// CHECK: [[TMP2:%.*]] = extractelement <4 x i16> [[VQRSHLH_U16_I]], i64 0
-// CHECK: ret i16 [[TMP2]]
+// CHECK-LABEL: define dso_local i16 @test_vqrshlh_u16(
+// CHECK-SAME: i16 noundef [[A:%.*]], i16 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i64 0
+// CHECK-NEXT: [[VQRSHLH_U16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqrshl.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[VQRSHLH_U16_I]], i64 0
+// CHECK-NEXT: ret i16 [[TMP2]]
+//
uint16_t test_vqrshlh_u16(uint16_t a, int16_t b) {
return vqrshlh_u16(a, b);
}
-// CHECK-LABEL: @test_vqrshls_u32(
-// CHECK: [[VQRSHLS_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uqrshl.i32(i32 %a, i32 %b)
-// CHECK: ret i32 [[VQRSHLS_U32_I]]
+// CHECK-LABEL: define dso_local i32 @test_vqrshls_u32(
+// CHECK-SAME: i32 noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHLS_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uqrshl.i32(i32 [[A]], i32 [[B]])
+// CHECK-NEXT: ret i32 [[VQRSHLS_U32_I]]
+//
uint32_t test_vqrshls_u32(uint32_t a, int32_t b) {
return vqrshls_u32(a, b);
}
-// CHECK-LABEL: @test_vqrshld_u64(
-// CHECK: [[VQRSHLD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.uqrshl.i64(i64 %a, i64 %b)
-// CHECK: ret i64 [[VQRSHLD_U64_I]]
+// CHECK-LABEL: define dso_local i64 @test_vqrshld_u64(
+// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHLD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.uqrshl.i64(i64 [[A]], i64 [[B]])
+// CHECK-NEXT: ret i64 [[VQRSHLD_U64_I]]
+//
uint64_t test_vqrshld_u64(uint64_t a, int64_t b) {
return vqrshld_u64(a, b);
}
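// Illustrative sketch (not part of the patch): for the register-shift
// intrinsics above, a negative shift operand selects a right shift, which
// is why even the unsigned variants take a signed shift amount. The "r"
// variants round right-shift results to nearest, and the "q" variants
// saturate on left-shift overflow.
static int64_t demo_vshld_s64_right_shift(void) {
  return vshld_s64(16, -2); // negative amount shifts right by 2: yields 4
}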
-// CHECK-LABEL: @test_vpaddd_s64(
-// CHECK: [[VPADDD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.uaddv.i64.v2i64(<2 x i64> %a)
-// CHECK: ret i64 [[VPADDD_S64_I]]
+// CHECK-LABEL: define dso_local i64 @test_vpaddd_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPADDD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.uaddv.i64.v2i64(<2 x i64> [[A]])
+// CHECK-NEXT: ret i64 [[VPADDD_S64_I]]
+//
int64_t test_vpaddd_s64(int64x2_t a) {
return vpaddd_s64(a);
}
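// Illustrative note (not part of the patch): vpaddd_s64 lowers to the
// unsigned across-vector add (llvm.aarch64.neon.uaddv) in the checks above
// because two's-complement addition is sign-agnostic, so one intrinsic
// serves both the signed and unsigned scalar pairwise adds.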
-// CHECK-LABEL: @test_vpadds_f32(
-// CHECK: [[LANE0_I:%.*]] = extractelement <2 x float> %a, i64 0
-// CHECK: [[LANE1_I:%.*]] = extractelement <2 x float> %a, i64 1
-// CHECK: [[VPADDD_I:%.*]] = fadd float [[LANE0_I]], [[LANE1_I]]
-// CHECK: ret float [[VPADDD_I]]
+// CHECK-LABEL: define dso_local float @test_vpadds_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[LANE0_I:%.*]] = extractelement <2 x float> [[A]], i64 0
+// CHECK-NEXT: [[LANE1_I:%.*]] = extractelement <2 x float> [[A]], i64 1
+// CHECK-NEXT: [[VPADDD_I:%.*]] = fadd float [[LANE0_I]], [[LANE1_I]]
+// CHECK-NEXT: ret float [[VPADDD_I]]
+//
float32_t test_vpadds_f32(float32x2_t a) {
return vpadds_f32(a);
}
-// CHECK-LABEL: @test_vpaddd_f64(
-// CHECK: [[LANE0_I:%.*]] = extractelement <2 x double> %a, i64 0
-// CHECK: [[LANE1_I:%.*]] = extractelement <2 x double> %a, i64 1
-// CHECK: [[VPADDD_I:%.*]] = fadd double [[LANE0_I]], [[LANE1_I]]
-// CHECK: ret double [[VPADDD_I]]
+// CHECK-LABEL: define dso_local double @test_vpaddd_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[LANE0_I:%.*]] = extractelement <2 x double> [[A]], i64 0
+// CHECK-NEXT: [[LANE1_I:%.*]] = extractelement <2 x double> [[A]], i64 1
+// CHECK-NEXT: [[VPADDD_I:%.*]] = fadd double [[LANE0_I]], [[LANE1_I]]
+// CHECK-NEXT: ret double [[VPADDD_I]]
+//
float64_t test_vpaddd_f64(float64x2_t a) {
return vpaddd_f64(a);
}
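// Illustrative sketch (not part of the patch): the floating-point pairwise
// adds above sum the two lanes of the input, matching the extractelement +
// fadd sequence in the regenerated checks.
static float demo_vpadds_f32(void) {
  float32x2_t v = {1.5f, 2.5f};
  return vpadds_f32(v); // 1.5 + 2.5 == 4.0
}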
-// CHECK-LABEL: @test_vpmaxnms_f32(
-// CHECK: [[VPMAXNMS_F32_I:%.*]] = call float @llvm.aarch64.neon.fmaxnmv.f32.v2f32(<2 x float> %a)
-// CHECK: ret float [[VPMAXNMS_F32_I]]
+// CHECK-LABEL: define dso_local float @test_vpmaxnms_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPMAXNMS_F32_I:%.*]] = call float @llvm.aarch64.neon.fmaxnmv.f32.v2f32(<2 x float> [[A]])
+// CHECK-NEXT: ret float [[VPMAXNMS_F32_I]]
+//
float32_t test_vpmaxnms_f32(float32x2_t a) {
return vpmaxnms_f32(a);
}
-// CHECK-LABEL: @test_vpmaxnmqd_f64(
-// CHECK: [[VPMAXNMQD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmaxnmv.f64.v2f64(<2 x double> %a)
-// CHECK: ret double [[VPMAXNMQD_F64_I]]
+// CHECK-LABEL: define dso_local double @test_vpmaxnmqd_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPMAXNMQD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmaxnmv.f64.v2f64(<2 x double> [[A]])
+// CHECK-NEXT: ret double [[VPMAXNMQD_F64_I]]
+//
float64_t test_vpmaxnmqd_f64(float64x2_t a) {
return vpmaxnmqd_f64(a);
}
-// CHECK-LABEL: @test_vpmaxs_f32(
-// CHECK: [[VPMAXS_F32_I:%.*]] = call float @llvm.aarch64.neon.fmaxv.f32.v2f32(<2 x float> %a)
-// CHECK: ret float [[VPMAXS_F32_I]]
+// CHECK-LABEL: define dso_local float @test_vpmaxs_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPMAXS_F32_I:%.*]] = call float @llvm.aarch64.neon.fmaxv.f32.v2f32(<2 x float> [[A]])
+// CHECK-NEXT: ret float [[VPMAXS_F32_I]]
+//
float32_t test_vpmaxs_f32(float32x2_t a) {
return vpmaxs_f32(a);
}
-// CHECK-LABEL: @test_vpmaxqd_f64(
-// CHECK: [[VPMAXQD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmaxv.f64.v2f64(<2 x double> %a)
-// CHECK: ret double [[VPMAXQD_F64_I]]
+// CHECK-LABEL: define dso_local double @test_vpmaxqd_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPMAXQD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmaxv.f64.v2f64(<2 x double> [[A]])
+// CHECK-NEXT: ret double [[VPMAXQD_F64_I]]
+//
float64_t test_vpmaxqd_f64(float64x2_t a) {
return vpmaxqd_f64(a);
}
-// CHECK-LABEL: @test_vpminnms_f32(
-// CHECK: [[VPMINNMS_F32_I:%.*]] = call float @llvm.aarch64.neon.fminnmv.f32.v2f32(<2 x float> %a)
-// CHECK: ret float [[VPMINNMS_F32_I]]
+// CHECK-LABEL: define dso_local float @test_vpminnms_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPMINNMS_F32_I:%.*]] = call float @llvm.aarch64.neon.fminnmv.f32.v2f32(<2 x float> [[A]])
+// CHECK-NEXT: ret float [[VPMINNMS_F32_I]]
+//
float32_t test_vpminnms_f32(float32x2_t a) {
return vpminnms_f32(a);
}
-// CHECK-LABEL: @test_vpminnmqd_f64(
-// CHECK: [[VPMINNMQD_F64_I:%.*]] = call double @llvm.aarch64.neon.fminnmv.f64.v2f64(<2 x double> %a)
-// CHECK: ret double [[VPMINNMQD_F64_I]]
+// CHECK-LABEL: define dso_local double @test_vpminnmqd_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPMINNMQD_F64_I:%.*]] = call double @llvm.aarch64.neon.fminnmv.f64.v2f64(<2 x double> [[A]])
+// CHECK-NEXT: ret double [[VPMINNMQD_F64_I]]
+//
float64_t test_vpminnmqd_f64(float64x2_t a) {
return vpminnmqd_f64(a);
}
-// CHECK-LABEL: @test_vpmins_f32(
-// CHECK: [[VPMINS_F32_I:%.*]] = call float @llvm.aarch64.neon.fminv.f32.v2f32(<2 x float> %a)
-// CHECK: ret float [[VPMINS_F32_I]]
+// CHECK-LABEL: define dso_local float @test_vpmins_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPMINS_F32_I:%.*]] = call float @llvm.aarch64.neon.fminv.f32.v2f32(<2 x float> [[A]])
+// CHECK-NEXT: ret float [[VPMINS_F32_I]]
+//
float32_t test_vpmins_f32(float32x2_t a) {
return vpmins_f32(a);
}
-// CHECK-LABEL: @test_vpminqd_f64(
-// CHECK: [[VPMINQD_F64_I:%.*]] = call double @llvm.aarch64.neon.fminv.f64.v2f64(<2 x double> %a)
-// CHECK: ret double [[VPMINQD_F64_I]]
+// CHECK-LABEL: define dso_local double @test_vpminqd_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPMINQD_F64_I:%.*]] = call double @llvm.aarch64.neon.fminv.f64.v2f64(<2 x double> [[A]])
+// CHECK-NEXT: ret double [[VPMINQD_F64_I]]
+//
float64_t test_vpminqd_f64(float64x2_t a) {
return vpminqd_f64(a);
}
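// Illustrative sketch (not part of the patch): the "nm" pairwise variants
// above follow IEEE maxNum/minNum semantics, so a single quiet-NaN lane is
// ignored in favour of the numeric lane, unlike vpmaxs_f32/vpmins_f32.
static float demo_vpmaxnms_f32(void) {
  float32x2_t v = {__builtin_nanf(""), 3.0f};
  return vpmaxnms_f32(v); // yields 3.0f; the quiet-NaN lane is suppressed
}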
-// CHECK-LABEL: @test_vqdmulhh_s16(
-// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 %a, i64 0
-// CHECK: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 %b, i64 0
-// CHECK: [[VQDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
-// CHECK: [[TMP2:%.*]] = extractelement <4 x i16> [[VQDMULHH_S16_I]], i64 0
-// CHECK: ret i16 [[TMP2]]
+// CHECK-LABEL: define dso_local i16 @test_vqdmulhh_s16(
+// CHECK-SAME: i16 noundef [[A:%.*]], i16 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i64 0
+// CHECK-NEXT: [[VQDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[VQDMULHH_S16_I]], i64 0
+// CHECK-NEXT: ret i16 [[TMP2]]
+//
int16_t test_vqdmulhh_s16(int16_t a, int16_t b) {
return vqdmulhh_s16(a, b);
}
-// CHECK-LABEL: @test_vqdmulhs_s32(
-// CHECK: [[VQDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqdmulh.i32(i32 %a, i32 %b)
-// CHECK: ret i32 [[VQDMULHS_S32_I]]
+// CHECK-LABEL: define dso_local i32 @test_vqdmulhs_s32(
+// CHECK-SAME: i32 noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqdmulh.i32(i32 [[A]], i32 [[B]])
+// CHECK-NEXT: ret i32 [[VQDMULHS_S32_I]]
+//
int32_t test_vqdmulhs_s32(int32_t a, int32_t b) {
return vqdmulhs_s32(a, b);
}
-// CHECK-LABEL: @test_vqrdmulhh_s16(
-// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 %a, i64 0
-// CHECK: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 %b, i64 0
-// CHECK: [[VQRDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
-// CHECK: [[TMP2:%.*]] = extractelement <4 x i16> [[VQRDMULHH_S16_I]], i64 0
-// CHECK: ret i16 [[TMP2]]
+// CHECK-LABEL: define dso_local i16 @test_vqrdmulhh_s16(
+// CHECK-SAME: i16 noundef [[A:%.*]], i16 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i64 0
+// CHECK-NEXT: [[VQRDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[VQRDMULHH_S16_I]], i64 0
+// CHECK-NEXT: ret i16 [[TMP2]]
+//
int16_t test_vqrdmulhh_s16(int16_t a, int16_t b) {
return vqrdmulhh_s16(a, b);
}
-// CHECK-LABEL: @test_vqrdmulhs_s32(
-// CHECK: [[VQRDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %a, i32 %b)
-// CHECK: ret i32 [[VQRDMULHS_S32_I]]
+// CHECK-LABEL: define dso_local i32 @test_vqrdmulhs_s32(
+// CHECK-SAME: i32 noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 [[A]], i32 [[B]])
+// CHECK-NEXT: ret i32 [[VQRDMULHS_S32_I]]
+//
int32_t test_vqrdmulhs_s32(int32_t a, int32_t b) {
return vqrdmulhs_s32(a, b);
}
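// Illustrative sketch (not part of the patch): vqdmulhh_s16 computes the
// saturating doubling multiply-high, i.e. sat((2 * a * b) >> 16); the "r"
// (rounding) variants add a bias of (1 << 15) before the shift.
static int16_t demo_vqdmulhh_s16(void) {
  return vqdmulhh_s16(16384, 16384); // (2 * 2^14 * 2^14) >> 16 == 8192
}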
-// CHECK-LABEL: @test_vmulxs_f32(
-// CHECK: [[VMULXS_F32_I:%.*]] = call float @llvm.aarch64.neon.fmulx.f32(float %a, float %b)
-// CHECK: ret float [[VMULXS_F32_I]]
+// CHECK-LABEL: define dso_local float @test_vmulxs_f32(
+// CHECK-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMULXS_F32_I:%.*]] = call float @llvm.aarch64.neon.fmulx.f32(float [[A]], float [[B]])
+// CHECK-NEXT: ret float [[VMULXS_F32_I]]
+//
float32_t test_vmulxs_f32(float32_t a, float32_t b) {
return vmulxs_f32(a, b);
}
-// CHECK-LABEL: @test_vmulxd_f64(
-// CHECK: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double %a, double %b)
-// CHECK: ret double [[VMULXD_F64_I]]
+// CHECK-LABEL: define dso_local double @test_vmulxd_f64(
+// CHECK-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[A]], double [[B]])
+// CHECK-NEXT: ret double [[VMULXD_F64_I]]
+//
float64_t test_vmulxd_f64(float64_t a, float64_t b) {
return vmulxd_f64(a, b);
}
-// CHECK-LABEL: @test_vmulx_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
-// CHECK: [[VMULX2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.fmulx.v1f64(<1 x double> %a, <1 x double> %b)
-// CHECK: ret <1 x double> [[VMULX2_I]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vmulx_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMULX2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.fmulx.v1f64(<1 x double> [[A]], <1 x double> [[B]])
+// CHECK-NEXT: ret <1 x double> [[VMULX2_I]]
+//
float64x1_t test_vmulx_f64(float64x1_t a, float64x1_t b) {
return vmulx_f64(a, b);
}
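// Illustrative sketch (not part of the patch): vmulx behaves like an
// ordinary multiply except that (+/-0) * (+/-inf) returns +/-2.0 instead of
// NaN, which is what makes it safe inside reciprocal-refinement sequences.
static float demo_vmulxs_f32(void) {
  return vmulxs_f32(0.0f, __builtin_inff()); // 2.0f, not NaN
}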
-// CHECK-LABEL: @test_vrecpss_f32(
-// CHECK: [[VRECPS_I:%.*]] = call float @llvm.aarch64.neon.frecps.f32(float %a, float %b)
-// CHECK: ret float [[VRECPS_I]]
+// CHECK-LABEL: define dso_local float @test_vrecpss_f32(
+// CHECK-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRECPS_I:%.*]] = call float @llvm.aarch64.neon.frecps.f32(float [[A]], float [[B]])
+// CHECK-NEXT: ret float [[VRECPS_I]]
+//
float32_t test_vrecpss_f32(float32_t a, float32_t b) {
return vrecpss_f32(a, b);
}
-// CHECK-LABEL: @test_vrecpsd_f64(
-// CHECK: [[VRECPS_I:%.*]] = call double @llvm.aarch64.neon.frecps.f64(double %a, double %b)
-// CHECK: ret double [[VRECPS_I]]
+// CHECK-LABEL: define dso_local double @test_vrecpsd_f64(
+// CHECK-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRECPS_I:%.*]] = call double @llvm.aarch64.neon.frecps.f64(double [[A]], double [[B]])
+// CHECK-NEXT: ret double [[VRECPS_I]]
+//
float64_t test_vrecpsd_f64(float64_t a, float64_t b) {
return vrecpsd_f64(a, b);
}
-// CHECK-LABEL: @test_vrsqrtss_f32(
-// CHECK: [[VRSQRTSS_F32_I:%.*]] = call float @llvm.aarch64.neon.frsqrts.f32(float %a, float %b)
-// CHECK: ret float [[VRSQRTSS_F32_I]]
+// CHECK-LABEL: define dso_local float @test_vrsqrtss_f32(
+// CHECK-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSQRTSS_F32_I:%.*]] = call float @llvm.aarch64.neon.frsqrts.f32(float [[A]], float [[B]])
+// CHECK-NEXT: ret float [[VRSQRTSS_F32_I]]
+//
float32_t test_vrsqrtss_f32(float32_t a, float32_t b) {
return vrsqrtss_f32(a, b);
}
-// CHECK-LABEL: @test_vrsqrtsd_f64(
-// CHECK: [[VRSQRTSD_F64_I:%.*]] = call double @llvm.aarch64.neon.frsqrts.f64(double %a, double %b)
-// CHECK: ret double [[VRSQRTSD_F64_I]]
+// CHECK-LABEL: define dso_local double @test_vrsqrtsd_f64(
+// CHECK-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSQRTSD_F64_I:%.*]] = call double @llvm.aarch64.neon.frsqrts.f64(double [[A]], double [[B]])
+// CHECK-NEXT: ret double [[VRSQRTSD_F64_I]]
+//
float64_t test_vrsqrtsd_f64(float64_t a, float64_t b) {
return vrsqrtsd_f64(a, b);
}
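// Illustrative sketch (not part of the patch): the step intrinsics above
// compute the Newton-Raphson refinement terms, vrecpss_f32(a, b) == 2 - a*b
// and vrsqrtss_f32(a, b) == (3 - a*b) / 2, which are multiplied into the
// estimate produced by vrecpes_f32/vrsqrtes_f32 (tested further below).
static float demo_refine_recip(float x) {
  float est = vrecpes_f32(x);       // coarse initial estimate of 1/x
  est = est * vrecpss_f32(x, est);  // one refinement step
  return est * vrecpss_f32(x, est); // second step for near-full precision
}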
-// CHECK-LABEL: @test_vcvts_f32_s32(
-// CHECK: [[TMP0:%.*]] = sitofp i32 %a to float
-// CHECK: ret float [[TMP0]]
+// CHECK-LABEL: define dso_local float @test_vcvts_f32_s32(
+// CHECK-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = sitofp i32 [[A]] to float
+// CHECK-NEXT: ret float [[TMP0]]
+//
float32_t test_vcvts_f32_s32(int32_t a) {
return vcvts_f32_s32(a);
}
-// CHECK-LABEL: @test_vcvtd_f64_s64(
-// CHECK: [[TMP0:%.*]] = sitofp i64 %a to double
-// CHECK: ret double [[TMP0]]
+// CHECK-LABEL: define dso_local double @test_vcvtd_f64_s64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = sitofp i64 [[A]] to double
+// CHECK-NEXT: ret double [[TMP0]]
+//
float64_t test_vcvtd_f64_s64(int64_t a) {
return vcvtd_f64_s64(a);
}
-// CHECK-LABEL: @test_vcvts_f32_u32(
-// CHECK: [[TMP0:%.*]] = uitofp i32 %a to float
-// CHECK: ret float [[TMP0]]
+// CHECK-LABEL: define dso_local float @test_vcvts_f32_u32(
+// CHECK-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = uitofp i32 [[A]] to float
+// CHECK-NEXT: ret float [[TMP0]]
+//
float32_t test_vcvts_f32_u32(uint32_t a) {
return vcvts_f32_u32(a);
}
-// CHECK-LABEL: @test_vcvtd_f64_u64(
-// CHECK: [[TMP0:%.*]] = uitofp i64 %a to double
-// CHECK: ret double [[TMP0]]
+// CHECK-LABEL: define dso_local double @test_vcvtd_f64_u64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = uitofp i64 [[A]] to double
+// CHECK-NEXT: ret double [[TMP0]]
+//
float64_t test_vcvtd_f64_u64(uint64_t a) {
return vcvtd_f64_u64(a);
}
-// CHECK-LABEL: @test_vrecpes_f32(
-// CHECK: [[VRECPES_F32_I:%.*]] = call float @llvm.aarch64.neon.frecpe.f32(float %a)
-// CHECK: ret float [[VRECPES_F32_I]]
+// CHECK-LABEL: define dso_local float @test_vrecpes_f32(
+// CHECK-SAME: float noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRECPES_F32_I:%.*]] = call float @llvm.aarch64.neon.frecpe.f32(float [[A]])
+// CHECK-NEXT: ret float [[VRECPES_F32_I]]
+//
float32_t test_vrecpes_f32(float32_t a) {
return vrecpes_f32(a);
}
-// CHECK-LABEL: @test_vrecped_f64(
-// CHECK: [[VRECPED_F64_I:%.*]] = call double @llvm.aarch64.neon.frecpe.f64(double %a)
-// CHECK: ret double [[VRECPED_F64_I]]
+// CHECK-LABEL: define dso_local double @test_vrecped_f64(
+// CHECK-SAME: double noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRECPED_F64_I:%.*]] = call double @llvm.aarch64.neon.frecpe.f64(double [[A]])
+// CHECK-NEXT: ret double [[VRECPED_F64_I]]
+//
float64_t test_vrecped_f64(float64_t a) {
return vrecped_f64(a);
}
-// CHECK-LABEL: @test_vrecpxs_f32(
-// CHECK: [[VRECPXS_F32_I:%.*]] = call float @llvm.aarch64.neon.frecpx.f32(float %a)
-// CHECK: ret float [[VRECPXS_F32_I]]
+// CHECK-LABEL: define dso_local float @test_vrecpxs_f32(
+// CHECK-SAME: float noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRECPXS_F32_I:%.*]] = call float @llvm.aarch64.neon.frecpx.f32(float [[A]])
+// CHECK-NEXT: ret float [[VRECPXS_F32_I]]
+//
float32_t test_vrecpxs_f32(float32_t a) {
return vrecpxs_f32(a);
}
-// CHECK-LABEL: @test_vrecpxd_f64(
-// CHECK: [[VRECPXD_F64_I:%.*]] = call double @llvm.aarch64.neon.frecpx.f64(double %a)
-// CHECK: ret double [[VRECPXD_F64_I]]
+// CHECK-LABEL: define dso_local double @test_vrecpxd_f64(
+// CHECK-SAME: double noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRECPXD_F64_I:%.*]] = call double @llvm.aarch64.neon.frecpx.f64(double [[A]])
+// CHECK-NEXT: ret double [[VRECPXD_F64_I]]
+//
float64_t test_vrecpxd_f64(float64_t a) {
return vrecpxd_f64(a);
}
-// CHECK-LABEL: @test_vrsqrte_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VRSQRTE_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.ursqrte.v2i32(<2 x i32> %a)
-// CHECK: ret <2 x i32> [[VRSQRTE_V1_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vrsqrte_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSQRTE_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.ursqrte.v2i32(<2 x i32> [[A]])
+// CHECK-NEXT: ret <2 x i32> [[VRSQRTE_V1_I]]
+//
uint32x2_t test_vrsqrte_u32(uint32x2_t a) {
return vrsqrte_u32(a);
}
-// CHECK-LABEL: @test_vrsqrteq_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VRSQRTEQ_V1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.ursqrte.v4i32(<4 x i32> %a)
-// CHECK: ret <4 x i32> [[VRSQRTEQ_V1_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vrsqrteq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSQRTEQ_V1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.ursqrte.v4i32(<4 x i32> [[A]])
+// CHECK-NEXT: ret <4 x i32> [[VRSQRTEQ_V1_I]]
+//
uint32x4_t test_vrsqrteq_u32(uint32x4_t a) {
return vrsqrteq_u32(a);
}
-// CHECK-LABEL: @test_vrsqrtes_f32(
-// CHECK: [[VRSQRTES_F32_I:%.*]] = call float @llvm.aarch64.neon.frsqrte.f32(float %a)
-// CHECK: ret float [[VRSQRTES_F32_I]]
+// CHECK-LABEL: define dso_local float @test_vrsqrtes_f32(
+// CHECK-SAME: float noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSQRTES_F32_I:%.*]] = call float @llvm.aarch64.neon.frsqrte.f32(float [[A]])
+// CHECK-NEXT: ret float [[VRSQRTES_F32_I]]
+//
float32_t test_vrsqrtes_f32(float32_t a) {
return vrsqrtes_f32(a);
}
-// CHECK-LABEL: @test_vrsqrted_f64(
-// CHECK: [[VRSQRTED_F64_I:%.*]] = call double @llvm.aarch64.neon.frsqrte.f64(double %a)
-// CHECK: ret double [[VRSQRTED_F64_I]]
+// CHECK-LABEL: define dso_local double @test_vrsqrted_f64(
+// CHECK-SAME: double noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSQRTED_F64_I:%.*]] = call double @llvm.aarch64.neon.frsqrte.f64(double [[A]])
+// CHECK-NEXT: ret double [[VRSQRTED_F64_I]]
+//
float64_t test_vrsqrted_f64(float64_t a) {
return vrsqrted_f64(a);
}
-// CHECK-LABEL: @test_vld1q_u8(
-// CHECK: [[TMP1:%.*]] = load <16 x i8>, ptr %a, align 1
-// CHECK: ret <16 x i8> [[TMP1]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vld1q_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[A]], align 1
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
uint8x16_t test_vld1q_u8(uint8_t const *a) {
return vld1q_u8(a);
}
-// CHECK-LABEL: @test_vld1q_u16(
-// CHECK: [[TMP2:%.*]] = load <8 x i16>, ptr %a, align 2
-// CHECK: ret <8 x i16> [[TMP2]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vld1q_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[A]], align 2
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
uint16x8_t test_vld1q_u16(uint16_t const *a) {
return vld1q_u16(a);
}
-// CHECK-LABEL: @test_vld1q_u32(
-// CHECK: [[TMP2:%.*]] = load <4 x i32>, ptr %a, align 4
-// CHECK: ret <4 x i32> [[TMP2]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vld1q_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[A]], align 4
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
uint32x4_t test_vld1q_u32(uint32_t const *a) {
return vld1q_u32(a);
}
-// CHECK-LABEL: @test_vld1q_u64(
-// CHECK: [[TMP2:%.*]] = load <2 x i64>, ptr %a, align 8
-// CHECK: ret <2 x i64> [[TMP2]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vld1q_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[A]], align 8
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
uint64x2_t test_vld1q_u64(uint64_t const *a) {
return vld1q_u64(a);
}
-// CHECK-LABEL: @test_vld1q_s8(
-// CHECK: [[TMP1:%.*]] = load <16 x i8>, ptr %a, align 1
-// CHECK: ret <16 x i8> [[TMP1]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vld1q_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[A]], align 1
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
int8x16_t test_vld1q_s8(int8_t const *a) {
return vld1q_s8(a);
}
-// CHECK-LABEL: @test_vld1q_s16(
-// CHECK: [[TMP2:%.*]] = load <8 x i16>, ptr %a, align 2
-// CHECK: ret <8 x i16> [[TMP2]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vld1q_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[A]], align 2
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
int16x8_t test_vld1q_s16(int16_t const *a) {
return vld1q_s16(a);
}
-// CHECK-LABEL: @test_vld1q_s32(
-// CHECK: [[TMP2:%.*]] = load <4 x i32>, ptr %a, align 4
-// CHECK: ret <4 x i32> [[TMP2]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vld1q_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[A]], align 4
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
int32x4_t test_vld1q_s32(int32_t const *a) {
return vld1q_s32(a);
}
-// CHECK-LABEL: @test_vld1q_s64(
-// CHECK: [[TMP2:%.*]] = load <2 x i64>, ptr %a, align 8
-// CHECK: ret <2 x i64> [[TMP2]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vld1q_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[A]], align 8
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
int64x2_t test_vld1q_s64(int64_t const *a) {
return vld1q_s64(a);
}
-// CHECK-LABEL: @test_vld1q_f16(
-// CHECK: [[TMP2:%.*]] = load <8 x half>, ptr %a, align 2
-// CHECK: ret <8 x half> [[TMP2]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vld1q_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x half>, ptr [[A]], align 2
+// CHECK-NEXT: ret <8 x half> [[TMP0]]
+//
float16x8_t test_vld1q_f16(float16_t const *a) {
return vld1q_f16(a);
}
-// CHECK-LABEL: @test_vld1q_f32(
-// CHECK: [[TMP2:%.*]] = load <4 x float>, ptr %a, align 4
-// CHECK: ret <4 x float> [[TMP2]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vld1q_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 4
+// CHECK-NEXT: ret <4 x float> [[TMP0]]
+//
float32x4_t test_vld1q_f32(float32_t const *a) {
return vld1q_f32(a);
}
-// CHECK-LABEL: @test_vld1q_f64(
-// CHECK: [[TMP2:%.*]] = load <2 x double>, ptr %a, align 8
-// CHECK: ret <2 x double> [[TMP2]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vld1q_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[A]], align 8
+// CHECK-NEXT: ret <2 x double> [[TMP0]]
+//
float64x2_t test_vld1q_f64(float64_t const *a) {
return vld1q_f64(a);
}
-// CHECK-LABEL: @test_vld1q_p8(
-// CHECK: [[TMP1:%.*]] = load <16 x i8>, ptr %a, align 1
-// CHECK: ret <16 x i8> [[TMP1]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vld1q_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[A]], align 1
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
poly8x16_t test_vld1q_p8(poly8_t const *a) {
return vld1q_p8(a);
}
-// CHECK-LABEL: @test_vld1q_p16(
-// CHECK: [[TMP2:%.*]] = load <8 x i16>, ptr %a, align 2
-// CHECK: ret <8 x i16> [[TMP2]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vld1q_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[A]], align 2
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
poly16x8_t test_vld1q_p16(poly16_t const *a) {
return vld1q_p16(a);
}
-// CHECK-LABEL: @test_vld1_u8(
-// CHECK: [[TMP1:%.*]] = load <8 x i8>, ptr %a, align 1
-// CHECK: ret <8 x i8> [[TMP1]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vld1_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[A]], align 1
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
uint8x8_t test_vld1_u8(uint8_t const *a) {
return vld1_u8(a);
}
-// CHECK-LABEL: @test_vld1_u16(
-// CHECK: [[TMP2:%.*]] = load <4 x i16>, ptr %a, align 2
-// CHECK: ret <4 x i16> [[TMP2]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vld1_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[A]], align 2
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
uint16x4_t test_vld1_u16(uint16_t const *a) {
return vld1_u16(a);
}
-// CHECK-LABEL: @test_vld1_u32(
-// CHECK: [[TMP2:%.*]] = load <2 x i32>, ptr %a, align 4
-// CHECK: ret <2 x i32> [[TMP2]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vld1_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[A]], align 4
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
uint32x2_t test_vld1_u32(uint32_t const *a) {
return vld1_u32(a);
}
-// CHECK-LABEL: @test_vld1_u64(
-// CHECK: [[TMP2:%.*]] = load <1 x i64>, ptr %a, align 8
-// CHECK: ret <1 x i64> [[TMP2]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vld1_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[A]], align 8
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
uint64x1_t test_vld1_u64(uint64_t const *a) {
return vld1_u64(a);
}
-// CHECK-LABEL: @test_vld1_s8(
-// CHECK: [[TMP1:%.*]] = load <8 x i8>, ptr %a, align 1
-// CHECK: ret <8 x i8> [[TMP1]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vld1_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[A]], align 1
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
int8x8_t test_vld1_s8(int8_t const *a) {
return vld1_s8(a);
}
-// CHECK-LABEL: @test_vld1_s16(
-// CHECK: [[TMP2:%.*]] = load <4 x i16>, ptr %a, align 2
-// CHECK: ret <4 x i16> [[TMP2]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vld1_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[A]], align 2
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
int16x4_t test_vld1_s16(int16_t const *a) {
return vld1_s16(a);
}
-// CHECK-LABEL: @test_vld1_s32(
-// CHECK: [[TMP2:%.*]] = load <2 x i32>, ptr %a, align 4
-// CHECK: ret <2 x i32> [[TMP2]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vld1_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[A]], align 4
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
int32x2_t test_vld1_s32(int32_t const *a) {
return vld1_s32(a);
}
-// CHECK-LABEL: @test_vld1_s64(
-// CHECK: [[TMP2:%.*]] = load <1 x i64>, ptr %a, align 8
-// CHECK: ret <1 x i64> [[TMP2]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vld1_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[A]], align 8
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
int64x1_t test_vld1_s64(int64_t const *a) {
return vld1_s64(a);
}
-// CHECK-LABEL: @test_vld1_f16(
-// CHECK: [[TMP2:%.*]] = load <4 x half>, ptr %a, align 2
-// CHECK: ret <4 x half> [[TMP2]]
+// CHECK-LABEL: define dso_local <4 x half> @test_vld1_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x half>, ptr [[A]], align 2
+// CHECK-NEXT: ret <4 x half> [[TMP0]]
+//
float16x4_t test_vld1_f16(float16_t const *a) {
return vld1_f16(a);
}
-// CHECK-LABEL: @test_vld1_f32(
-// CHECK: [[TMP2:%.*]] = load <2 x float>, ptr %a, align 4
-// CHECK: ret <2 x float> [[TMP2]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vld1_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[A]], align 4
+// CHECK-NEXT: ret <2 x float> [[TMP0]]
+//
float32x2_t test_vld1_f32(float32_t const *a) {
return vld1_f32(a);
}
-// CHECK-LABEL: @test_vld1_f64(
-// CHECK: [[TMP2:%.*]] = load <1 x double>, ptr %a, align 8
-// CHECK: ret <1 x double> [[TMP2]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vld1_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x double>, ptr [[A]], align 8
+// CHECK-NEXT: ret <1 x double> [[TMP0]]
+//
float64x1_t test_vld1_f64(float64_t const *a) {
return vld1_f64(a);
}
-// CHECK-LABEL: @test_vld1_p8(
-// CHECK: [[TMP1:%.*]] = load <8 x i8>, ptr %a, align 1
-// CHECK: ret <8 x i8> [[TMP1]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vld1_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[A]], align 1
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
poly8x8_t test_vld1_p8(poly8_t const *a) {
return vld1_p8(a);
}
-// CHECK-LABEL: @test_vld1_p16(
-// CHECK: [[TMP2:%.*]] = load <4 x i16>, ptr %a, align 2
-// CHECK: ret <4 x i16> [[TMP2]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vld1_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[A]], align 2
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
poly16x4_t test_vld1_p16(poly16_t const *a) {
return vld1_p16(a);
}
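// Illustrative note (not part of the patch): the typed vld1/vld1q tests
// above load with the element type's natural alignment (e.g. "align 2" for
// u16), while the *_void tests that follow go through a void pointer and so
// can only assume "align 1", as the regenerated checks make explicit.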
-// CHECK-LABEL: @test_vld1_u8_void(
-// CHECK: [[TMP1:%.*]] = load <8 x i8>, ptr %a, align 1
-// CHECK: ret <8 x i8> [[TMP1]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vld1_u8_void(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[A]], align 1
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
uint8x8_t test_vld1_u8_void(void *a) {
return vld1_u8(a);
}
-// CHECK-LABEL: @test_vld1_u16_void(
-// CHECK: [[TMP1:%.*]] = load <4 x i16>, ptr %a, align 1
-// CHECK: ret <4 x i16> [[TMP1]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vld1_u16_void(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[A]], align 1
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
uint16x4_t test_vld1_u16_void(void *a) {
return vld1_u16(a);
}
-// CHECK-LABEL: @test_vld1_u32_void(
-// CHECK: [[TMP1:%.*]] = load <2 x i32>, ptr %a, align 1
-// CHECK: ret <2 x i32> [[TMP1]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vld1_u32_void(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[A]], align 1
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
uint32x2_t test_vld1_u32_void(void *a) {
return vld1_u32(a);
}
-// CHECK-LABEL: @test_vld1_u64_void(
-// CHECK: [[TMP1:%.*]] = load <1 x i64>, ptr %a, align 1
-// CHECK: ret <1 x i64> [[TMP1]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vld1_u64_void(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[A]], align 1
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
uint64x1_t test_vld1_u64_void(void *a) {
return vld1_u64(a);
}
-// CHECK-LABEL: @test_vld1_s8_void(
-// CHECK: [[TMP1:%.*]] = load <8 x i8>, ptr %a, align 1
-// CHECK: ret <8 x i8> [[TMP1]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vld1_s8_void(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[A]], align 1
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
int8x8_t test_vld1_s8_void(void *a) {
return vld1_s8(a);
}
-// CHECK-LABEL: @test_vld1_s16_void(
-// CHECK: [[TMP1:%.*]] = load <4 x i16>, ptr %a, align 1
-// CHECK: ret <4 x i16> [[TMP1]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vld1_s16_void(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[A]], align 1
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
int16x4_t test_vld1_s16_void(void *a) {
return vld1_s16(a);
}
-// CHECK-LABEL: @test_vld1_s32_void(
-// CHECK: [[TMP1:%.*]] = load <2 x i32>, ptr %a, align 1
-// CHECK: ret <2 x i32> [[TMP1]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vld1_s32_void(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[A]], align 1
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
int32x2_t test_vld1_s32_void(void *a) {
return vld1_s32(a);
}
-// CHECK-LABEL: @test_vld1_s64_void(
-// CHECK: [[TMP1:%.*]] = load <1 x i64>, ptr %a, align 1
-// CHECK: ret <1 x i64> [[TMP1]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vld1_s64_void(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[A]], align 1
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
int64x1_t test_vld1_s64_void(void *a) {
return vld1_s64(a);
}
-// CHECK-LABEL: @test_vld1_f16_void(
-// CHECK: [[TMP1:%.*]] = load <4 x half>, ptr %a, align 1
-// CHECK: ret <4 x half> [[TMP1]]
+// CHECK-LABEL: define dso_local <4 x half> @test_vld1_f16_void(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x half>, ptr [[A]], align 1
+// CHECK-NEXT: ret <4 x half> [[TMP0]]
+//
float16x4_t test_vld1_f16_void(void *a) {
return vld1_f16(a);
}
-// CHECK-LABEL: @test_vld1_f32_void(
-// CHECK: [[TMP1:%.*]] = load <2 x float>, ptr %a, align 1
-// CHECK: ret <2 x float> [[TMP1]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vld1_f32_void(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[A]], align 1
+// CHECK-NEXT: ret <2 x float> [[TMP0]]
+//
float32x2_t test_vld1_f32_void(void *a) {
return vld1_f32(a);
}
-// CHECK-LABEL: @test_vld1_f64_void(
-// CHECK: [[TMP1:%.*]] = load <1 x double>, ptr %a, align 1
-// CHECK: ret <1 x double> [[TMP1]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vld1_f64_void(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x double>, ptr [[A]], align 1
+// CHECK-NEXT: ret <1 x double> [[TMP0]]
+//
float64x1_t test_vld1_f64_void(void *a) {
return vld1_f64(a);
}
-// CHECK-LABEL: @test_vld1_p8_void(
-// CHECK: [[TMP1:%.*]] = load <8 x i8>, ptr %a, align 1
-// CHECK: ret <8 x i8> [[TMP1]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vld1_p8_void(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[A]], align 1
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
poly8x8_t test_vld1_p8_void(void *a) {
return vld1_p8(a);
}
-// CHECK-LABEL: @test_vld1_p16_void(
-// CHECK: [[TMP1:%.*]] = load <4 x i16>, ptr %a, align 1
-// CHECK: ret <4 x i16> [[TMP1]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vld1_p16_void(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[A]], align 1
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
poly16x4_t test_vld1_p16_void(void *a) {
return vld1_p16(a);
}
-// CHECK-LABEL: @test_vld2q_u8(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x16x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x2_t, align 16
-// CHECK: [[VLD2:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2.v16i8.p0(ptr %a)
-// CHECK: store { <16 x i8>, <16 x i8> } [[VLD2]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP5:%.*]] = load %struct.uint8x16x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.uint8x16x2_t [[TMP5]]
+// CHECK-LABEL: define dso_local %struct.uint8x16x2_t @test_vld2q_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT8X16X2_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT8X16X2_T]], align 16
+// CHECK-NEXT: [[VLD2:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2.v16i8.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD2_ELT:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[VLD2]], 0
+// CHECK-NEXT: store <16 x i8> [[VLD2_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD2_ELT2:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[VLD2]], 1
+// CHECK-NEXT: store <16 x i8> [[VLD2_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <16 x i8>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <16 x i8>] poison, <16 x i8> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT3:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK4:%.*]] = load <16 x i8>, ptr [[DOTUNPACK_ELT3]], align 16
+// CHECK-NEXT: [[DOTUNPACK5:%.*]] = insertvalue [2 x <16 x i8>] [[TMP0]], <16 x i8> [[DOTUNPACK_UNPACK4]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_UINT8X16X2_T]] poison, [2 x <16 x i8>] [[DOTUNPACK5]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT8X16X2_T]] [[TMP1]]
+//
uint8x16x2_t test_vld2q_u8(uint8_t const *a) {
return vld2q_u8(a);
}
-// CHECK-LABEL: @test_vld2q_u16(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK: [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0(ptr %a)
-// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.uint16x8x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.uint16x8x2_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.uint16x8x2_t @test_vld2q_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT16X8X2_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT16X8X2_T]], align 16
+// CHECK-NEXT: [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD2_ELT:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[VLD2]], 0
+// CHECK-NEXT: store <8 x i16> [[VLD2_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD2_ELT2:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[VLD2]], 1
+// CHECK-NEXT: store <8 x i16> [[VLD2_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <8 x i16>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <8 x i16>] poison, <8 x i16> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT3:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK4:%.*]] = load <8 x i16>, ptr [[DOTUNPACK_ELT3]], align 16
+// CHECK-NEXT: [[DOTUNPACK5:%.*]] = insertvalue [2 x <8 x i16>] [[TMP0]], <8 x i16> [[DOTUNPACK_UNPACK4]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_UINT16X8X2_T]] poison, [2 x <8 x i16>] [[DOTUNPACK5]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT16X8X2_T]] [[TMP1]]
+//
uint16x8x2_t test_vld2q_u16(uint16_t const *a) {
return vld2q_u16(a);
}
-// CHECK-LABEL: @test_vld2q_u32(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK: [[VLD2:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0(ptr %a)
-// CHECK: store { <4 x i32>, <4 x i32> } [[VLD2]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.uint32x4x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.uint32x4x2_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.uint32x4x2_t @test_vld2q_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT32X4X2_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT32X4X2_T]], align 16
+// CHECK-NEXT: [[VLD2:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD2_ELT:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLD2]], 0
+// CHECK-NEXT: store <4 x i32> [[VLD2_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD2_ELT2:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLD2]], 1
+// CHECK-NEXT: store <4 x i32> [[VLD2_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <4 x i32>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <4 x i32>] poison, <4 x i32> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT3:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK4:%.*]] = load <4 x i32>, ptr [[DOTUNPACK_ELT3]], align 16
+// CHECK-NEXT: [[DOTUNPACK5:%.*]] = insertvalue [2 x <4 x i32>] [[TMP0]], <4 x i32> [[DOTUNPACK_UNPACK4]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_UINT32X4X2_T]] poison, [2 x <4 x i32>] [[DOTUNPACK5]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT32X4X2_T]] [[TMP1]]
+//
uint32x4x2_t test_vld2q_u32(uint32_t const *a) {
return vld2q_u32(a);
}
-// CHECK-LABEL: @test_vld2q_u64(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x2x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.uint64x2x2_t, align 16
-// CHECK: [[VLD2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2.v2i64.p0(ptr %a)
-// CHECK: store { <2 x i64>, <2 x i64> } [[VLD2]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.uint64x2x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.uint64x2x2_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.uint64x2x2_t @test_vld2q_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT64X2X2_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT64X2X2_T]], align 16
+// CHECK-NEXT: [[VLD2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2.v2i64.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD2_ELT:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[VLD2]], 0
+// CHECK-NEXT: store <2 x i64> [[VLD2_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD2_ELT2:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[VLD2]], 1
+// CHECK-NEXT: store <2 x i64> [[VLD2_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <2 x i64>] poison, <2 x i64> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT3:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK4:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT3]], align 16
+// CHECK-NEXT: [[DOTUNPACK5:%.*]] = insertvalue [2 x <2 x i64>] [[TMP0]], <2 x i64> [[DOTUNPACK_UNPACK4]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_UINT64X2X2_T]] poison, [2 x <2 x i64>] [[DOTUNPACK5]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT64X2X2_T]] [[TMP1]]
+//
uint64x2x2_t test_vld2q_u64(uint64_t const *a) {
return vld2q_u64(a);
}
-// CHECK-LABEL: @test_vld2q_s8(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x2_t, align 16
-// CHECK: [[VLD2:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2.v16i8.p0(ptr %a)
-// CHECK: store { <16 x i8>, <16 x i8> } [[VLD2]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP5:%.*]] = load %struct.int8x16x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.int8x16x2_t [[TMP5]]
+// CHECK-LABEL: define dso_local %struct.int8x16x2_t @test_vld2q_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT8X16X2_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT8X16X2_T]], align 16
+// CHECK-NEXT: [[VLD2:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2.v16i8.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD2_ELT:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[VLD2]], 0
+// CHECK-NEXT: store <16 x i8> [[VLD2_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD2_ELT2:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[VLD2]], 1
+// CHECK-NEXT: store <16 x i8> [[VLD2_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <16 x i8>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <16 x i8>] poison, <16 x i8> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT3:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK4:%.*]] = load <16 x i8>, ptr [[DOTUNPACK_ELT3]], align 16
+// CHECK-NEXT: [[DOTUNPACK5:%.*]] = insertvalue [2 x <16 x i8>] [[TMP0]], <16 x i8> [[DOTUNPACK_UNPACK4]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_INT8X16X2_T]] poison, [2 x <16 x i8>] [[DOTUNPACK5]], 0
+// CHECK-NEXT: ret [[STRUCT_INT8X16X2_T]] [[TMP1]]
+//
int8x16x2_t test_vld2q_s8(int8_t const *a) {
return vld2q_s8(a);
}
-// CHECK-LABEL: @test_vld2q_s16(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x2_t, align 16
-// CHECK: [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0(ptr %a)
-// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.int16x8x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.int16x8x2_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.int16x8x2_t @test_vld2q_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT16X8X2_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT16X8X2_T]], align 16
+// CHECK-NEXT: [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD2_ELT:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[VLD2]], 0
+// CHECK-NEXT: store <8 x i16> [[VLD2_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD2_ELT2:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[VLD2]], 1
+// CHECK-NEXT: store <8 x i16> [[VLD2_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <8 x i16>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <8 x i16>] poison, <8 x i16> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT3:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK4:%.*]] = load <8 x i16>, ptr [[DOTUNPACK_ELT3]], align 16
+// CHECK-NEXT: [[DOTUNPACK5:%.*]] = insertvalue [2 x <8 x i16>] [[TMP0]], <8 x i16> [[DOTUNPACK_UNPACK4]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_INT16X8X2_T]] poison, [2 x <8 x i16>] [[DOTUNPACK5]], 0
+// CHECK-NEXT: ret [[STRUCT_INT16X8X2_T]] [[TMP1]]
+//
int16x8x2_t test_vld2q_s16(int16_t const *a) {
return vld2q_s16(a);
}
-// CHECK-LABEL: @test_vld2q_s32(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x2_t, align 16
-// CHECK: [[VLD2:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0(ptr %a)
-// CHECK: store { <4 x i32>, <4 x i32> } [[VLD2]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.int32x4x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.int32x4x2_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.int32x4x2_t @test_vld2q_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT32X4X2_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT32X4X2_T]], align 16
+// CHECK-NEXT: [[VLD2:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD2_ELT:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLD2]], 0
+// CHECK-NEXT: store <4 x i32> [[VLD2_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD2_ELT2:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLD2]], 1
+// CHECK-NEXT: store <4 x i32> [[VLD2_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <4 x i32>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <4 x i32>] poison, <4 x i32> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT3:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK4:%.*]] = load <4 x i32>, ptr [[DOTUNPACK_ELT3]], align 16
+// CHECK-NEXT: [[DOTUNPACK5:%.*]] = insertvalue [2 x <4 x i32>] [[TMP0]], <4 x i32> [[DOTUNPACK_UNPACK4]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_INT32X4X2_T]] poison, [2 x <4 x i32>] [[DOTUNPACK5]], 0
+// CHECK-NEXT: ret [[STRUCT_INT32X4X2_T]] [[TMP1]]
+//
int32x4x2_t test_vld2q_s32(int32_t const *a) {
return vld2q_s32(a);
}
-// CHECK-LABEL: @test_vld2q_s64(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x2x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int64x2x2_t, align 16
-// CHECK: [[VLD2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2.v2i64.p0(ptr %a)
-// CHECK: store { <2 x i64>, <2 x i64> } [[VLD2]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.int64x2x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.int64x2x2_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.int64x2x2_t @test_vld2q_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT64X2X2_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT64X2X2_T]], align 16
+// CHECK-NEXT: [[VLD2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2.v2i64.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD2_ELT:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[VLD2]], 0
+// CHECK-NEXT: store <2 x i64> [[VLD2_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD2_ELT2:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[VLD2]], 1
+// CHECK-NEXT: store <2 x i64> [[VLD2_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <2 x i64>] poison, <2 x i64> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT3:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK4:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT3]], align 16
+// CHECK-NEXT: [[DOTUNPACK5:%.*]] = insertvalue [2 x <2 x i64>] [[TMP0]], <2 x i64> [[DOTUNPACK_UNPACK4]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_INT64X2X2_T]] poison, [2 x <2 x i64>] [[DOTUNPACK5]], 0
+// CHECK-NEXT: ret [[STRUCT_INT64X2X2_T]] [[TMP1]]
+//
int64x2x2_t test_vld2q_s64(int64_t const *a) {
return vld2q_s64(a);
}
-// CHECK-LABEL: @test_vld2q_f16(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x8x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x2_t, align 16
-// CHECK: [[VLD2:%.*]] = call { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2.v8f16.p0(ptr %a)
-// CHECK: store { <8 x half>, <8 x half> } [[VLD2]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.float16x8x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.float16x8x2_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.float16x8x2_t @test_vld2q_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT16X8X2_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT16X8X2_T]], align 16
+// CHECK-NEXT: [[VLD2:%.*]] = call { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2.v8f16.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD2_ELT:%.*]] = extractvalue { <8 x half>, <8 x half> } [[VLD2]], 0
+// CHECK-NEXT: store <8 x half> [[VLD2_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD2_ELT2:%.*]] = extractvalue { <8 x half>, <8 x half> } [[VLD2]], 1
+// CHECK-NEXT: store <8 x half> [[VLD2_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <8 x half>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <8 x half>] poison, <8 x half> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT3:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK4:%.*]] = load <8 x half>, ptr [[DOTUNPACK_ELT3]], align 16
+// CHECK-NEXT: [[DOTUNPACK5:%.*]] = insertvalue [2 x <8 x half>] [[TMP0]], <8 x half> [[DOTUNPACK_UNPACK4]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_FLOAT16X8X2_T]] poison, [2 x <8 x half>] [[DOTUNPACK5]], 0
+// CHECK-NEXT: ret [[STRUCT_FLOAT16X8X2_T]] [[TMP1]]
+//
float16x8x2_t test_vld2q_f16(float16_t const *a) {
return vld2q_f16(a);
}
-// CHECK-LABEL: @test_vld2q_f32(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x2_t, align 16
-// CHECK: [[VLD2:%.*]] = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2.v4f32.p0(ptr %a)
-// CHECK: store { <4 x float>, <4 x float> } [[VLD2]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.float32x4x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.float32x4x2_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.float32x4x2_t @test_vld2q_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT32X4X2_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT32X4X2_T]], align 16
+// CHECK-NEXT: [[VLD2:%.*]] = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2.v4f32.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD2_ELT:%.*]] = extractvalue { <4 x float>, <4 x float> } [[VLD2]], 0
+// CHECK-NEXT: store <4 x float> [[VLD2_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD2_ELT2:%.*]] = extractvalue { <4 x float>, <4 x float> } [[VLD2]], 1
+// CHECK-NEXT: store <4 x float> [[VLD2_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <4 x float>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <4 x float>] poison, <4 x float> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT3:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK4:%.*]] = load <4 x float>, ptr [[DOTUNPACK_ELT3]], align 16
+// CHECK-NEXT: [[DOTUNPACK5:%.*]] = insertvalue [2 x <4 x float>] [[TMP0]], <4 x float> [[DOTUNPACK_UNPACK4]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_FLOAT32X4X2_T]] poison, [2 x <4 x float>] [[DOTUNPACK5]], 0
+// CHECK-NEXT: ret [[STRUCT_FLOAT32X4X2_T]] [[TMP1]]
+//
float32x4x2_t test_vld2q_f32(float32_t const *a) {
return vld2q_f32(a);
}
-// CHECK-LABEL: @test_vld2q_f64(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x2x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.float64x2x2_t, align 16
-// CHECK: [[VLD2:%.*]] = call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2.v2f64.p0(ptr %a)
-// CHECK: store { <2 x double>, <2 x double> } [[VLD2]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.float64x2x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.float64x2x2_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.float64x2x2_t @test_vld2q_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT64X2X2_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT64X2X2_T]], align 16
+// CHECK-NEXT: [[VLD2:%.*]] = call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2.v2f64.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD2_ELT:%.*]] = extractvalue { <2 x double>, <2 x double> } [[VLD2]], 0
+// CHECK-NEXT: store <2 x double> [[VLD2_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD2_ELT2:%.*]] = extractvalue { <2 x double>, <2 x double> } [[VLD2]], 1
+// CHECK-NEXT: store <2 x double> [[VLD2_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x double>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <2 x double>] poison, <2 x double> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT3:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK4:%.*]] = load <2 x double>, ptr [[DOTUNPACK_ELT3]], align 16
+// CHECK-NEXT: [[DOTUNPACK5:%.*]] = insertvalue [2 x <2 x double>] [[TMP0]], <2 x double> [[DOTUNPACK_UNPACK4]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_FLOAT64X2X2_T]] poison, [2 x <2 x double>] [[DOTUNPACK5]], 0
+// CHECK-NEXT: ret [[STRUCT_FLOAT64X2X2_T]] [[TMP1]]
+//
float64x2x2_t test_vld2q_f64(float64_t const *a) {
return vld2q_f64(a);
}
-// CHECK-LABEL: @test_vld2q_p8(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x2_t, align 16
-// CHECK: [[VLD2:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2.v16i8.p0(ptr %a)
-// CHECK: store { <16 x i8>, <16 x i8> } [[VLD2]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP5:%.*]] = load %struct.poly8x16x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.poly8x16x2_t [[TMP5]]
+// CHECK-LABEL: define dso_local %struct.poly8x16x2_t @test_vld2q_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY8X16X2_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY8X16X2_T]], align 16
+// CHECK-NEXT: [[VLD2:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2.v16i8.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD2_ELT:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[VLD2]], 0
+// CHECK-NEXT: store <16 x i8> [[VLD2_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD2_ELT2:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[VLD2]], 1
+// CHECK-NEXT: store <16 x i8> [[VLD2_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <16 x i8>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <16 x i8>] poison, <16 x i8> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT3:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK4:%.*]] = load <16 x i8>, ptr [[DOTUNPACK_ELT3]], align 16
+// CHECK-NEXT: [[DOTUNPACK5:%.*]] = insertvalue [2 x <16 x i8>] [[TMP0]], <16 x i8> [[DOTUNPACK_UNPACK4]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_POLY8X16X2_T]] poison, [2 x <16 x i8>] [[DOTUNPACK5]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY8X16X2_T]] [[TMP1]]
+//
poly8x16x2_t test_vld2q_p8(poly8_t const *a) {
return vld2q_p8(a);
}
-// CHECK-LABEL: @test_vld2q_p16(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x2_t, align 16
-// CHECK: [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0(ptr %a)
-// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.poly16x8x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.poly16x8x2_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.poly16x8x2_t @test_vld2q_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY16X8X2_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY16X8X2_T]], align 16
+// CHECK-NEXT: [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD2_ELT:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[VLD2]], 0
+// CHECK-NEXT: store <8 x i16> [[VLD2_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD2_ELT2:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[VLD2]], 1
+// CHECK-NEXT: store <8 x i16> [[VLD2_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <8 x i16>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <8 x i16>] poison, <8 x i16> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT3:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK4:%.*]] = load <8 x i16>, ptr [[DOTUNPACK_ELT3]], align 16
+// CHECK-NEXT: [[DOTUNPACK5:%.*]] = insertvalue [2 x <8 x i16>] [[TMP0]], <8 x i16> [[DOTUNPACK_UNPACK4]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_POLY16X8X2_T]] poison, [2 x <8 x i16>] [[DOTUNPACK5]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY16X8X2_T]] [[TMP1]]
+//
poly16x8x2_t test_vld2q_p16(poly16_t const *a) {
return vld2q_p16(a);
}
-// CHECK-LABEL: @test_vld2_u8(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK: [[VLD2:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0(ptr %a)
-// CHECK: store { <8 x i8>, <8 x i8> } [[VLD2]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
-// CHECK: [[TMP5:%.*]] = load %struct.uint8x8x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.uint8x8x2_t [[TMP5]]
+// CHECK-LABEL: define dso_local %struct.uint8x8x2_t @test_vld2_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT8X8X2_T:%.*]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT8X8X2_T]], align 8
+// CHECK-NEXT: [[VLD2:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD2_ELT:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLD2]], 0
+// CHECK-NEXT: store <8 x i8> [[VLD2_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD2_ELT2:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLD2]], 1
+// CHECK-NEXT: store <8 x i8> [[VLD2_ELT2]], ptr [[__RET_REPACK1]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(16) [[__RET]], i64 16, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <8 x i8>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <8 x i8>] poison, <8 x i8> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT3:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK4:%.*]] = load <8 x i8>, ptr [[DOTUNPACK_ELT3]], align 8
+// CHECK-NEXT: [[DOTUNPACK5:%.*]] = insertvalue [2 x <8 x i8>] [[TMP0]], <8 x i8> [[DOTUNPACK_UNPACK4]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_UINT8X8X2_T]] poison, [2 x <8 x i8>] [[DOTUNPACK5]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT8X8X2_T]] [[TMP1]]
+//
uint8x8x2_t test_vld2_u8(uint8_t const *a) {
return vld2_u8(a);
}
-// CHECK-LABEL: @test_vld2_u16(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x2_t, align 8
-// CHECK: [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2.v4i16.p0(ptr %a)
-// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.uint16x4x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.uint16x4x2_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.uint16x4x2_t @test_vld2_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT16X4X2_T:%.*]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT16X4X2_T]], align 8
+// CHECK-NEXT: [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2.v4i16.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD2_ELT:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[VLD2]], 0
+// CHECK-NEXT: store <4 x i16> [[VLD2_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD2_ELT2:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[VLD2]], 1
+// CHECK-NEXT: store <4 x i16> [[VLD2_ELT2]], ptr [[__RET_REPACK1]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(16) [[__RET]], i64 16, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <4 x i16>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <4 x i16>] poison, <4 x i16> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT3:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK4:%.*]] = load <4 x i16>, ptr [[DOTUNPACK_ELT3]], align 8
+// CHECK-NEXT: [[DOTUNPACK5:%.*]] = insertvalue [2 x <4 x i16>] [[TMP0]], <4 x i16> [[DOTUNPACK_UNPACK4]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_UINT16X4X2_T]] poison, [2 x <4 x i16>] [[DOTUNPACK5]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT16X4X2_T]] [[TMP1]]
+//
uint16x4x2_t test_vld2_u16(uint16_t const *a) {
return vld2_u16(a);
}
-// CHECK-LABEL: @test_vld2_u32(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x2_t, align 8
-// CHECK: [[VLD2:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2.v2i32.p0(ptr %a)
-// CHECK: store { <2 x i32>, <2 x i32> } [[VLD2]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.uint32x2x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.uint32x2x2_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.uint32x2x2_t @test_vld2_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT32X2X2_T:%.*]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT32X2X2_T]], align 8
+// CHECK-NEXT: [[VLD2:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2.v2i32.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD2_ELT:%.*]] = extractvalue { <2 x i32>, <2 x i32> } [[VLD2]], 0
+// CHECK-NEXT: store <2 x i32> [[VLD2_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD2_ELT2:%.*]] = extractvalue { <2 x i32>, <2 x i32> } [[VLD2]], 1
+// CHECK-NEXT: store <2 x i32> [[VLD2_ELT2]], ptr [[__RET_REPACK1]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(16) [[__RET]], i64 16, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x i32>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <2 x i32>] poison, <2 x i32> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT3:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK4:%.*]] = load <2 x i32>, ptr [[DOTUNPACK_ELT3]], align 8
+// CHECK-NEXT: [[DOTUNPACK5:%.*]] = insertvalue [2 x <2 x i32>] [[TMP0]], <2 x i32> [[DOTUNPACK_UNPACK4]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_UINT32X2X2_T]] poison, [2 x <2 x i32>] [[DOTUNPACK5]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT32X2X2_T]] [[TMP1]]
+//
uint32x2x2_t test_vld2_u32(uint32_t const *a) {
return vld2_u32(a);
}
-// CHECK-LABEL: @test_vld2_u64(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x1x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x2_t, align 8
-// CHECK: [[VLD2:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2.v1i64.p0(ptr %a)
-// CHECK: store { <1 x i64>, <1 x i64> } [[VLD2]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.uint64x1x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.uint64x1x2_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.uint64x1x2_t @test_vld2_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT64X1X2_T:%.*]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT64X1X2_T]], align 8
+// CHECK-NEXT: [[VLD2:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2.v1i64.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD2_ELT:%.*]] = extractvalue { <1 x i64>, <1 x i64> } [[VLD2]], 0
+// CHECK-NEXT: store <1 x i64> [[VLD2_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD2_ELT2:%.*]] = extractvalue { <1 x i64>, <1 x i64> } [[VLD2]], 1
+// CHECK-NEXT: store <1 x i64> [[VLD2_ELT2]], ptr [[__RET_REPACK1]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(16) [[__RET]], i64 16, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <1 x i64>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <1 x i64>] poison, <1 x i64> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT3:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK4:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT3]], align 8
+// CHECK-NEXT: [[DOTUNPACK5:%.*]] = insertvalue [2 x <1 x i64>] [[TMP0]], <1 x i64> [[DOTUNPACK_UNPACK4]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_UINT64X1X2_T]] poison, [2 x <1 x i64>] [[DOTUNPACK5]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT64X1X2_T]] [[TMP1]]
+//
uint64x1x2_t test_vld2_u64(uint64_t const *a) {
return vld2_u64(a);
}
-// CHECK-LABEL: @test_vld2_s8(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK: [[VLD2:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0(ptr %a)
-// CHECK: store { <8 x i8>, <8 x i8> } [[VLD2]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
-// CHECK: [[TMP5:%.*]] = load %struct.int8x8x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.int8x8x2_t [[TMP5]]
+// CHECK-LABEL: define dso_local %struct.int8x8x2_t @test_vld2_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT8X8X2_T:%.*]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT8X8X2_T]], align 8
+// CHECK-NEXT: [[VLD2:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD2_ELT:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLD2]], 0
+// CHECK-NEXT: store <8 x i8> [[VLD2_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD2_ELT2:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLD2]], 1
+// CHECK-NEXT: store <8 x i8> [[VLD2_ELT2]], ptr [[__RET_REPACK1]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(16) [[__RET]], i64 16, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <8 x i8>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <8 x i8>] poison, <8 x i8> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT3:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK4:%.*]] = load <8 x i8>, ptr [[DOTUNPACK_ELT3]], align 8
+// CHECK-NEXT: [[DOTUNPACK5:%.*]] = insertvalue [2 x <8 x i8>] [[TMP0]], <8 x i8> [[DOTUNPACK_UNPACK4]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_INT8X8X2_T]] poison, [2 x <8 x i8>] [[DOTUNPACK5]], 0
+// CHECK-NEXT: ret [[STRUCT_INT8X8X2_T]] [[TMP1]]
+//
int8x8x2_t test_vld2_s8(int8_t const *a) {
return vld2_s8(a);
}
-// CHECK-LABEL: @test_vld2_s16(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x2_t, align 8
-// CHECK: [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2.v4i16.p0(ptr %a)
-// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.int16x4x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.int16x4x2_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.int16x4x2_t @test_vld2_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT16X4X2_T:%.*]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT16X4X2_T]], align 8
+// CHECK-NEXT: [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2.v4i16.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD2_ELT:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[VLD2]], 0
+// CHECK-NEXT: store <4 x i16> [[VLD2_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD2_ELT2:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[VLD2]], 1
+// CHECK-NEXT: store <4 x i16> [[VLD2_ELT2]], ptr [[__RET_REPACK1]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(16) [[__RET]], i64 16, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <4 x i16>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <4 x i16>] poison, <4 x i16> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT3:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK4:%.*]] = load <4 x i16>, ptr [[DOTUNPACK_ELT3]], align 8
+// CHECK-NEXT: [[DOTUNPACK5:%.*]] = insertvalue [2 x <4 x i16>] [[TMP0]], <4 x i16> [[DOTUNPACK_UNPACK4]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_INT16X4X2_T]] poison, [2 x <4 x i16>] [[DOTUNPACK5]], 0
+// CHECK-NEXT: ret [[STRUCT_INT16X4X2_T]] [[TMP1]]
+//
int16x4x2_t test_vld2_s16(int16_t const *a) {
return vld2_s16(a);
}
-// CHECK-LABEL: @test_vld2_s32(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x2_t, align 8
-// CHECK: [[VLD2:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2.v2i32.p0(ptr %a)
-// CHECK: store { <2 x i32>, <2 x i32> } [[VLD2]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.int32x2x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.int32x2x2_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.int32x2x2_t @test_vld2_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT32X2X2_T:%.*]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT32X2X2_T]], align 8
+// CHECK-NEXT: [[VLD2:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2.v2i32.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD2_ELT:%.*]] = extractvalue { <2 x i32>, <2 x i32> } [[VLD2]], 0
+// CHECK-NEXT: store <2 x i32> [[VLD2_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD2_ELT2:%.*]] = extractvalue { <2 x i32>, <2 x i32> } [[VLD2]], 1
+// CHECK-NEXT: store <2 x i32> [[VLD2_ELT2]], ptr [[__RET_REPACK1]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(16) [[__RET]], i64 16, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x i32>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <2 x i32>] poison, <2 x i32> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT3:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK4:%.*]] = load <2 x i32>, ptr [[DOTUNPACK_ELT3]], align 8
+// CHECK-NEXT: [[DOTUNPACK5:%.*]] = insertvalue [2 x <2 x i32>] [[TMP0]], <2 x i32> [[DOTUNPACK_UNPACK4]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_INT32X2X2_T]] poison, [2 x <2 x i32>] [[DOTUNPACK5]], 0
+// CHECK-NEXT: ret [[STRUCT_INT32X2X2_T]] [[TMP1]]
+//
int32x2x2_t test_vld2_s32(int32_t const *a) {
return vld2_s32(a);
}
-// CHECK-LABEL: @test_vld2_s64(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x1x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x2_t, align 8
-// CHECK: [[VLD2:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2.v1i64.p0(ptr %a)
-// CHECK: store { <1 x i64>, <1 x i64> } [[VLD2]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.int64x1x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.int64x1x2_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.int64x1x2_t @test_vld2_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT64X1X2_T:%.*]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT64X1X2_T]], align 8
+// CHECK-NEXT: [[VLD2:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2.v1i64.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD2_ELT:%.*]] = extractvalue { <1 x i64>, <1 x i64> } [[VLD2]], 0
+// CHECK-NEXT: store <1 x i64> [[VLD2_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD2_ELT2:%.*]] = extractvalue { <1 x i64>, <1 x i64> } [[VLD2]], 1
+// CHECK-NEXT: store <1 x i64> [[VLD2_ELT2]], ptr [[__RET_REPACK1]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(16) [[__RET]], i64 16, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <1 x i64>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <1 x i64>] poison, <1 x i64> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT3:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK4:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT3]], align 8
+// CHECK-NEXT: [[DOTUNPACK5:%.*]] = insertvalue [2 x <1 x i64>] [[TMP0]], <1 x i64> [[DOTUNPACK_UNPACK4]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_INT64X1X2_T]] poison, [2 x <1 x i64>] [[DOTUNPACK5]], 0
+// CHECK-NEXT: ret [[STRUCT_INT64X1X2_T]] [[TMP1]]
+//
int64x1x2_t test_vld2_s64(int64_t const *a) {
return vld2_s64(a);
}
-// CHECK-LABEL: @test_vld2_f16(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x4x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x2_t, align 8
-// CHECK: [[VLD2:%.*]] = call { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2.v4f16.p0(ptr %a)
-// CHECK: store { <4 x half>, <4 x half> } [[VLD2]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.float16x4x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.float16x4x2_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.float16x4x2_t @test_vld2_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT16X4X2_T:%.*]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT16X4X2_T]], align 8
+// CHECK-NEXT: [[VLD2:%.*]] = call { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2.v4f16.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD2_ELT:%.*]] = extractvalue { <4 x half>, <4 x half> } [[VLD2]], 0
+// CHECK-NEXT: store <4 x half> [[VLD2_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD2_ELT2:%.*]] = extractvalue { <4 x half>, <4 x half> } [[VLD2]], 1
+// CHECK-NEXT: store <4 x half> [[VLD2_ELT2]], ptr [[__RET_REPACK1]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(16) [[__RET]], i64 16, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <4 x half>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <4 x half>] poison, <4 x half> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT3:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK4:%.*]] = load <4 x half>, ptr [[DOTUNPACK_ELT3]], align 8
+// CHECK-NEXT: [[DOTUNPACK5:%.*]] = insertvalue [2 x <4 x half>] [[TMP0]], <4 x half> [[DOTUNPACK_UNPACK4]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_FLOAT16X4X2_T]] poison, [2 x <4 x half>] [[DOTUNPACK5]], 0
+// CHECK-NEXT: ret [[STRUCT_FLOAT16X4X2_T]] [[TMP1]]
+//
float16x4x2_t test_vld2_f16(float16_t const *a) {
return vld2_f16(a);
}
-// CHECK-LABEL: @test_vld2_f32(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x2_t, align 8
-// CHECK: [[VLD2:%.*]] = call { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld2.v2f32.p0(ptr %a)
-// CHECK: store { <2 x float>, <2 x float> } [[VLD2]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.float32x2x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.float32x2x2_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.float32x2x2_t @test_vld2_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT32X2X2_T:%.*]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT32X2X2_T]], align 8
+// CHECK-NEXT: [[VLD2:%.*]] = call { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld2.v2f32.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD2_ELT:%.*]] = extractvalue { <2 x float>, <2 x float> } [[VLD2]], 0
+// CHECK-NEXT: store <2 x float> [[VLD2_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD2_ELT2:%.*]] = extractvalue { <2 x float>, <2 x float> } [[VLD2]], 1
+// CHECK-NEXT: store <2 x float> [[VLD2_ELT2]], ptr [[__RET_REPACK1]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(16) [[__RET]], i64 16, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x float>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <2 x float>] poison, <2 x float> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT3:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK4:%.*]] = load <2 x float>, ptr [[DOTUNPACK_ELT3]], align 8
+// CHECK-NEXT: [[DOTUNPACK5:%.*]] = insertvalue [2 x <2 x float>] [[TMP0]], <2 x float> [[DOTUNPACK_UNPACK4]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_FLOAT32X2X2_T]] poison, [2 x <2 x float>] [[DOTUNPACK5]], 0
+// CHECK-NEXT: ret [[STRUCT_FLOAT32X2X2_T]] [[TMP1]]
+//
float32x2x2_t test_vld2_f32(float32_t const *a) {
return vld2_f32(a);
}
-// CHECK-LABEL: @test_vld2_f64(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x1x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.float64x1x2_t, align 8
-// CHECK: [[VLD2:%.*]] = call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld2.v1f64.p0(ptr %a)
-// CHECK: store { <1 x double>, <1 x double> } [[VLD2]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.float64x1x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.float64x1x2_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.float64x1x2_t @test_vld2_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT64X1X2_T:%.*]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT64X1X2_T]], align 8
+// CHECK-NEXT: [[VLD2:%.*]] = call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld2.v1f64.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD2_ELT:%.*]] = extractvalue { <1 x double>, <1 x double> } [[VLD2]], 0
+// CHECK-NEXT: store <1 x double> [[VLD2_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD2_ELT2:%.*]] = extractvalue { <1 x double>, <1 x double> } [[VLD2]], 1
+// CHECK-NEXT: store <1 x double> [[VLD2_ELT2]], ptr [[__RET_REPACK1]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(16) [[__RET]], i64 16, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <1 x double>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <1 x double>] poison, <1 x double> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT3:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK4:%.*]] = load <1 x double>, ptr [[DOTUNPACK_ELT3]], align 8
+// CHECK-NEXT: [[DOTUNPACK5:%.*]] = insertvalue [2 x <1 x double>] [[TMP0]], <1 x double> [[DOTUNPACK_UNPACK4]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_FLOAT64X1X2_T]] poison, [2 x <1 x double>] [[DOTUNPACK5]], 0
+// CHECK-NEXT: ret [[STRUCT_FLOAT64X1X2_T]] [[TMP1]]
+//
float64x1x2_t test_vld2_f64(float64_t const *a) {
return vld2_f64(a);
}
-// CHECK-LABEL: @test_vld2_p8(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK: [[VLD2:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0(ptr %a)
-// CHECK: store { <8 x i8>, <8 x i8> } [[VLD2]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
-// CHECK: [[TMP5:%.*]] = load %struct.poly8x8x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.poly8x8x2_t [[TMP5]]
+// CHECK-LABEL: define dso_local %struct.poly8x8x2_t @test_vld2_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY8X8X2_T:%.*]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY8X8X2_T]], align 8
+// CHECK-NEXT: [[VLD2:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD2_ELT:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLD2]], 0
+// CHECK-NEXT: store <8 x i8> [[VLD2_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD2_ELT2:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLD2]], 1
+// CHECK-NEXT: store <8 x i8> [[VLD2_ELT2]], ptr [[__RET_REPACK1]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(16) [[__RET]], i64 16, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <8 x i8>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <8 x i8>] poison, <8 x i8> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT3:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK4:%.*]] = load <8 x i8>, ptr [[DOTUNPACK_ELT3]], align 8
+// CHECK-NEXT: [[DOTUNPACK5:%.*]] = insertvalue [2 x <8 x i8>] [[TMP0]], <8 x i8> [[DOTUNPACK_UNPACK4]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_POLY8X8X2_T]] poison, [2 x <8 x i8>] [[DOTUNPACK5]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY8X8X2_T]] [[TMP1]]
+//
poly8x8x2_t test_vld2_p8(poly8_t const *a) {
return vld2_p8(a);
}
-// CHECK-LABEL: @test_vld2_p16(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x2_t, align 8
-// CHECK: [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2.v4i16.p0(ptr %a)
-// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.poly16x4x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.poly16x4x2_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.poly16x4x2_t @test_vld2_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY16X4X2_T:%.*]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY16X4X2_T]], align 8
+// CHECK-NEXT: [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2.v4i16.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD2_ELT:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[VLD2]], 0
+// CHECK-NEXT: store <4 x i16> [[VLD2_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD2_ELT2:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[VLD2]], 1
+// CHECK-NEXT: store <4 x i16> [[VLD2_ELT2]], ptr [[__RET_REPACK1]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(16) [[__RET]], i64 16, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <4 x i16>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <4 x i16>] poison, <4 x i16> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT3:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK4:%.*]] = load <4 x i16>, ptr [[DOTUNPACK_ELT3]], align 8
+// CHECK-NEXT: [[DOTUNPACK5:%.*]] = insertvalue [2 x <4 x i16>] [[TMP0]], <4 x i16> [[DOTUNPACK_UNPACK4]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_POLY16X4X2_T]] poison, [2 x <4 x i16>] [[DOTUNPACK5]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY16X4X2_T]] [[TMP1]]
+//
poly16x4x2_t test_vld2_p16(poly16_t const *a) {
return vld2_p16(a);
}
-// CHECK-LABEL: @test_vld3q_u8(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x16x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x3_t, align 16
-// CHECK: [[VLD3:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3.v16i8.p0(ptr %a)
-// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
-// CHECK: [[TMP5:%.*]] = load %struct.uint8x16x3_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.uint8x16x3_t [[TMP5]]
+// CHECK-LABEL: define dso_local %struct.uint8x16x3_t @test_vld3q_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT8X16X3_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT8X16X3_T]], align 16
+// CHECK-NEXT: [[VLD3:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3.v16i8.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD3_ELT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], 0
+// CHECK-NEXT: store <16 x i8> [[VLD3_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_ELT2:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], 1
+// CHECK-NEXT: store <16 x i8> [[VLD3_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD3_ELT4:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], 2
+// CHECK-NEXT: store <16 x i8> [[VLD3_ELT4]], ptr [[__RET_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(48) [[__RET]], i64 48, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <16 x i8>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [3 x <16 x i8>] poison, <16 x i8> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <16 x i8>, ptr [[DOTUNPACK_ELT5]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [3 x <16 x i8>] [[TMP0]], <16 x i8> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <16 x i8>, ptr [[DOTUNPACK_ELT7]], align 16
+// CHECK-NEXT: [[DOTUNPACK9:%.*]] = insertvalue [3 x <16 x i8>] [[TMP1]], <16 x i8> [[DOTUNPACK_UNPACK8]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[STRUCT_UINT8X16X3_T]] poison, [3 x <16 x i8>] [[DOTUNPACK9]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT8X16X3_T]] [[TMP2]]
+//
uint8x16x3_t test_vld3q_u8(uint8_t const *a) {
return vld3q_u8(a);
}
-// CHECK-LABEL: @test_vld3q_u16(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x3_t, align 16
-// CHECK: [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3.v8i16.p0(ptr %a)
-// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.uint16x8x3_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.uint16x8x3_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.uint16x8x3_t @test_vld3q_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT16X8X3_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT16X8X3_T]], align 16
+// CHECK-NEXT: [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3.v8i16.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD3_ELT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], 0
+// CHECK-NEXT: store <8 x i16> [[VLD3_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_ELT2:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], 1
+// CHECK-NEXT: store <8 x i16> [[VLD3_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD3_ELT4:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], 2
+// CHECK-NEXT: store <8 x i16> [[VLD3_ELT4]], ptr [[__RET_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(48) [[__RET]], i64 48, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <8 x i16>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [3 x <8 x i16>] poison, <8 x i16> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <8 x i16>, ptr [[DOTUNPACK_ELT5]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [3 x <8 x i16>] [[TMP0]], <8 x i16> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <8 x i16>, ptr [[DOTUNPACK_ELT7]], align 16
+// CHECK-NEXT: [[DOTUNPACK9:%.*]] = insertvalue [3 x <8 x i16>] [[TMP1]], <8 x i16> [[DOTUNPACK_UNPACK8]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[STRUCT_UINT16X8X3_T]] poison, [3 x <8 x i16>] [[DOTUNPACK9]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT16X8X3_T]] [[TMP2]]
+//
uint16x8x3_t test_vld3q_u16(uint16_t const *a) {
return vld3q_u16(a);
}
-// CHECK-LABEL: @test_vld3q_u32(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x3_t, align 16
-// CHECK: [[VLD3:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0(ptr %a)
-// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.uint32x4x3_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.uint32x4x3_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.uint32x4x3_t @test_vld3q_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT32X4X3_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT32X4X3_T]], align 16
+// CHECK-NEXT: [[VLD3:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD3_ELT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3]], 0
+// CHECK-NEXT: store <4 x i32> [[VLD3_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_ELT2:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3]], 1
+// CHECK-NEXT: store <4 x i32> [[VLD3_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD3_ELT4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3]], 2
+// CHECK-NEXT: store <4 x i32> [[VLD3_ELT4]], ptr [[__RET_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(48) [[__RET]], i64 48, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <4 x i32>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [3 x <4 x i32>] poison, <4 x i32> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <4 x i32>, ptr [[DOTUNPACK_ELT5]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [3 x <4 x i32>] [[TMP0]], <4 x i32> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <4 x i32>, ptr [[DOTUNPACK_ELT7]], align 16
+// CHECK-NEXT: [[DOTUNPACK9:%.*]] = insertvalue [3 x <4 x i32>] [[TMP1]], <4 x i32> [[DOTUNPACK_UNPACK8]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[STRUCT_UINT32X4X3_T]] poison, [3 x <4 x i32>] [[DOTUNPACK9]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT32X4X3_T]] [[TMP2]]
+//
uint32x4x3_t test_vld3q_u32(uint32_t const *a) {
return vld3q_u32(a);
}
-// CHECK-LABEL: @test_vld3q_u64(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x2x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.uint64x2x3_t, align 16
-// CHECK: [[VLD3:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3.v2i64.p0(ptr %a)
-// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.uint64x2x3_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.uint64x2x3_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.uint64x2x3_t @test_vld3q_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT64X2X3_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT64X2X3_T]], align 16
+// CHECK-NEXT: [[VLD3:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3.v2i64.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD3_ELT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], 0
+// CHECK-NEXT: store <2 x i64> [[VLD3_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_ELT2:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], 1
+// CHECK-NEXT: store <2 x i64> [[VLD3_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD3_ELT4:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], 2
+// CHECK-NEXT: store <2 x i64> [[VLD3_ELT4]], ptr [[__RET_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(48) [[__RET]], i64 48, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [3 x <2 x i64>] poison, <2 x i64> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT5]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [3 x <2 x i64>] [[TMP0]], <2 x i64> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT7]], align 16
+// CHECK-NEXT: [[DOTUNPACK9:%.*]] = insertvalue [3 x <2 x i64>] [[TMP1]], <2 x i64> [[DOTUNPACK_UNPACK8]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[STRUCT_UINT64X2X3_T]] poison, [3 x <2 x i64>] [[DOTUNPACK9]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT64X2X3_T]] [[TMP2]]
+//
uint64x2x3_t test_vld3q_u64(uint64_t const *a) {
return vld3q_u64(a);
}
-// CHECK-LABEL: @test_vld3q_s8(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x3_t, align 16
-// CHECK: [[VLD3:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3.v16i8.p0(ptr %a)
-// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
-// CHECK: [[TMP5:%.*]] = load %struct.int8x16x3_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.int8x16x3_t [[TMP5]]
+// CHECK-LABEL: define dso_local %struct.int8x16x3_t @test_vld3q_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT8X16X3_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT8X16X3_T]], align 16
+// CHECK-NEXT: [[VLD3:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3.v16i8.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD3_ELT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], 0
+// CHECK-NEXT: store <16 x i8> [[VLD3_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_ELT2:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], 1
+// CHECK-NEXT: store <16 x i8> [[VLD3_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD3_ELT4:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], 2
+// CHECK-NEXT: store <16 x i8> [[VLD3_ELT4]], ptr [[__RET_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(48) [[__RET]], i64 48, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <16 x i8>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [3 x <16 x i8>] poison, <16 x i8> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <16 x i8>, ptr [[DOTUNPACK_ELT5]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [3 x <16 x i8>] [[TMP0]], <16 x i8> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <16 x i8>, ptr [[DOTUNPACK_ELT7]], align 16
+// CHECK-NEXT: [[DOTUNPACK9:%.*]] = insertvalue [3 x <16 x i8>] [[TMP1]], <16 x i8> [[DOTUNPACK_UNPACK8]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[STRUCT_INT8X16X3_T]] poison, [3 x <16 x i8>] [[DOTUNPACK9]], 0
+// CHECK-NEXT: ret [[STRUCT_INT8X16X3_T]] [[TMP2]]
+//
int8x16x3_t test_vld3q_s8(int8_t const *a) {
return vld3q_s8(a);
}
-// CHECK-LABEL: @test_vld3q_s16(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x3_t, align 16
-// CHECK: [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3.v8i16.p0(ptr %a)
-// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.int16x8x3_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.int16x8x3_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.int16x8x3_t @test_vld3q_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT16X8X3_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT16X8X3_T]], align 16
+// CHECK-NEXT: [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3.v8i16.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD3_ELT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], 0
+// CHECK-NEXT: store <8 x i16> [[VLD3_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_ELT2:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], 1
+// CHECK-NEXT: store <8 x i16> [[VLD3_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD3_ELT4:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], 2
+// CHECK-NEXT: store <8 x i16> [[VLD3_ELT4]], ptr [[__RET_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(48) [[__RET]], i64 48, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <8 x i16>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [3 x <8 x i16>] poison, <8 x i16> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <8 x i16>, ptr [[DOTUNPACK_ELT5]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [3 x <8 x i16>] [[TMP0]], <8 x i16> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <8 x i16>, ptr [[DOTUNPACK_ELT7]], align 16
+// CHECK-NEXT: [[DOTUNPACK9:%.*]] = insertvalue [3 x <8 x i16>] [[TMP1]], <8 x i16> [[DOTUNPACK_UNPACK8]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[STRUCT_INT16X8X3_T]] poison, [3 x <8 x i16>] [[DOTUNPACK9]], 0
+// CHECK-NEXT: ret [[STRUCT_INT16X8X3_T]] [[TMP2]]
+//
int16x8x3_t test_vld3q_s16(int16_t const *a) {
return vld3q_s16(a);
}
-// CHECK-LABEL: @test_vld3q_s32(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x3_t, align 16
-// CHECK: [[VLD3:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0(ptr %a)
-// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.int32x4x3_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.int32x4x3_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.int32x4x3_t @test_vld3q_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT32X4X3_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT32X4X3_T]], align 16
+// CHECK-NEXT: [[VLD3:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD3_ELT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3]], 0
+// CHECK-NEXT: store <4 x i32> [[VLD3_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_ELT2:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3]], 1
+// CHECK-NEXT: store <4 x i32> [[VLD3_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD3_ELT4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3]], 2
+// CHECK-NEXT: store <4 x i32> [[VLD3_ELT4]], ptr [[__RET_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(48) [[__RET]], i64 48, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <4 x i32>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [3 x <4 x i32>] poison, <4 x i32> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <4 x i32>, ptr [[DOTUNPACK_ELT5]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [3 x <4 x i32>] [[TMP0]], <4 x i32> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <4 x i32>, ptr [[DOTUNPACK_ELT7]], align 16
+// CHECK-NEXT: [[DOTUNPACK9:%.*]] = insertvalue [3 x <4 x i32>] [[TMP1]], <4 x i32> [[DOTUNPACK_UNPACK8]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[STRUCT_INT32X4X3_T]] poison, [3 x <4 x i32>] [[DOTUNPACK9]], 0
+// CHECK-NEXT: ret [[STRUCT_INT32X4X3_T]] [[TMP2]]
+//
int32x4x3_t test_vld3q_s32(int32_t const *a) {
return vld3q_s32(a);
}
-// CHECK-LABEL: @test_vld3q_s64(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x2x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int64x2x3_t, align 16
-// CHECK: [[VLD3:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3.v2i64.p0(ptr %a)
-// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.int64x2x3_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.int64x2x3_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.int64x2x3_t @test_vld3q_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT64X2X3_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT64X2X3_T]], align 16
+// CHECK-NEXT: [[VLD3:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3.v2i64.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD3_ELT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], 0
+// CHECK-NEXT: store <2 x i64> [[VLD3_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_ELT2:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], 1
+// CHECK-NEXT: store <2 x i64> [[VLD3_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD3_ELT4:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], 2
+// CHECK-NEXT: store <2 x i64> [[VLD3_ELT4]], ptr [[__RET_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(48) [[__RET]], i64 48, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [3 x <2 x i64>] poison, <2 x i64> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT5]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [3 x <2 x i64>] [[TMP0]], <2 x i64> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT7]], align 16
+// CHECK-NEXT: [[DOTUNPACK9:%.*]] = insertvalue [3 x <2 x i64>] [[TMP1]], <2 x i64> [[DOTUNPACK_UNPACK8]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[STRUCT_INT64X2X3_T]] poison, [3 x <2 x i64>] [[DOTUNPACK9]], 0
+// CHECK-NEXT: ret [[STRUCT_INT64X2X3_T]] [[TMP2]]
+//
int64x2x3_t test_vld3q_s64(int64_t const *a) {
return vld3q_s64(a);
}
-// CHECK-LABEL: @test_vld3q_f16(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x8x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x3_t, align 16
-// CHECK: [[VLD3:%.*]] = call { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3.v8f16.p0(ptr %a)
-// CHECK: store { <8 x half>, <8 x half>, <8 x half> } [[VLD3]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.float16x8x3_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.float16x8x3_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.float16x8x3_t @test_vld3q_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT16X8X3_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT16X8X3_T]], align 16
+// CHECK-NEXT: [[VLD3:%.*]] = call { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3.v8f16.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD3_ELT:%.*]] = extractvalue { <8 x half>, <8 x half>, <8 x half> } [[VLD3]], 0
+// CHECK-NEXT: store <8 x half> [[VLD3_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_ELT2:%.*]] = extractvalue { <8 x half>, <8 x half>, <8 x half> } [[VLD3]], 1
+// CHECK-NEXT: store <8 x half> [[VLD3_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD3_ELT4:%.*]] = extractvalue { <8 x half>, <8 x half>, <8 x half> } [[VLD3]], 2
+// CHECK-NEXT: store <8 x half> [[VLD3_ELT4]], ptr [[__RET_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(48) [[__RET]], i64 48, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <8 x half>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [3 x <8 x half>] poison, <8 x half> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <8 x half>, ptr [[DOTUNPACK_ELT5]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [3 x <8 x half>] [[TMP0]], <8 x half> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <8 x half>, ptr [[DOTUNPACK_ELT7]], align 16
+// CHECK-NEXT: [[DOTUNPACK9:%.*]] = insertvalue [3 x <8 x half>] [[TMP1]], <8 x half> [[DOTUNPACK_UNPACK8]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[STRUCT_FLOAT16X8X3_T]] poison, [3 x <8 x half>] [[DOTUNPACK9]], 0
+// CHECK-NEXT: ret [[STRUCT_FLOAT16X8X3_T]] [[TMP2]]
+//
float16x8x3_t test_vld3q_f16(float16_t const *a) {
return vld3q_f16(a);
}
-// CHECK-LABEL: @test_vld3q_f32(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x3_t, align 16
-// CHECK: [[VLD3:%.*]] = call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3.v4f32.p0(ptr %a)
-// CHECK: store { <4 x float>, <4 x float>, <4 x float> } [[VLD3]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.float32x4x3_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.float32x4x3_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.float32x4x3_t @test_vld3q_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT32X4X3_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT32X4X3_T]], align 16
+// CHECK-NEXT: [[VLD3:%.*]] = call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3.v4f32.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD3_ELT:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float> } [[VLD3]], 0
+// CHECK-NEXT: store <4 x float> [[VLD3_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_ELT2:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float> } [[VLD3]], 1
+// CHECK-NEXT: store <4 x float> [[VLD3_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD3_ELT4:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float> } [[VLD3]], 2
+// CHECK-NEXT: store <4 x float> [[VLD3_ELT4]], ptr [[__RET_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(48) [[__RET]], i64 48, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <4 x float>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [3 x <4 x float>] poison, <4 x float> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <4 x float>, ptr [[DOTUNPACK_ELT5]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [3 x <4 x float>] [[TMP0]], <4 x float> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <4 x float>, ptr [[DOTUNPACK_ELT7]], align 16
+// CHECK-NEXT: [[DOTUNPACK9:%.*]] = insertvalue [3 x <4 x float>] [[TMP1]], <4 x float> [[DOTUNPACK_UNPACK8]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[STRUCT_FLOAT32X4X3_T]] poison, [3 x <4 x float>] [[DOTUNPACK9]], 0
+// CHECK-NEXT: ret [[STRUCT_FLOAT32X4X3_T]] [[TMP2]]
+//
float32x4x3_t test_vld3q_f32(float32_t const *a) {
return vld3q_f32(a);
}
-// CHECK-LABEL: @test_vld3q_f64(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x2x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.float64x2x3_t, align 16
-// CHECK: [[VLD3:%.*]] = call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld3.v2f64.p0(ptr %a)
-// CHECK: store { <2 x double>, <2 x double>, <2 x double> } [[VLD3]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.float64x2x3_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.float64x2x3_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.float64x2x3_t @test_vld3q_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT64X2X3_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT64X2X3_T]], align 16
+// CHECK-NEXT: [[VLD3:%.*]] = call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld3.v2f64.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD3_ELT:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double> } [[VLD3]], 0
+// CHECK-NEXT: store <2 x double> [[VLD3_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_ELT2:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double> } [[VLD3]], 1
+// CHECK-NEXT: store <2 x double> [[VLD3_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD3_ELT4:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double> } [[VLD3]], 2
+// CHECK-NEXT: store <2 x double> [[VLD3_ELT4]], ptr [[__RET_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(48) [[__RET]], i64 48, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x double>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [3 x <2 x double>] poison, <2 x double> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <2 x double>, ptr [[DOTUNPACK_ELT5]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [3 x <2 x double>] [[TMP0]], <2 x double> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <2 x double>, ptr [[DOTUNPACK_ELT7]], align 16
+// CHECK-NEXT: [[DOTUNPACK9:%.*]] = insertvalue [3 x <2 x double>] [[TMP1]], <2 x double> [[DOTUNPACK_UNPACK8]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[STRUCT_FLOAT64X2X3_T]] poison, [3 x <2 x double>] [[DOTUNPACK9]], 0
+// CHECK-NEXT: ret [[STRUCT_FLOAT64X2X3_T]] [[TMP2]]
+//
float64x2x3_t test_vld3q_f64(float64_t const *a) {
return vld3q_f64(a);
}
-// CHECK-LABEL: @test_vld3q_p8(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x3_t, align 16
-// CHECK: [[VLD3:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3.v16i8.p0(ptr %a)
-// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
-// CHECK: [[TMP5:%.*]] = load %struct.poly8x16x3_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.poly8x16x3_t [[TMP5]]
+// CHECK-LABEL: define dso_local %struct.poly8x16x3_t @test_vld3q_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY8X16X3_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY8X16X3_T]], align 16
+// CHECK-NEXT: [[VLD3:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3.v16i8.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD3_ELT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], 0
+// CHECK-NEXT: store <16 x i8> [[VLD3_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_ELT2:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], 1
+// CHECK-NEXT: store <16 x i8> [[VLD3_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD3_ELT4:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], 2
+// CHECK-NEXT: store <16 x i8> [[VLD3_ELT4]], ptr [[__RET_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(48) [[__RET]], i64 48, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <16 x i8>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [3 x <16 x i8>] poison, <16 x i8> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <16 x i8>, ptr [[DOTUNPACK_ELT5]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [3 x <16 x i8>] [[TMP0]], <16 x i8> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <16 x i8>, ptr [[DOTUNPACK_ELT7]], align 16
+// CHECK-NEXT: [[DOTUNPACK9:%.*]] = insertvalue [3 x <16 x i8>] [[TMP1]], <16 x i8> [[DOTUNPACK_UNPACK8]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[STRUCT_POLY8X16X3_T]] poison, [3 x <16 x i8>] [[DOTUNPACK9]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY8X16X3_T]] [[TMP2]]
+//
poly8x16x3_t test_vld3q_p8(poly8_t const *a) {
return vld3q_p8(a);
}
-// CHECK-LABEL: @test_vld3q_p16(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x3_t, align 16
-// CHECK: [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3.v8i16.p0(ptr %a)
-// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.poly16x8x3_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.poly16x8x3_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.poly16x8x3_t @test_vld3q_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY16X8X3_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY16X8X3_T]], align 16
+// CHECK-NEXT: [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3.v8i16.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD3_ELT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], 0
+// CHECK-NEXT: store <8 x i16> [[VLD3_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_ELT2:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], 1
+// CHECK-NEXT: store <8 x i16> [[VLD3_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD3_ELT4:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], 2
+// CHECK-NEXT: store <8 x i16> [[VLD3_ELT4]], ptr [[__RET_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(48) [[__RET]], i64 48, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <8 x i16>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [3 x <8 x i16>] poison, <8 x i16> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <8 x i16>, ptr [[DOTUNPACK_ELT5]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [3 x <8 x i16>] [[TMP0]], <8 x i16> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <8 x i16>, ptr [[DOTUNPACK_ELT7]], align 16
+// CHECK-NEXT: [[DOTUNPACK9:%.*]] = insertvalue [3 x <8 x i16>] [[TMP1]], <8 x i16> [[DOTUNPACK_UNPACK8]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[STRUCT_POLY16X8X3_T]] poison, [3 x <8 x i16>] [[DOTUNPACK9]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY16X8X3_T]] [[TMP2]]
+//
poly16x8x3_t test_vld3q_p16(poly16_t const *a) {
return vld3q_p16(a);
}
-// CHECK-LABEL: @test_vld3_u8(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x3_t, align 8
-// CHECK: [[VLD3:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3.v8i8.p0(ptr %a)
-// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
-// CHECK: [[TMP5:%.*]] = load %struct.uint8x8x3_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.uint8x8x3_t [[TMP5]]
+// CHECK-LABEL: define dso_local %struct.uint8x8x3_t @test_vld3_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT8X8X3_T:%.*]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT8X8X3_T]], align 8
+// CHECK-NEXT: [[VLD3:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3.v8i8.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD3_ELT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], 0
+// CHECK-NEXT: store <8 x i8> [[VLD3_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD3_ELT2:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], 1
+// CHECK-NEXT: store <8 x i8> [[VLD3_ELT2]], ptr [[__RET_REPACK1]], align 8
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_ELT4:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], 2
+// CHECK-NEXT: store <8 x i8> [[VLD3_ELT4]], ptr [[__RET_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(24) [[__RET]], i64 24, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <8 x i8>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [3 x <8 x i8>] poison, <8 x i8> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <8 x i8>, ptr [[DOTUNPACK_ELT5]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [3 x <8 x i8>] [[TMP0]], <8 x i8> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <8 x i8>, ptr [[DOTUNPACK_ELT7]], align 8
+// CHECK-NEXT: [[DOTUNPACK9:%.*]] = insertvalue [3 x <8 x i8>] [[TMP1]], <8 x i8> [[DOTUNPACK_UNPACK8]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[STRUCT_UINT8X8X3_T]] poison, [3 x <8 x i8>] [[DOTUNPACK9]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT8X8X3_T]] [[TMP2]]
+//
uint8x8x3_t test_vld3_u8(uint8_t const *a) {
return vld3_u8(a);
}
-// CHECK-LABEL: @test_vld3_u16(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x3_t, align 8
-// CHECK: [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3.v4i16.p0(ptr %a)
-// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.uint16x4x3_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.uint16x4x3_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.uint16x4x3_t @test_vld3_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT16X4X3_T:%.*]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT16X4X3_T]], align 8
+// CHECK-NEXT: [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3.v4i16.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD3_ELT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], 0
+// CHECK-NEXT: store <4 x i16> [[VLD3_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD3_ELT2:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], 1
+// CHECK-NEXT: store <4 x i16> [[VLD3_ELT2]], ptr [[__RET_REPACK1]], align 8
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_ELT4:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], 2
+// CHECK-NEXT: store <4 x i16> [[VLD3_ELT4]], ptr [[__RET_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(24) [[__RET]], i64 24, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <4 x i16>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [3 x <4 x i16>] poison, <4 x i16> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <4 x i16>, ptr [[DOTUNPACK_ELT5]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [3 x <4 x i16>] [[TMP0]], <4 x i16> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <4 x i16>, ptr [[DOTUNPACK_ELT7]], align 8
+// CHECK-NEXT: [[DOTUNPACK9:%.*]] = insertvalue [3 x <4 x i16>] [[TMP1]], <4 x i16> [[DOTUNPACK_UNPACK8]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[STRUCT_UINT16X4X3_T]] poison, [3 x <4 x i16>] [[DOTUNPACK9]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT16X4X3_T]] [[TMP2]]
+//
uint16x4x3_t test_vld3_u16(uint16_t const *a) {
return vld3_u16(a);
}
-// CHECK-LABEL: @test_vld3_u32(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x3_t, align 8
-// CHECK: [[VLD3:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3.v2i32.p0(ptr %a)
-// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.uint32x2x3_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.uint32x2x3_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.uint32x2x3_t @test_vld3_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT32X2X3_T:%.*]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT32X2X3_T]], align 8
+// CHECK-NEXT: [[VLD3:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3.v2i32.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD3_ELT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3]], 0
+// CHECK-NEXT: store <2 x i32> [[VLD3_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD3_ELT2:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3]], 1
+// CHECK-NEXT: store <2 x i32> [[VLD3_ELT2]], ptr [[__RET_REPACK1]], align 8
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_ELT4:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3]], 2
+// CHECK-NEXT: store <2 x i32> [[VLD3_ELT4]], ptr [[__RET_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(24) [[__RET]], i64 24, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x i32>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [3 x <2 x i32>] poison, <2 x i32> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <2 x i32>, ptr [[DOTUNPACK_ELT5]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [3 x <2 x i32>] [[TMP0]], <2 x i32> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <2 x i32>, ptr [[DOTUNPACK_ELT7]], align 8
+// CHECK-NEXT: [[DOTUNPACK9:%.*]] = insertvalue [3 x <2 x i32>] [[TMP1]], <2 x i32> [[DOTUNPACK_UNPACK8]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[STRUCT_UINT32X2X3_T]] poison, [3 x <2 x i32>] [[DOTUNPACK9]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT32X2X3_T]] [[TMP2]]
+//
uint32x2x3_t test_vld3_u32(uint32_t const *a) {
return vld3_u32(a);
}
-// CHECK-LABEL: @test_vld3_u64(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x1x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x3_t, align 8
-// CHECK: [[VLD3:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3.v1i64.p0(ptr %a)
-// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.uint64x1x3_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.uint64x1x3_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.uint64x1x3_t @test_vld3_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT64X1X3_T:%.*]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT64X1X3_T]], align 8
+// CHECK-NEXT: [[VLD3:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3.v1i64.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD3_ELT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], 0
+// CHECK-NEXT: store <1 x i64> [[VLD3_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD3_ELT2:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], 1
+// CHECK-NEXT: store <1 x i64> [[VLD3_ELT2]], ptr [[__RET_REPACK1]], align 8
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_ELT4:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], 2
+// CHECK-NEXT: store <1 x i64> [[VLD3_ELT4]], ptr [[__RET_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(24) [[__RET]], i64 24, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <1 x i64>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [3 x <1 x i64>] poison, <1 x i64> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT5]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [3 x <1 x i64>] [[TMP0]], <1 x i64> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT7]], align 8
+// CHECK-NEXT: [[DOTUNPACK9:%.*]] = insertvalue [3 x <1 x i64>] [[TMP1]], <1 x i64> [[DOTUNPACK_UNPACK8]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[STRUCT_UINT64X1X3_T]] poison, [3 x <1 x i64>] [[DOTUNPACK9]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT64X1X3_T]] [[TMP2]]
+//
uint64x1x3_t test_vld3_u64(uint64_t const *a) {
return vld3_u64(a);
}
-// CHECK-LABEL: @test_vld3_s8(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x3_t, align 8
-// CHECK: [[VLD3:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3.v8i8.p0(ptr %a)
-// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
-// CHECK: [[TMP5:%.*]] = load %struct.int8x8x3_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.int8x8x3_t [[TMP5]]
+// CHECK-LABEL: define dso_local %struct.int8x8x3_t @test_vld3_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT8X8X3_T:%.*]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT8X8X3_T]], align 8
+// CHECK-NEXT: [[VLD3:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3.v8i8.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD3_ELT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], 0
+// CHECK-NEXT: store <8 x i8> [[VLD3_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD3_ELT2:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], 1
+// CHECK-NEXT: store <8 x i8> [[VLD3_ELT2]], ptr [[__RET_REPACK1]], align 8
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_ELT4:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], 2
+// CHECK-NEXT: store <8 x i8> [[VLD3_ELT4]], ptr [[__RET_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(24) [[__RET]], i64 24, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <8 x i8>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [3 x <8 x i8>] poison, <8 x i8> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <8 x i8>, ptr [[DOTUNPACK_ELT5]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [3 x <8 x i8>] [[TMP0]], <8 x i8> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <8 x i8>, ptr [[DOTUNPACK_ELT7]], align 8
+// CHECK-NEXT: [[DOTUNPACK9:%.*]] = insertvalue [3 x <8 x i8>] [[TMP1]], <8 x i8> [[DOTUNPACK_UNPACK8]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[STRUCT_INT8X8X3_T]] poison, [3 x <8 x i8>] [[DOTUNPACK9]], 0
+// CHECK-NEXT: ret [[STRUCT_INT8X8X3_T]] [[TMP2]]
+//
int8x8x3_t test_vld3_s8(int8_t const *a) {
return vld3_s8(a);
}
-// CHECK-LABEL: @test_vld3_s16(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x3_t, align 8
-// CHECK: [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3.v4i16.p0(ptr %a)
-// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.int16x4x3_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.int16x4x3_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.int16x4x3_t @test_vld3_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT16X4X3_T:%.*]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT16X4X3_T]], align 8
+// CHECK-NEXT: [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3.v4i16.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD3_ELT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], 0
+// CHECK-NEXT: store <4 x i16> [[VLD3_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD3_ELT2:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], 1
+// CHECK-NEXT: store <4 x i16> [[VLD3_ELT2]], ptr [[__RET_REPACK1]], align 8
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_ELT4:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], 2
+// CHECK-NEXT: store <4 x i16> [[VLD3_ELT4]], ptr [[__RET_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(24) [[__RET]], i64 24, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <4 x i16>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [3 x <4 x i16>] poison, <4 x i16> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <4 x i16>, ptr [[DOTUNPACK_ELT5]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [3 x <4 x i16>] [[TMP0]], <4 x i16> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <4 x i16>, ptr [[DOTUNPACK_ELT7]], align 8
+// CHECK-NEXT: [[DOTUNPACK9:%.*]] = insertvalue [3 x <4 x i16>] [[TMP1]], <4 x i16> [[DOTUNPACK_UNPACK8]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[STRUCT_INT16X4X3_T]] poison, [3 x <4 x i16>] [[DOTUNPACK9]], 0
+// CHECK-NEXT: ret [[STRUCT_INT16X4X3_T]] [[TMP2]]
+//
int16x4x3_t test_vld3_s16(int16_t const *a) {
return vld3_s16(a);
}
-// CHECK-LABEL: @test_vld3_s32(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x3_t, align 8
-// CHECK: [[VLD3:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3.v2i32.p0(ptr %a)
-// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.int32x2x3_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.int32x2x3_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.int32x2x3_t @test_vld3_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT32X2X3_T:%.*]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT32X2X3_T]], align 8
+// CHECK-NEXT: [[VLD3:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3.v2i32.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD3_ELT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3]], 0
+// CHECK-NEXT: store <2 x i32> [[VLD3_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD3_ELT2:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3]], 1
+// CHECK-NEXT: store <2 x i32> [[VLD3_ELT2]], ptr [[__RET_REPACK1]], align 8
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_ELT4:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3]], 2
+// CHECK-NEXT: store <2 x i32> [[VLD3_ELT4]], ptr [[__RET_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(24) [[__RET]], i64 24, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x i32>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [3 x <2 x i32>] poison, <2 x i32> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <2 x i32>, ptr [[DOTUNPACK_ELT5]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [3 x <2 x i32>] [[TMP0]], <2 x i32> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <2 x i32>, ptr [[DOTUNPACK_ELT7]], align 8
+// CHECK-NEXT: [[DOTUNPACK9:%.*]] = insertvalue [3 x <2 x i32>] [[TMP1]], <2 x i32> [[DOTUNPACK_UNPACK8]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[STRUCT_INT32X2X3_T]] poison, [3 x <2 x i32>] [[DOTUNPACK9]], 0
+// CHECK-NEXT: ret [[STRUCT_INT32X2X3_T]] [[TMP2]]
+//
int32x2x3_t test_vld3_s32(int32_t const *a) {
return vld3_s32(a);
}
-// CHECK-LABEL: @test_vld3_s64(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x1x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x3_t, align 8
-// CHECK: [[VLD3:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3.v1i64.p0(ptr %a)
-// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.int64x1x3_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.int64x1x3_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.int64x1x3_t @test_vld3_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT64X1X3_T:%.*]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT64X1X3_T]], align 8
+// CHECK-NEXT: [[VLD3:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3.v1i64.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD3_ELT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], 0
+// CHECK-NEXT: store <1 x i64> [[VLD3_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD3_ELT2:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], 1
+// CHECK-NEXT: store <1 x i64> [[VLD3_ELT2]], ptr [[__RET_REPACK1]], align 8
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_ELT4:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], 2
+// CHECK-NEXT: store <1 x i64> [[VLD3_ELT4]], ptr [[__RET_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(24) [[__RET]], i64 24, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <1 x i64>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [3 x <1 x i64>] poison, <1 x i64> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT5]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [3 x <1 x i64>] [[TMP0]], <1 x i64> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT7]], align 8
+// CHECK-NEXT: [[DOTUNPACK9:%.*]] = insertvalue [3 x <1 x i64>] [[TMP1]], <1 x i64> [[DOTUNPACK_UNPACK8]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[STRUCT_INT64X1X3_T]] poison, [3 x <1 x i64>] [[DOTUNPACK9]], 0
+// CHECK-NEXT: ret [[STRUCT_INT64X1X3_T]] [[TMP2]]
+//
int64x1x3_t test_vld3_s64(int64_t const *a) {
return vld3_s64(a);
}
-// CHECK-LABEL: @test_vld3_f16(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x4x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x3_t, align 8
-// CHECK: [[VLD3:%.*]] = call { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3.v4f16.p0(ptr %a)
-// CHECK: store { <4 x half>, <4 x half>, <4 x half> } [[VLD3]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.float16x4x3_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.float16x4x3_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.float16x4x3_t @test_vld3_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT16X4X3_T:%.*]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT16X4X3_T]], align 8
+// CHECK-NEXT: [[VLD3:%.*]] = call { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3.v4f16.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD3_ELT:%.*]] = extractvalue { <4 x half>, <4 x half>, <4 x half> } [[VLD3]], 0
+// CHECK-NEXT: store <4 x half> [[VLD3_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD3_ELT2:%.*]] = extractvalue { <4 x half>, <4 x half>, <4 x half> } [[VLD3]], 1
+// CHECK-NEXT: store <4 x half> [[VLD3_ELT2]], ptr [[__RET_REPACK1]], align 8
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_ELT4:%.*]] = extractvalue { <4 x half>, <4 x half>, <4 x half> } [[VLD3]], 2
+// CHECK-NEXT: store <4 x half> [[VLD3_ELT4]], ptr [[__RET_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(24) [[__RET]], i64 24, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <4 x half>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [3 x <4 x half>] poison, <4 x half> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <4 x half>, ptr [[DOTUNPACK_ELT5]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [3 x <4 x half>] [[TMP0]], <4 x half> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <4 x half>, ptr [[DOTUNPACK_ELT7]], align 8
+// CHECK-NEXT: [[DOTUNPACK9:%.*]] = insertvalue [3 x <4 x half>] [[TMP1]], <4 x half> [[DOTUNPACK_UNPACK8]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[STRUCT_FLOAT16X4X3_T]] poison, [3 x <4 x half>] [[DOTUNPACK9]], 0
+// CHECK-NEXT: ret [[STRUCT_FLOAT16X4X3_T]] [[TMP2]]
+//
float16x4x3_t test_vld3_f16(float16_t const *a) {
return vld3_f16(a);
}
-// CHECK-LABEL: @test_vld3_f32(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x3_t, align 8
-// CHECK: [[VLD3:%.*]] = call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld3.v2f32.p0(ptr %a)
-// CHECK: store { <2 x float>, <2 x float>, <2 x float> } [[VLD3]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.float32x2x3_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.float32x2x3_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.float32x2x3_t @test_vld3_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT32X2X3_T:%.*]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT32X2X3_T]], align 8
+// CHECK-NEXT: [[VLD3:%.*]] = call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld3.v2f32.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD3_ELT:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float> } [[VLD3]], 0
+// CHECK-NEXT: store <2 x float> [[VLD3_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD3_ELT2:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float> } [[VLD3]], 1
+// CHECK-NEXT: store <2 x float> [[VLD3_ELT2]], ptr [[__RET_REPACK1]], align 8
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_ELT4:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float> } [[VLD3]], 2
+// CHECK-NEXT: store <2 x float> [[VLD3_ELT4]], ptr [[__RET_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(24) [[__RET]], i64 24, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x float>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [3 x <2 x float>] poison, <2 x float> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <2 x float>, ptr [[DOTUNPACK_ELT5]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [3 x <2 x float>] [[TMP0]], <2 x float> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <2 x float>, ptr [[DOTUNPACK_ELT7]], align 8
+// CHECK-NEXT: [[DOTUNPACK9:%.*]] = insertvalue [3 x <2 x float>] [[TMP1]], <2 x float> [[DOTUNPACK_UNPACK8]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[STRUCT_FLOAT32X2X3_T]] poison, [3 x <2 x float>] [[DOTUNPACK9]], 0
+// CHECK-NEXT: ret [[STRUCT_FLOAT32X2X3_T]] [[TMP2]]
+//
float32x2x3_t test_vld3_f32(float32_t const *a) {
return vld3_f32(a);
}
-// CHECK-LABEL: @test_vld3_f64(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x1x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.float64x1x3_t, align 8
-// CHECK: [[VLD3:%.*]] = call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld3.v1f64.p0(ptr %a)
-// CHECK: store { <1 x double>, <1 x double>, <1 x double> } [[VLD3]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.float64x1x3_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.float64x1x3_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.float64x1x3_t @test_vld3_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT64X1X3_T:%.*]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT64X1X3_T]], align 8
+// CHECK-NEXT: [[VLD3:%.*]] = call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld3.v1f64.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD3_ELT:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double> } [[VLD3]], 0
+// CHECK-NEXT: store <1 x double> [[VLD3_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD3_ELT2:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double> } [[VLD3]], 1
+// CHECK-NEXT: store <1 x double> [[VLD3_ELT2]], ptr [[__RET_REPACK1]], align 8
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_ELT4:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double> } [[VLD3]], 2
+// CHECK-NEXT: store <1 x double> [[VLD3_ELT4]], ptr [[__RET_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(24) [[__RET]], i64 24, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <1 x double>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [3 x <1 x double>] poison, <1 x double> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <1 x double>, ptr [[DOTUNPACK_ELT5]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [3 x <1 x double>] [[TMP0]], <1 x double> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <1 x double>, ptr [[DOTUNPACK_ELT7]], align 8
+// CHECK-NEXT: [[DOTUNPACK9:%.*]] = insertvalue [3 x <1 x double>] [[TMP1]], <1 x double> [[DOTUNPACK_UNPACK8]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[STRUCT_FLOAT64X1X3_T]] poison, [3 x <1 x double>] [[DOTUNPACK9]], 0
+// CHECK-NEXT: ret [[STRUCT_FLOAT64X1X3_T]] [[TMP2]]
+//
float64x1x3_t test_vld3_f64(float64_t const *a) {
return vld3_f64(a);
}
-// CHECK-LABEL: @test_vld3_p8(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x3_t, align 8
-// CHECK: [[VLD3:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3.v8i8.p0(ptr %a)
-// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
-// CHECK: [[TMP5:%.*]] = load %struct.poly8x8x3_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.poly8x8x3_t [[TMP5]]
+// CHECK-LABEL: define dso_local %struct.poly8x8x3_t @test_vld3_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY8X8X3_T:%.*]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY8X8X3_T]], align 8
+// CHECK-NEXT: [[VLD3:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3.v8i8.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD3_ELT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], 0
+// CHECK-NEXT: store <8 x i8> [[VLD3_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD3_ELT2:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], 1
+// CHECK-NEXT: store <8 x i8> [[VLD3_ELT2]], ptr [[__RET_REPACK1]], align 8
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_ELT4:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], 2
+// CHECK-NEXT: store <8 x i8> [[VLD3_ELT4]], ptr [[__RET_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(24) [[__RET]], i64 24, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <8 x i8>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [3 x <8 x i8>] poison, <8 x i8> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <8 x i8>, ptr [[DOTUNPACK_ELT5]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [3 x <8 x i8>] [[TMP0]], <8 x i8> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <8 x i8>, ptr [[DOTUNPACK_ELT7]], align 8
+// CHECK-NEXT: [[DOTUNPACK9:%.*]] = insertvalue [3 x <8 x i8>] [[TMP1]], <8 x i8> [[DOTUNPACK_UNPACK8]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[STRUCT_POLY8X8X3_T]] poison, [3 x <8 x i8>] [[DOTUNPACK9]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY8X8X3_T]] [[TMP2]]
+//
poly8x8x3_t test_vld3_p8(poly8_t const *a) {
return vld3_p8(a);
}
-// CHECK-LABEL: @test_vld3_p16(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x3_t, align 8
-// CHECK: [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3.v4i16.p0(ptr %a)
-// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.poly16x4x3_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.poly16x4x3_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.poly16x4x3_t @test_vld3_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY16X4X3_T:%.*]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY16X4X3_T]], align 8
+// CHECK-NEXT: [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3.v4i16.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD3_ELT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], 0
+// CHECK-NEXT: store <4 x i16> [[VLD3_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD3_ELT2:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], 1
+// CHECK-NEXT: store <4 x i16> [[VLD3_ELT2]], ptr [[__RET_REPACK1]], align 8
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_ELT4:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], 2
+// CHECK-NEXT: store <4 x i16> [[VLD3_ELT4]], ptr [[__RET_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(24) [[__RET]], i64 24, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <4 x i16>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [3 x <4 x i16>] poison, <4 x i16> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <4 x i16>, ptr [[DOTUNPACK_ELT5]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [3 x <4 x i16>] [[TMP0]], <4 x i16> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <4 x i16>, ptr [[DOTUNPACK_ELT7]], align 8
+// CHECK-NEXT: [[DOTUNPACK9:%.*]] = insertvalue [3 x <4 x i16>] [[TMP1]], <4 x i16> [[DOTUNPACK_UNPACK8]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[STRUCT_POLY16X4X3_T]] poison, [3 x <4 x i16>] [[DOTUNPACK9]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY16X4X3_T]] [[TMP2]]
+//
poly16x4x3_t test_vld3_p16(poly16_t const *a) {
return vld3_p16(a);
}
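All the vld3 variants above now share the same lowered shape: the three-element result of llvm.aarch64.neon.ld3 is taken apart with extractvalue and stored field-by-field at byte offsets 0, 8 and 16 of __ret (the *_REPACK geps), and the returned %struct.*x3_t is rebuilt per field with insertvalue starting from poison, rather than stored and reloaded as one aggregate as in the old checks. In C terms the memory layout being exercised is just three consecutive 8-byte vectors; a hypothetical mirror of the builtin type, for illustration only:

#include <arm_neon.h>

/* Hypothetical layout mirror of %struct.uint64x1x3_t: fields at
   byte offsets 0, 8 and 16; total size 24, alignment 8. */
struct u64x1x3_layout {
  uint64x1_t val[3];
};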
-// CHECK-LABEL: @test_vld4q_u8(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x16x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x4_t, align 16
-// CHECK: [[VLD4:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4.v16i8.p0(ptr %a)
-// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
-// CHECK: [[TMP5:%.*]] = load %struct.uint8x16x4_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.uint8x16x4_t [[TMP5]]
+// CHECK-LABEL: define dso_local %struct.uint8x16x4_t @test_vld4q_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT8X16X4_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT8X16X4_T]], align 16
+// CHECK-NEXT: [[VLD4:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4.v16i8.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD4_ELT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], 0
+// CHECK-NEXT: store <16 x i8> [[VLD4_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_ELT2:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], 1
+// CHECK-NEXT: store <16 x i8> [[VLD4_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD4_ELT4:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], 2
+// CHECK-NEXT: store <16 x i8> [[VLD4_ELT4]], ptr [[__RET_REPACK3]], align 16
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 48
+// CHECK-NEXT: [[VLD4_ELT6:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], 3
+// CHECK-NEXT: store <16 x i8> [[VLD4_ELT6]], ptr [[__RET_REPACK5]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(64) [[__RET]], i64 64, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <16 x i8>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [4 x <16 x i8>] poison, <16 x i8> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <16 x i8>, ptr [[DOTUNPACK_ELT7]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [4 x <16 x i8>] [[TMP0]], <16 x i8> [[DOTUNPACK_UNPACK8]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <16 x i8>, ptr [[DOTUNPACK_ELT9]], align 16
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [4 x <16 x i8>] [[TMP1]], <16 x i8> [[DOTUNPACK_UNPACK10]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 48
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <16 x i8>, ptr [[DOTUNPACK_ELT11]], align 16
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [4 x <16 x i8>] [[TMP2]], <16 x i8> [[DOTUNPACK_UNPACK12]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_UINT8X16X4_T]] poison, [4 x <16 x i8>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT8X16X4_T]] [[TMP3]]
+//
uint8x16x4_t test_vld4q_u8(uint8_t const *a) {
return vld4q_u8(a);
}
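The vld4q checks repeat the same pattern at q-register width (field offsets 0/16/32/48, 64 bytes total). Functionally, vld4q_u8 de-interleaves 64 bytes four ways, the classic per-channel RGBA split. A short sketch, assuming AArch64 and <arm_neon.h>; sum_red is an invented helper:

#include <arm_neon.h>
#include <stdint.h>

/* Split 16 interleaved RGBA pixels into channel vectors and
   horizontally sum the red channel. */
static uint16_t sum_red(const uint8_t rgba[64]) {
  uint8x16x4_t px = vld4q_u8(rgba);  /* px.val[0] = the 16 R bytes */
  return vaddlvq_u8(px.val[0]);      /* widening add across lanes (AArch64) */
}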
-// CHECK-LABEL: @test_vld4q_u16(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x4_t, align 16
-// CHECK: [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4.v8i16.p0(ptr %a)
-// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.uint16x8x4_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.uint16x8x4_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.uint16x8x4_t @test_vld4q_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT16X8X4_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT16X8X4_T]], align 16
+// CHECK-NEXT: [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4.v8i16.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD4_ELT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], 0
+// CHECK-NEXT: store <8 x i16> [[VLD4_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_ELT2:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], 1
+// CHECK-NEXT: store <8 x i16> [[VLD4_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD4_ELT4:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], 2
+// CHECK-NEXT: store <8 x i16> [[VLD4_ELT4]], ptr [[__RET_REPACK3]], align 16
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 48
+// CHECK-NEXT: [[VLD4_ELT6:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], 3
+// CHECK-NEXT: store <8 x i16> [[VLD4_ELT6]], ptr [[__RET_REPACK5]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(64) [[__RET]], i64 64, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <8 x i16>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [4 x <8 x i16>] poison, <8 x i16> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <8 x i16>, ptr [[DOTUNPACK_ELT7]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [4 x <8 x i16>] [[TMP0]], <8 x i16> [[DOTUNPACK_UNPACK8]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <8 x i16>, ptr [[DOTUNPACK_ELT9]], align 16
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [4 x <8 x i16>] [[TMP1]], <8 x i16> [[DOTUNPACK_UNPACK10]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 48
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <8 x i16>, ptr [[DOTUNPACK_ELT11]], align 16
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [4 x <8 x i16>] [[TMP2]], <8 x i16> [[DOTUNPACK_UNPACK12]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_UINT16X8X4_T]] poison, [4 x <8 x i16>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT16X8X4_T]] [[TMP3]]
+//
uint16x8x4_t test_vld4q_u16(uint16_t const *a) {
return vld4q_u16(a);
}
-// CHECK-LABEL: @test_vld4q_u32(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x4_t, align 16
-// CHECK: [[VLD4:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4.v4i32.p0(ptr %a)
-// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.uint32x4x4_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.uint32x4x4_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.uint32x4x4_t @test_vld4q_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT32X4X4_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT32X4X4_T]], align 16
+// CHECK-NEXT: [[VLD4:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4.v4i32.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD4_ELT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4]], 0
+// CHECK-NEXT: store <4 x i32> [[VLD4_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_ELT2:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4]], 1
+// CHECK-NEXT: store <4 x i32> [[VLD4_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD4_ELT4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4]], 2
+// CHECK-NEXT: store <4 x i32> [[VLD4_ELT4]], ptr [[__RET_REPACK3]], align 16
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 48
+// CHECK-NEXT: [[VLD4_ELT6:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4]], 3
+// CHECK-NEXT: store <4 x i32> [[VLD4_ELT6]], ptr [[__RET_REPACK5]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(64) [[__RET]], i64 64, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <4 x i32>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [4 x <4 x i32>] poison, <4 x i32> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <4 x i32>, ptr [[DOTUNPACK_ELT7]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [4 x <4 x i32>] [[TMP0]], <4 x i32> [[DOTUNPACK_UNPACK8]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <4 x i32>, ptr [[DOTUNPACK_ELT9]], align 16
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [4 x <4 x i32>] [[TMP1]], <4 x i32> [[DOTUNPACK_UNPACK10]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 48
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <4 x i32>, ptr [[DOTUNPACK_ELT11]], align 16
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [4 x <4 x i32>] [[TMP2]], <4 x i32> [[DOTUNPACK_UNPACK12]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_UINT32X4X4_T]] poison, [4 x <4 x i32>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT32X4X4_T]] [[TMP3]]
+//
uint32x4x4_t test_vld4q_u32(uint32_t const *a) {
return vld4q_u32(a);
}
-// CHECK-LABEL: @test_vld4q_u64(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x2x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.uint64x2x4_t, align 16
-// CHECK: [[VLD4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4.v2i64.p0(ptr %a)
-// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.uint64x2x4_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.uint64x2x4_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.uint64x2x4_t @test_vld4q_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT64X2X4_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT64X2X4_T]], align 16
+// CHECK-NEXT: [[VLD4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4.v2i64.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD4_ELT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], 0
+// CHECK-NEXT: store <2 x i64> [[VLD4_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_ELT2:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], 1
+// CHECK-NEXT: store <2 x i64> [[VLD4_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD4_ELT4:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], 2
+// CHECK-NEXT: store <2 x i64> [[VLD4_ELT4]], ptr [[__RET_REPACK3]], align 16
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 48
+// CHECK-NEXT: [[VLD4_ELT6:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], 3
+// CHECK-NEXT: store <2 x i64> [[VLD4_ELT6]], ptr [[__RET_REPACK5]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(64) [[__RET]], i64 64, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [4 x <2 x i64>] poison, <2 x i64> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT7]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [4 x <2 x i64>] [[TMP0]], <2 x i64> [[DOTUNPACK_UNPACK8]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT9]], align 16
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [4 x <2 x i64>] [[TMP1]], <2 x i64> [[DOTUNPACK_UNPACK10]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 48
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT11]], align 16
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [4 x <2 x i64>] [[TMP2]], <2 x i64> [[DOTUNPACK_UNPACK12]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_UINT64X2X4_T]] poison, [4 x <2 x i64>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT64X2X4_T]] [[TMP3]]
+//
uint64x2x4_t test_vld4q_u64(uint64_t const *a) {
return vld4q_u64(a);
}
-// CHECK-LABEL: @test_vld4q_s8(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x4_t, align 16
-// CHECK: [[VLD4:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4.v16i8.p0(ptr %a)
-// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
-// CHECK: [[TMP5:%.*]] = load %struct.int8x16x4_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.int8x16x4_t [[TMP5]]
+// CHECK-LABEL: define dso_local %struct.int8x16x4_t @test_vld4q_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT8X16X4_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT8X16X4_T]], align 16
+// CHECK-NEXT: [[VLD4:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4.v16i8.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD4_ELT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], 0
+// CHECK-NEXT: store <16 x i8> [[VLD4_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_ELT2:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], 1
+// CHECK-NEXT: store <16 x i8> [[VLD4_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD4_ELT4:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], 2
+// CHECK-NEXT: store <16 x i8> [[VLD4_ELT4]], ptr [[__RET_REPACK3]], align 16
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 48
+// CHECK-NEXT: [[VLD4_ELT6:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], 3
+// CHECK-NEXT: store <16 x i8> [[VLD4_ELT6]], ptr [[__RET_REPACK5]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(64) [[__RET]], i64 64, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <16 x i8>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [4 x <16 x i8>] poison, <16 x i8> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <16 x i8>, ptr [[DOTUNPACK_ELT7]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [4 x <16 x i8>] [[TMP0]], <16 x i8> [[DOTUNPACK_UNPACK8]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <16 x i8>, ptr [[DOTUNPACK_ELT9]], align 16
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [4 x <16 x i8>] [[TMP1]], <16 x i8> [[DOTUNPACK_UNPACK10]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 48
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <16 x i8>, ptr [[DOTUNPACK_ELT11]], align 16
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [4 x <16 x i8>] [[TMP2]], <16 x i8> [[DOTUNPACK_UNPACK12]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_INT8X16X4_T]] poison, [4 x <16 x i8>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_INT8X16X4_T]] [[TMP3]]
+//
int8x16x4_t test_vld4q_s8(int8_t const *a) {
return vld4q_s8(a);
}
-// CHECK-LABEL: @test_vld4q_s16(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x4_t, align 16
-// CHECK: [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4.v8i16.p0(ptr %a)
-// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.int16x8x4_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.int16x8x4_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.int16x8x4_t @test_vld4q_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT16X8X4_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT16X8X4_T]], align 16
+// CHECK-NEXT: [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4.v8i16.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD4_ELT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], 0
+// CHECK-NEXT: store <8 x i16> [[VLD4_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_ELT2:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], 1
+// CHECK-NEXT: store <8 x i16> [[VLD4_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD4_ELT4:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], 2
+// CHECK-NEXT: store <8 x i16> [[VLD4_ELT4]], ptr [[__RET_REPACK3]], align 16
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 48
+// CHECK-NEXT: [[VLD4_ELT6:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], 3
+// CHECK-NEXT: store <8 x i16> [[VLD4_ELT6]], ptr [[__RET_REPACK5]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(64) [[__RET]], i64 64, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <8 x i16>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [4 x <8 x i16>] poison, <8 x i16> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <8 x i16>, ptr [[DOTUNPACK_ELT7]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [4 x <8 x i16>] [[TMP0]], <8 x i16> [[DOTUNPACK_UNPACK8]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <8 x i16>, ptr [[DOTUNPACK_ELT9]], align 16
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [4 x <8 x i16>] [[TMP1]], <8 x i16> [[DOTUNPACK_UNPACK10]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 48
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <8 x i16>, ptr [[DOTUNPACK_ELT11]], align 16
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [4 x <8 x i16>] [[TMP2]], <8 x i16> [[DOTUNPACK_UNPACK12]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_INT16X8X4_T]] poison, [4 x <8 x i16>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_INT16X8X4_T]] [[TMP3]]
+//
int16x8x4_t test_vld4q_s16(int16_t const *a) {
return vld4q_s16(a);
}
-// CHECK-LABEL: @test_vld4q_s32(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x4_t, align 16
-// CHECK: [[VLD4:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4.v4i32.p0(ptr %a)
-// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.int32x4x4_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.int32x4x4_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.int32x4x4_t @test_vld4q_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT32X4X4_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT32X4X4_T]], align 16
+// CHECK-NEXT: [[VLD4:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4.v4i32.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD4_ELT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4]], 0
+// CHECK-NEXT: store <4 x i32> [[VLD4_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_ELT2:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4]], 1
+// CHECK-NEXT: store <4 x i32> [[VLD4_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD4_ELT4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4]], 2
+// CHECK-NEXT: store <4 x i32> [[VLD4_ELT4]], ptr [[__RET_REPACK3]], align 16
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 48
+// CHECK-NEXT: [[VLD4_ELT6:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4]], 3
+// CHECK-NEXT: store <4 x i32> [[VLD4_ELT6]], ptr [[__RET_REPACK5]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(64) [[__RET]], i64 64, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <4 x i32>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [4 x <4 x i32>] poison, <4 x i32> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <4 x i32>, ptr [[DOTUNPACK_ELT7]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [4 x <4 x i32>] [[TMP0]], <4 x i32> [[DOTUNPACK_UNPACK8]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <4 x i32>, ptr [[DOTUNPACK_ELT9]], align 16
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [4 x <4 x i32>] [[TMP1]], <4 x i32> [[DOTUNPACK_UNPACK10]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 48
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <4 x i32>, ptr [[DOTUNPACK_ELT11]], align 16
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [4 x <4 x i32>] [[TMP2]], <4 x i32> [[DOTUNPACK_UNPACK12]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_INT32X4X4_T]] poison, [4 x <4 x i32>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_INT32X4X4_T]] [[TMP3]]
+//
int32x4x4_t test_vld4q_s32(int32_t const *a) {
return vld4q_s32(a);
}
-// CHECK-LABEL: @test_vld4q_s64(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x2x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int64x2x4_t, align 16
-// CHECK: [[VLD4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4.v2i64.p0(ptr %a)
-// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.int64x2x4_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.int64x2x4_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.int64x2x4_t @test_vld4q_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT64X2X4_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT64X2X4_T]], align 16
+// CHECK-NEXT: [[VLD4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4.v2i64.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD4_ELT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], 0
+// CHECK-NEXT: store <2 x i64> [[VLD4_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_ELT2:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], 1
+// CHECK-NEXT: store <2 x i64> [[VLD4_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD4_ELT4:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], 2
+// CHECK-NEXT: store <2 x i64> [[VLD4_ELT4]], ptr [[__RET_REPACK3]], align 16
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 48
+// CHECK-NEXT: [[VLD4_ELT6:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], 3
+// CHECK-NEXT: store <2 x i64> [[VLD4_ELT6]], ptr [[__RET_REPACK5]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(64) [[__RET]], i64 64, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [4 x <2 x i64>] poison, <2 x i64> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT7]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [4 x <2 x i64>] [[TMP0]], <2 x i64> [[DOTUNPACK_UNPACK8]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT9]], align 16
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [4 x <2 x i64>] [[TMP1]], <2 x i64> [[DOTUNPACK_UNPACK10]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 48
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT11]], align 16
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [4 x <2 x i64>] [[TMP2]], <2 x i64> [[DOTUNPACK_UNPACK12]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_INT64X2X4_T]] poison, [4 x <2 x i64>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_INT64X2X4_T]] [[TMP3]]
+//
int64x2x4_t test_vld4q_s64(int64_t const *a) {
return vld4q_s64(a);
}
-// CHECK-LABEL: @test_vld4q_f16(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x8x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x4_t, align 16
-// CHECK: [[VLD4:%.*]] = call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4.v8f16.p0(ptr %a)
-// CHECK: store { <8 x half>, <8 x half>, <8 x half>, <8 x half> } [[VLD4]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.float16x8x4_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.float16x8x4_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.float16x8x4_t @test_vld4q_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT16X8X4_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT16X8X4_T]], align 16
+// CHECK-NEXT: [[VLD4:%.*]] = call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4.v8f16.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD4_ELT:%.*]] = extractvalue { <8 x half>, <8 x half>, <8 x half>, <8 x half> } [[VLD4]], 0
+// CHECK-NEXT: store <8 x half> [[VLD4_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_ELT2:%.*]] = extractvalue { <8 x half>, <8 x half>, <8 x half>, <8 x half> } [[VLD4]], 1
+// CHECK-NEXT: store <8 x half> [[VLD4_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD4_ELT4:%.*]] = extractvalue { <8 x half>, <8 x half>, <8 x half>, <8 x half> } [[VLD4]], 2
+// CHECK-NEXT: store <8 x half> [[VLD4_ELT4]], ptr [[__RET_REPACK3]], align 16
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 48
+// CHECK-NEXT: [[VLD4_ELT6:%.*]] = extractvalue { <8 x half>, <8 x half>, <8 x half>, <8 x half> } [[VLD4]], 3
+// CHECK-NEXT: store <8 x half> [[VLD4_ELT6]], ptr [[__RET_REPACK5]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(64) [[__RET]], i64 64, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <8 x half>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [4 x <8 x half>] poison, <8 x half> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <8 x half>, ptr [[DOTUNPACK_ELT7]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [4 x <8 x half>] [[TMP0]], <8 x half> [[DOTUNPACK_UNPACK8]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <8 x half>, ptr [[DOTUNPACK_ELT9]], align 16
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [4 x <8 x half>] [[TMP1]], <8 x half> [[DOTUNPACK_UNPACK10]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 48
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <8 x half>, ptr [[DOTUNPACK_ELT11]], align 16
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [4 x <8 x half>] [[TMP2]], <8 x half> [[DOTUNPACK_UNPACK12]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_FLOAT16X8X4_T]] poison, [4 x <8 x half>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_FLOAT16X8X4_T]] [[TMP3]]
+//
float16x8x4_t test_vld4q_f16(float16_t const *a) {
return vld4q_f16(a);
}
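
The regenerated assertions make the IR-level repacking explicit: instead of one aggregate store of the { <8 x half>, ... } result, the checks now match four extractvalue/store pairs at byte offsets 0, 16, 32 and 48, and the returned struct is rebuilt field by field with insertvalue. A rough C analogue of those repacked stores, assuming the struct is four contiguous 16-byte fields (the sketch type and helper name are illustrative):

    #include <arm_neon.h>

    typedef struct { float16x8_t val[4]; } sketch_f16x8x4;  // mirrors float16x8x4_t

    void repack_stores(sketch_f16x8x4 *dst, float16x8x4_t src) {
      dst->val[0] = src.val[0];  // field store at offset 0
      dst->val[1] = src.val[1];  // field store at offset 16
      dst->val[2] = src.val[2];  // field store at offset 32
      dst->val[3] = src.val[3];  // field store at offset 48
    }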
-// CHECK-LABEL: @test_vld4q_f32(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x4_t, align 16
-// CHECK: [[VLD4:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld4.v4f32.p0(ptr %a)
-// CHECK: store { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.float32x4x4_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.float32x4x4_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.float32x4x4_t @test_vld4q_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT32X4X4_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT32X4X4_T]], align 16
+// CHECK-NEXT: [[VLD4:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld4.v4f32.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD4_ELT:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4]], 0
+// CHECK-NEXT: store <4 x float> [[VLD4_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_ELT2:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4]], 1
+// CHECK-NEXT: store <4 x float> [[VLD4_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD4_ELT4:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4]], 2
+// CHECK-NEXT: store <4 x float> [[VLD4_ELT4]], ptr [[__RET_REPACK3]], align 16
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 48
+// CHECK-NEXT: [[VLD4_ELT6:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4]], 3
+// CHECK-NEXT: store <4 x float> [[VLD4_ELT6]], ptr [[__RET_REPACK5]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(64) [[__RET]], i64 64, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <4 x float>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [4 x <4 x float>] poison, <4 x float> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <4 x float>, ptr [[DOTUNPACK_ELT7]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [4 x <4 x float>] [[TMP0]], <4 x float> [[DOTUNPACK_UNPACK8]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <4 x float>, ptr [[DOTUNPACK_ELT9]], align 16
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [4 x <4 x float>] [[TMP1]], <4 x float> [[DOTUNPACK_UNPACK10]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 48
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <4 x float>, ptr [[DOTUNPACK_ELT11]], align 16
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [4 x <4 x float>] [[TMP2]], <4 x float> [[DOTUNPACK_UNPACK12]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_FLOAT32X4X4_T]] poison, [4 x <4 x float>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_FLOAT32X4X4_T]] [[TMP3]]
+//
float32x4x4_t test_vld4q_f32(float32_t const *a) {
return vld4q_f32(a);
}
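
The f32 variant follows the same shape with <4 x float> fields. As a usage note, de-interleaved loads make per-channel arithmetic straightforward; a hypothetical helper (not from this test file):

    #include <arm_neon.h>

    // Load 16 interleaved floats as 4 channels and scale channel 0 by s.
    float32x4_t scale_first_channel(const float *p, float s) {
      float32x4x4_t v = vld4q_f32(p);
      return vmulq_n_f32(v.val[0], s);
    }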
-// CHECK-LABEL: @test_vld4q_f64(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x2x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.float64x2x4_t, align 16
-// CHECK: [[VLD4:%.*]] = call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld4.v2f64.p0(ptr %a)
-// CHECK: store { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD4]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.float64x2x4_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.float64x2x4_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.float64x2x4_t @test_vld4q_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT64X2X4_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT64X2X4_T]], align 16
+// CHECK-NEXT: [[VLD4:%.*]] = call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld4.v2f64.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD4_ELT:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD4]], 0
+// CHECK-NEXT: store <2 x double> [[VLD4_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_ELT2:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD4]], 1
+// CHECK-NEXT: store <2 x double> [[VLD4_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD4_ELT4:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD4]], 2
+// CHECK-NEXT: store <2 x double> [[VLD4_ELT4]], ptr [[__RET_REPACK3]], align 16
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 48
+// CHECK-NEXT: [[VLD4_ELT6:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD4]], 3
+// CHECK-NEXT: store <2 x double> [[VLD4_ELT6]], ptr [[__RET_REPACK5]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(64) [[__RET]], i64 64, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x double>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [4 x <2 x double>] poison, <2 x double> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <2 x double>, ptr [[DOTUNPACK_ELT7]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [4 x <2 x double>] [[TMP0]], <2 x double> [[DOTUNPACK_UNPACK8]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <2 x double>, ptr [[DOTUNPACK_ELT9]], align 16
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [4 x <2 x double>] [[TMP1]], <2 x double> [[DOTUNPACK_UNPACK10]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 48
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <2 x double>, ptr [[DOTUNPACK_ELT11]], align 16
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [4 x <2 x double>] [[TMP2]], <2 x double> [[DOTUNPACK_UNPACK12]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_FLOAT64X2X4_T]] poison, [4 x <2 x double>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_FLOAT64X2X4_T]] [[TMP3]]
+//
float64x2x4_t test_vld4q_f64(float64_t const *a) {
return vld4q_f64(a);
}
-// CHECK-LABEL: @test_vld4q_p8(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x4_t, align 16
-// CHECK: [[VLD4:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4.v16i8.p0(ptr %a)
-// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
-// CHECK: [[TMP5:%.*]] = load %struct.poly8x16x4_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.poly8x16x4_t [[TMP5]]
+// CHECK-LABEL: define dso_local %struct.poly8x16x4_t @test_vld4q_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY8X16X4_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY8X16X4_T]], align 16
+// CHECK-NEXT: [[VLD4:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4.v16i8.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD4_ELT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], 0
+// CHECK-NEXT: store <16 x i8> [[VLD4_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_ELT2:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], 1
+// CHECK-NEXT: store <16 x i8> [[VLD4_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD4_ELT4:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], 2
+// CHECK-NEXT: store <16 x i8> [[VLD4_ELT4]], ptr [[__RET_REPACK3]], align 16
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 48
+// CHECK-NEXT: [[VLD4_ELT6:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], 3
+// CHECK-NEXT: store <16 x i8> [[VLD4_ELT6]], ptr [[__RET_REPACK5]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(64) [[__RET]], i64 64, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <16 x i8>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [4 x <16 x i8>] poison, <16 x i8> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <16 x i8>, ptr [[DOTUNPACK_ELT7]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [4 x <16 x i8>] [[TMP0]], <16 x i8> [[DOTUNPACK_UNPACK8]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <16 x i8>, ptr [[DOTUNPACK_ELT9]], align 16
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [4 x <16 x i8>] [[TMP1]], <16 x i8> [[DOTUNPACK_UNPACK10]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 48
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <16 x i8>, ptr [[DOTUNPACK_ELT11]], align 16
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [4 x <16 x i8>] [[TMP2]], <16 x i8> [[DOTUNPACK_UNPACK12]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_POLY8X16X4_T]] poison, [4 x <16 x i8>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY8X16X4_T]] [[TMP3]]
+//
poly8x16x4_t test_vld4q_p8(poly8_t const *a) {
return vld4q_p8(a);
}
-// CHECK-LABEL: @test_vld4q_p16(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x4_t, align 16
-// CHECK: [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4.v8i16.p0(ptr %a)
-// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.poly16x8x4_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.poly16x8x4_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.poly16x8x4_t @test_vld4q_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY16X8X4_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY16X8X4_T]], align 16
+// CHECK-NEXT: [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4.v8i16.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD4_ELT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], 0
+// CHECK-NEXT: store <8 x i16> [[VLD4_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_ELT2:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], 1
+// CHECK-NEXT: store <8 x i16> [[VLD4_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD4_ELT4:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], 2
+// CHECK-NEXT: store <8 x i16> [[VLD4_ELT4]], ptr [[__RET_REPACK3]], align 16
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 48
+// CHECK-NEXT: [[VLD4_ELT6:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], 3
+// CHECK-NEXT: store <8 x i16> [[VLD4_ELT6]], ptr [[__RET_REPACK5]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(64) [[__RET]], i64 64, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <8 x i16>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [4 x <8 x i16>] poison, <8 x i16> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <8 x i16>, ptr [[DOTUNPACK_ELT7]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [4 x <8 x i16>] [[TMP0]], <8 x i16> [[DOTUNPACK_UNPACK8]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <8 x i16>, ptr [[DOTUNPACK_ELT9]], align 16
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [4 x <8 x i16>] [[TMP1]], <8 x i16> [[DOTUNPACK_UNPACK10]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 48
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <8 x i16>, ptr [[DOTUNPACK_ELT11]], align 16
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [4 x <8 x i16>] [[TMP2]], <8 x i16> [[DOTUNPACK_UNPACK12]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_POLY16X8X4_T]] poison, [4 x <8 x i16>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY16X8X4_T]] [[TMP3]]
+//
poly16x8x4_t test_vld4q_p16(poly16_t const *a) {
return vld4q_p16(a);
}
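
Note that the poly8/poly16 checks use the same <16 x i8>/<8 x i16> vector types as their integer counterparts: polynomial types share the integer lowering, differing only in the front-end struct names. A small sketch of that equivalence (helper name hypothetical):

    #include <arm_neon.h>

    // Reinterpreting poly bits as unsigned is free: no instruction is
    // generated, only the static type changes.
    uint8x16_t poly_bits(poly8x16_t p) {
      return vreinterpretq_u8_p8(p);
    }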
-// CHECK-LABEL: @test_vld4_u8(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x4_t, align 8
-// CHECK: [[VLD4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4.v8i8.p0(ptr %a)
-// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP5:%.*]] = load %struct.uint8x8x4_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.uint8x8x4_t [[TMP5]]
+// CHECK-LABEL: define dso_local %struct.uint8x8x4_t @test_vld4_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT8X8X4_T:%.*]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT8X8X4_T]], align 8
+// CHECK-NEXT: [[VLD4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4.v8i8.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD4_ELT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], 0
+// CHECK-NEXT: store <8 x i8> [[VLD4_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD4_ELT2:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], 1
+// CHECK-NEXT: store <8 x i8> [[VLD4_ELT2]], ptr [[__RET_REPACK1]], align 8
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_ELT4:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], 2
+// CHECK-NEXT: store <8 x i8> [[VLD4_ELT4]], ptr [[__RET_REPACK3]], align 8
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 24
+// CHECK-NEXT: [[VLD4_ELT6:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], 3
+// CHECK-NEXT: store <8 x i8> [[VLD4_ELT6]], ptr [[__RET_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <8 x i8>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [4 x <8 x i8>] poison, <8 x i8> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <8 x i8>, ptr [[DOTUNPACK_ELT7]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [4 x <8 x i8>] [[TMP0]], <8 x i8> [[DOTUNPACK_UNPACK8]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <8 x i8>, ptr [[DOTUNPACK_ELT9]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [4 x <8 x i8>] [[TMP1]], <8 x i8> [[DOTUNPACK_UNPACK10]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 24
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <8 x i8>, ptr [[DOTUNPACK_ELT11]], align 8
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [4 x <8 x i8>] [[TMP2]], <8 x i8> [[DOTUNPACK_UNPACK12]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_UINT8X8X4_T]] poison, [4 x <8 x i8>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT8X8X4_T]] [[TMP3]]
+//
uint8x8x4_t test_vld4_u8(uint8_t const *a) {
return vld4_u8(a);
}
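
From test_vld4_u8 onward the tests cover the 64-bit (d-register) variants, and the checks shrink accordingly: <8 x i8> fields, 8-byte alignment, repack offsets 8/16/24, and a 32-byte memcpy. A classic use of this form is splitting packed pixel data into planes; a sketch assuming eight packed RGBA pixels (layout and helper name are illustrative):

    #include <arm_neon.h>
    #include <stdint.h>

    // De-interleave 8 RGBA pixels (32 bytes) into one vector per channel.
    uint8x8x4_t split_rgba(const uint8_t *rgba) {
      return vld4_u8(rgba);  // val[0]=R, val[1]=G, val[2]=B, val[3]=A
    }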
-// CHECK-LABEL: @test_vld4_u16(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x4_t, align 8
-// CHECK: [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4.v4i16.p0(ptr %a)
-// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.uint16x4x4_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.uint16x4x4_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.uint16x4x4_t @test_vld4_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT16X4X4_T:%.*]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT16X4X4_T]], align 8
+// CHECK-NEXT: [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4.v4i16.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD4_ELT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], 0
+// CHECK-NEXT: store <4 x i16> [[VLD4_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD4_ELT2:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], 1
+// CHECK-NEXT: store <4 x i16> [[VLD4_ELT2]], ptr [[__RET_REPACK1]], align 8
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_ELT4:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], 2
+// CHECK-NEXT: store <4 x i16> [[VLD4_ELT4]], ptr [[__RET_REPACK3]], align 8
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 24
+// CHECK-NEXT: [[VLD4_ELT6:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], 3
+// CHECK-NEXT: store <4 x i16> [[VLD4_ELT6]], ptr [[__RET_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <4 x i16>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [4 x <4 x i16>] poison, <4 x i16> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <4 x i16>, ptr [[DOTUNPACK_ELT7]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [4 x <4 x i16>] [[TMP0]], <4 x i16> [[DOTUNPACK_UNPACK8]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <4 x i16>, ptr [[DOTUNPACK_ELT9]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [4 x <4 x i16>] [[TMP1]], <4 x i16> [[DOTUNPACK_UNPACK10]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 24
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <4 x i16>, ptr [[DOTUNPACK_ELT11]], align 8
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [4 x <4 x i16>] [[TMP2]], <4 x i16> [[DOTUNPACK_UNPACK12]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_UINT16X4X4_T]] poison, [4 x <4 x i16>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT16X4X4_T]] [[TMP3]]
+//
uint16x4x4_t test_vld4_u16(uint16_t const *a) {
return vld4_u16(a);
}
-// CHECK-LABEL: @test_vld4_u32(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x4_t, align 8
-// CHECK: [[VLD4:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4.v2i32.p0(ptr %a)
-// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.uint32x2x4_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.uint32x2x4_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.uint32x2x4_t @test_vld4_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT32X2X4_T:%.*]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT32X2X4_T]], align 8
+// CHECK-NEXT: [[VLD4:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4.v2i32.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD4_ELT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4]], 0
+// CHECK-NEXT: store <2 x i32> [[VLD4_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD4_ELT2:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4]], 1
+// CHECK-NEXT: store <2 x i32> [[VLD4_ELT2]], ptr [[__RET_REPACK1]], align 8
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_ELT4:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4]], 2
+// CHECK-NEXT: store <2 x i32> [[VLD4_ELT4]], ptr [[__RET_REPACK3]], align 8
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 24
+// CHECK-NEXT: [[VLD4_ELT6:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4]], 3
+// CHECK-NEXT: store <2 x i32> [[VLD4_ELT6]], ptr [[__RET_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x i32>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [4 x <2 x i32>] poison, <2 x i32> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <2 x i32>, ptr [[DOTUNPACK_ELT7]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [4 x <2 x i32>] [[TMP0]], <2 x i32> [[DOTUNPACK_UNPACK8]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <2 x i32>, ptr [[DOTUNPACK_ELT9]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [4 x <2 x i32>] [[TMP1]], <2 x i32> [[DOTUNPACK_UNPACK10]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 24
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <2 x i32>, ptr [[DOTUNPACK_ELT11]], align 8
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [4 x <2 x i32>] [[TMP2]], <2 x i32> [[DOTUNPACK_UNPACK12]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_UINT32X2X4_T]] poison, [4 x <2 x i32>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT32X2X4_T]] [[TMP3]]
+//
uint32x2x4_t test_vld4_u32(uint32_t const *a) {
return vld4_u32(a);
}
-// CHECK-LABEL: @test_vld4_u64(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x1x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x4_t, align 8
-// CHECK: [[VLD4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4.v1i64.p0(ptr %a)
-// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.uint64x1x4_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.uint64x1x4_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.uint64x1x4_t @test_vld4_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT64X1X4_T:%.*]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT64X1X4_T]], align 8
+// CHECK-NEXT: [[VLD4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4.v1i64.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD4_ELT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], 0
+// CHECK-NEXT: store <1 x i64> [[VLD4_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD4_ELT2:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], 1
+// CHECK-NEXT: store <1 x i64> [[VLD4_ELT2]], ptr [[__RET_REPACK1]], align 8
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_ELT4:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], 2
+// CHECK-NEXT: store <1 x i64> [[VLD4_ELT4]], ptr [[__RET_REPACK3]], align 8
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 24
+// CHECK-NEXT: [[VLD4_ELT6:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], 3
+// CHECK-NEXT: store <1 x i64> [[VLD4_ELT6]], ptr [[__RET_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <1 x i64>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [4 x <1 x i64>] poison, <1 x i64> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT7]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [4 x <1 x i64>] [[TMP0]], <1 x i64> [[DOTUNPACK_UNPACK8]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT9]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [4 x <1 x i64>] [[TMP1]], <1 x i64> [[DOTUNPACK_UNPACK10]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 24
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT11]], align 8
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [4 x <1 x i64>] [[TMP2]], <1 x i64> [[DOTUNPACK_UNPACK12]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_UINT64X1X4_T]] poison, [4 x <1 x i64>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT64X1X4_T]] [[TMP3]]
+//
uint64x1x4_t test_vld4_u64(uint64_t const *a) {
return vld4_u64(a);
}
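
With 64-bit lanes in d-registers, each result is a single-element <1 x i64> vector, so vld4_u64 amounts to four consecutive scalar loads split across the four values, which is exactly what the <1 x i64> checks reflect. A sketch, assuming the pointer addresses at least four elements (helper name hypothetical):

    #include <arm_neon.h>
    #include <stdint.h>

    uint64_t third_value(const uint64_t *p) {
      uint64x1x4_t v = vld4_u64(p);
      return vget_lane_u64(v.val[2], 0);  // p[2]
    }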
-// CHECK-LABEL: @test_vld4_s8(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x4_t, align 8
-// CHECK: [[VLD4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4.v8i8.p0(ptr %a)
-// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP5:%.*]] = load %struct.int8x8x4_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.int8x8x4_t [[TMP5]]
+// CHECK-LABEL: define dso_local %struct.int8x8x4_t @test_vld4_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT8X8X4_T:%.*]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT8X8X4_T]], align 8
+// CHECK-NEXT: [[VLD4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4.v8i8.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD4_ELT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], 0
+// CHECK-NEXT: store <8 x i8> [[VLD4_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD4_ELT2:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], 1
+// CHECK-NEXT: store <8 x i8> [[VLD4_ELT2]], ptr [[__RET_REPACK1]], align 8
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_ELT4:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], 2
+// CHECK-NEXT: store <8 x i8> [[VLD4_ELT4]], ptr [[__RET_REPACK3]], align 8
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 24
+// CHECK-NEXT: [[VLD4_ELT6:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], 3
+// CHECK-NEXT: store <8 x i8> [[VLD4_ELT6]], ptr [[__RET_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <8 x i8>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [4 x <8 x i8>] poison, <8 x i8> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <8 x i8>, ptr [[DOTUNPACK_ELT7]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [4 x <8 x i8>] [[TMP0]], <8 x i8> [[DOTUNPACK_UNPACK8]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <8 x i8>, ptr [[DOTUNPACK_ELT9]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [4 x <8 x i8>] [[TMP1]], <8 x i8> [[DOTUNPACK_UNPACK10]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 24
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <8 x i8>, ptr [[DOTUNPACK_ELT11]], align 8
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [4 x <8 x i8>] [[TMP2]], <8 x i8> [[DOTUNPACK_UNPACK12]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_INT8X8X4_T]] poison, [4 x <8 x i8>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_INT8X8X4_T]] [[TMP3]]
+//
int8x8x4_t test_vld4_s8(int8_t const *a) {
return vld4_s8(a);
}
-// CHECK-LABEL: @test_vld4_s16(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x4_t, align 8
-// CHECK: [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4.v4i16.p0(ptr %a)
-// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.int16x4x4_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.int16x4x4_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.int16x4x4_t @test_vld4_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT16X4X4_T:%.*]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT16X4X4_T]], align 8
+// CHECK-NEXT: [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4.v4i16.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD4_ELT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], 0
+// CHECK-NEXT: store <4 x i16> [[VLD4_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD4_ELT2:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], 1
+// CHECK-NEXT: store <4 x i16> [[VLD4_ELT2]], ptr [[__RET_REPACK1]], align 8
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_ELT4:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], 2
+// CHECK-NEXT: store <4 x i16> [[VLD4_ELT4]], ptr [[__RET_REPACK3]], align 8
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 24
+// CHECK-NEXT: [[VLD4_ELT6:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], 3
+// CHECK-NEXT: store <4 x i16> [[VLD4_ELT6]], ptr [[__RET_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <4 x i16>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [4 x <4 x i16>] poison, <4 x i16> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <4 x i16>, ptr [[DOTUNPACK_ELT7]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [4 x <4 x i16>] [[TMP0]], <4 x i16> [[DOTUNPACK_UNPACK8]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <4 x i16>, ptr [[DOTUNPACK_ELT9]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [4 x <4 x i16>] [[TMP1]], <4 x i16> [[DOTUNPACK_UNPACK10]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 24
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <4 x i16>, ptr [[DOTUNPACK_ELT11]], align 8
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [4 x <4 x i16>] [[TMP2]], <4 x i16> [[DOTUNPACK_UNPACK12]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_INT16X4X4_T]] poison, [4 x <4 x i16>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_INT16X4X4_T]] [[TMP3]]
+//
int16x4x4_t test_vld4_s16(int16_t const *a) {
return vld4_s16(a);
}
-// CHECK-LABEL: @test_vld4_s32(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x4_t, align 8
-// CHECK: [[VLD4:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4.v2i32.p0(ptr %a)
-// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.int32x2x4_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.int32x2x4_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.int32x2x4_t @test_vld4_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT32X2X4_T:%.*]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT32X2X4_T]], align 8
+// CHECK-NEXT: [[VLD4:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4.v2i32.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD4_ELT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4]], 0
+// CHECK-NEXT: store <2 x i32> [[VLD4_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD4_ELT2:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4]], 1
+// CHECK-NEXT: store <2 x i32> [[VLD4_ELT2]], ptr [[__RET_REPACK1]], align 8
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_ELT4:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4]], 2
+// CHECK-NEXT: store <2 x i32> [[VLD4_ELT4]], ptr [[__RET_REPACK3]], align 8
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 24
+// CHECK-NEXT: [[VLD4_ELT6:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4]], 3
+// CHECK-NEXT: store <2 x i32> [[VLD4_ELT6]], ptr [[__RET_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x i32>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [4 x <2 x i32>] poison, <2 x i32> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <2 x i32>, ptr [[DOTUNPACK_ELT7]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [4 x <2 x i32>] [[TMP0]], <2 x i32> [[DOTUNPACK_UNPACK8]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <2 x i32>, ptr [[DOTUNPACK_ELT9]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [4 x <2 x i32>] [[TMP1]], <2 x i32> [[DOTUNPACK_UNPACK10]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 24
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <2 x i32>, ptr [[DOTUNPACK_ELT11]], align 8
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [4 x <2 x i32>] [[TMP2]], <2 x i32> [[DOTUNPACK_UNPACK12]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_INT32X2X4_T]] poison, [4 x <2 x i32>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_INT32X2X4_T]] [[TMP3]]
+//
int32x2x4_t test_vld4_s32(int32_t const *a) {
return vld4_s32(a);
}
-// CHECK-LABEL: @test_vld4_s64(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x1x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x4_t, align 8
-// CHECK: [[VLD4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4.v1i64.p0(ptr %a)
-// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.int64x1x4_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.int64x1x4_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.int64x1x4_t @test_vld4_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT64X1X4_T:%.*]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT64X1X4_T]], align 8
+// CHECK-NEXT: [[VLD4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4.v1i64.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD4_ELT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], 0
+// CHECK-NEXT: store <1 x i64> [[VLD4_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD4_ELT2:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], 1
+// CHECK-NEXT: store <1 x i64> [[VLD4_ELT2]], ptr [[__RET_REPACK1]], align 8
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_ELT4:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], 2
+// CHECK-NEXT: store <1 x i64> [[VLD4_ELT4]], ptr [[__RET_REPACK3]], align 8
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 24
+// CHECK-NEXT: [[VLD4_ELT6:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], 3
+// CHECK-NEXT: store <1 x i64> [[VLD4_ELT6]], ptr [[__RET_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <1 x i64>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [4 x <1 x i64>] poison, <1 x i64> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT7]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [4 x <1 x i64>] [[TMP0]], <1 x i64> [[DOTUNPACK_UNPACK8]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT9]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [4 x <1 x i64>] [[TMP1]], <1 x i64> [[DOTUNPACK_UNPACK10]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 24
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT11]], align 8
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [4 x <1 x i64>] [[TMP2]], <1 x i64> [[DOTUNPACK_UNPACK12]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_INT64X1X4_T]] poison, [4 x <1 x i64>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_INT64X1X4_T]] [[TMP3]]
+//
int64x1x4_t test_vld4_s64(int64_t const *a) {
return vld4_s64(a);
}
-// CHECK-LABEL: @test_vld4_f16(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x4x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x4_t, align 8
-// CHECK: [[VLD4:%.*]] = call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4.v4f16.p0(ptr %a)
-// CHECK: store { <4 x half>, <4 x half>, <4 x half>, <4 x half> } [[VLD4]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.float16x4x4_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.float16x4x4_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.float16x4x4_t @test_vld4_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT16X4X4_T:%.*]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT16X4X4_T]], align 8
+// CHECK-NEXT: [[VLD4:%.*]] = call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4.v4f16.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD4_ELT:%.*]] = extractvalue { <4 x half>, <4 x half>, <4 x half>, <4 x half> } [[VLD4]], 0
+// CHECK-NEXT: store <4 x half> [[VLD4_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD4_ELT2:%.*]] = extractvalue { <4 x half>, <4 x half>, <4 x half>, <4 x half> } [[VLD4]], 1
+// CHECK-NEXT: store <4 x half> [[VLD4_ELT2]], ptr [[__RET_REPACK1]], align 8
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_ELT4:%.*]] = extractvalue { <4 x half>, <4 x half>, <4 x half>, <4 x half> } [[VLD4]], 2
+// CHECK-NEXT: store <4 x half> [[VLD4_ELT4]], ptr [[__RET_REPACK3]], align 8
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 24
+// CHECK-NEXT: [[VLD4_ELT6:%.*]] = extractvalue { <4 x half>, <4 x half>, <4 x half>, <4 x half> } [[VLD4]], 3
+// CHECK-NEXT: store <4 x half> [[VLD4_ELT6]], ptr [[__RET_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <4 x half>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [4 x <4 x half>] poison, <4 x half> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <4 x half>, ptr [[DOTUNPACK_ELT7]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [4 x <4 x half>] [[TMP0]], <4 x half> [[DOTUNPACK_UNPACK8]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <4 x half>, ptr [[DOTUNPACK_ELT9]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [4 x <4 x half>] [[TMP1]], <4 x half> [[DOTUNPACK_UNPACK10]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 24
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <4 x half>, ptr [[DOTUNPACK_ELT11]], align 8
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [4 x <4 x half>] [[TMP2]], <4 x half> [[DOTUNPACK_UNPACK12]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_FLOAT16X4X4_T]] poison, [4 x <4 x half>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_FLOAT16X4X4_T]] [[TMP3]]
+//
float16x4x4_t test_vld4_f16(float16_t const *a) {
return vld4_f16(a);
}
-// CHECK-LABEL: @test_vld4_f32(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x4_t, align 8
-// CHECK: [[VLD4:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld4.v2f32.p0(ptr %a)
-// CHECK: store { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.float32x2x4_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.float32x2x4_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.float32x2x4_t @test_vld4_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT32X2X4_T:%.*]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT32X2X4_T]], align 8
+// CHECK-NEXT: [[VLD4:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld4.v2f32.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD4_ELT:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4]], 0
+// CHECK-NEXT: store <2 x float> [[VLD4_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD4_ELT2:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4]], 1
+// CHECK-NEXT: store <2 x float> [[VLD4_ELT2]], ptr [[__RET_REPACK1]], align 8
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_ELT4:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4]], 2
+// CHECK-NEXT: store <2 x float> [[VLD4_ELT4]], ptr [[__RET_REPACK3]], align 8
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 24
+// CHECK-NEXT: [[VLD4_ELT6:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4]], 3
+// CHECK-NEXT: store <2 x float> [[VLD4_ELT6]], ptr [[__RET_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x float>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [4 x <2 x float>] poison, <2 x float> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <2 x float>, ptr [[DOTUNPACK_ELT7]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [4 x <2 x float>] [[TMP0]], <2 x float> [[DOTUNPACK_UNPACK8]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <2 x float>, ptr [[DOTUNPACK_ELT9]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [4 x <2 x float>] [[TMP1]], <2 x float> [[DOTUNPACK_UNPACK10]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 24
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <2 x float>, ptr [[DOTUNPACK_ELT11]], align 8
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [4 x <2 x float>] [[TMP2]], <2 x float> [[DOTUNPACK_UNPACK12]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_FLOAT32X2X4_T]] poison, [4 x <2 x float>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_FLOAT32X2X4_T]] [[TMP3]]
+//
float32x2x4_t test_vld4_f32(float32_t const *a) {
return vld4_f32(a);
}
-// CHECK-LABEL: @test_vld4_f64(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x1x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.float64x1x4_t, align 8
-// CHECK: [[VLD4:%.*]] = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld4.v1f64.p0(ptr %a)
-// CHECK: store { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD4]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.float64x1x4_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.float64x1x4_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.float64x1x4_t @test_vld4_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT64X1X4_T:%.*]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT64X1X4_T]], align 8
+// CHECK-NEXT: [[VLD4:%.*]] = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld4.v1f64.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD4_ELT:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD4]], 0
+// CHECK-NEXT: store <1 x double> [[VLD4_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD4_ELT2:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD4]], 1
+// CHECK-NEXT: store <1 x double> [[VLD4_ELT2]], ptr [[__RET_REPACK1]], align 8
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_ELT4:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD4]], 2
+// CHECK-NEXT: store <1 x double> [[VLD4_ELT4]], ptr [[__RET_REPACK3]], align 8
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 24
+// CHECK-NEXT: [[VLD4_ELT6:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD4]], 3
+// CHECK-NEXT: store <1 x double> [[VLD4_ELT6]], ptr [[__RET_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <1 x double>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [4 x <1 x double>] poison, <1 x double> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <1 x double>, ptr [[DOTUNPACK_ELT7]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [4 x <1 x double>] [[TMP0]], <1 x double> [[DOTUNPACK_UNPACK8]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <1 x double>, ptr [[DOTUNPACK_ELT9]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [4 x <1 x double>] [[TMP1]], <1 x double> [[DOTUNPACK_UNPACK10]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 24
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <1 x double>, ptr [[DOTUNPACK_ELT11]], align 8
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [4 x <1 x double>] [[TMP2]], <1 x double> [[DOTUNPACK_UNPACK12]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_FLOAT64X1X4_T]] poison, [4 x <1 x double>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_FLOAT64X1X4_T]] [[TMP3]]
+//
float64x1x4_t test_vld4_f64(float64_t const *a) {
return vld4_f64(a);
}
-// CHECK-LABEL: @test_vld4_p8(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x4_t, align 8
-// CHECK: [[VLD4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4.v8i8.p0(ptr %a)
-// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP5:%.*]] = load %struct.poly8x8x4_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.poly8x8x4_t [[TMP5]]
+// CHECK-LABEL: define dso_local %struct.poly8x8x4_t @test_vld4_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY8X8X4_T:%.*]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY8X8X4_T]], align 8
+// CHECK-NEXT: [[VLD4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4.v8i8.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD4_ELT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], 0
+// CHECK-NEXT: store <8 x i8> [[VLD4_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD4_ELT2:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], 1
+// CHECK-NEXT: store <8 x i8> [[VLD4_ELT2]], ptr [[__RET_REPACK1]], align 8
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_ELT4:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], 2
+// CHECK-NEXT: store <8 x i8> [[VLD4_ELT4]], ptr [[__RET_REPACK3]], align 8
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 24
+// CHECK-NEXT: [[VLD4_ELT6:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], 3
+// CHECK-NEXT: store <8 x i8> [[VLD4_ELT6]], ptr [[__RET_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <8 x i8>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [4 x <8 x i8>] poison, <8 x i8> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <8 x i8>, ptr [[DOTUNPACK_ELT7]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [4 x <8 x i8>] [[TMP0]], <8 x i8> [[DOTUNPACK_UNPACK8]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <8 x i8>, ptr [[DOTUNPACK_ELT9]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [4 x <8 x i8>] [[TMP1]], <8 x i8> [[DOTUNPACK_UNPACK10]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 24
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <8 x i8>, ptr [[DOTUNPACK_ELT11]], align 8
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [4 x <8 x i8>] [[TMP2]], <8 x i8> [[DOTUNPACK_UNPACK12]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_POLY8X8X4_T]] poison, [4 x <8 x i8>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY8X8X4_T]] [[TMP3]]
+//
poly8x8x4_t test_vld4_p8(poly8_t const *a) {
return vld4_p8(a);
}
-// CHECK-LABEL: @test_vld4_p16(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x4_t, align 8
-// CHECK: [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4.v4i16.p0(ptr %a)
-// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.poly16x4x4_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.poly16x4x4_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.poly16x4x4_t @test_vld4_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY16X4X4_T:%.*]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY16X4X4_T]], align 8
+// CHECK-NEXT: [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4.v4i16.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD4_ELT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], 0
+// CHECK-NEXT: store <4 x i16> [[VLD4_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD4_ELT2:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], 1
+// CHECK-NEXT: store <4 x i16> [[VLD4_ELT2]], ptr [[__RET_REPACK1]], align 8
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_ELT4:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], 2
+// CHECK-NEXT: store <4 x i16> [[VLD4_ELT4]], ptr [[__RET_REPACK3]], align 8
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 24
+// CHECK-NEXT: [[VLD4_ELT6:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], 3
+// CHECK-NEXT: store <4 x i16> [[VLD4_ELT6]], ptr [[__RET_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <4 x i16>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [4 x <4 x i16>] poison, <4 x i16> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <4 x i16>, ptr [[DOTUNPACK_ELT7]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [4 x <4 x i16>] [[TMP0]], <4 x i16> [[DOTUNPACK_UNPACK8]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <4 x i16>, ptr [[DOTUNPACK_ELT9]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [4 x <4 x i16>] [[TMP1]], <4 x i16> [[DOTUNPACK_UNPACK10]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 24
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <4 x i16>, ptr [[DOTUNPACK_ELT11]], align 8
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [4 x <4 x i16>] [[TMP2]], <4 x i16> [[DOTUNPACK_UNPACK12]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_POLY16X4X4_T]] poison, [4 x <4 x i16>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY16X4X4_T]] [[TMP3]]
+//
poly16x4x4_t test_vld4_p16(poly16_t const *a) {
return vld4_p16(a);
}
-// CHECK-LABEL: @test_vst1q_u8(
-// CHECK: store <16 x i8> %b, ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1q_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: store <16 x i8> [[B]], ptr [[A]], align 1
+// CHECK-NEXT: ret void
+//
void test_vst1q_u8(uint8_t *a, uint8x16_t b) {
vst1q_u8(a, b);
}
-// CHECK-LABEL: @test_vst1q_u16(
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: store <8 x i16> [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1q_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: store <8 x i16> [[B]], ptr [[A]], align 2
+// CHECK-NEXT: ret void
+//
void test_vst1q_u16(uint16_t *a, uint16x8_t b) {
vst1q_u16(a, b);
}
-// CHECK-LABEL: @test_vst1q_u32(
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: store <4 x i32> [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1q_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: store <4 x i32> [[B]], ptr [[A]], align 4
+// CHECK-NEXT: ret void
+//
void test_vst1q_u32(uint32_t *a, uint32x4_t b) {
vst1q_u32(a, b);
}
-// CHECK-LABEL: @test_vst1q_u64(
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: store <2 x i64> [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1q_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: store <2 x i64> [[B]], ptr [[A]], align 8
+// CHECK-NEXT: ret void
+//
void test_vst1q_u64(uint64_t *a, uint64x2_t b) {
vst1q_u64(a, b);
}
-// CHECK-LABEL: @test_vst1q_s8(
-// CHECK: store <16 x i8> %b, ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1q_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: store <16 x i8> [[B]], ptr [[A]], align 1
+// CHECK-NEXT: ret void
+//
void test_vst1q_s8(int8_t *a, int8x16_t b) {
vst1q_s8(a, b);
}
-// CHECK-LABEL: @test_vst1q_s16(
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: store <8 x i16> [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1q_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: store <8 x i16> [[B]], ptr [[A]], align 2
+// CHECK-NEXT: ret void
+//
void test_vst1q_s16(int16_t *a, int16x8_t b) {
vst1q_s16(a, b);
}
-// CHECK-LABEL: @test_vst1q_s32(
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: store <4 x i32> [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1q_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: store <4 x i32> [[B]], ptr [[A]], align 4
+// CHECK-NEXT: ret void
+//
void test_vst1q_s32(int32_t *a, int32x4_t b) {
vst1q_s32(a, b);
}
-// CHECK-LABEL: @test_vst1q_s64(
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: store <2 x i64> [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1q_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: store <2 x i64> [[B]], ptr [[A]], align 8
+// CHECK-NEXT: ret void
+//
void test_vst1q_s64(int64_t *a, int64x2_t b) {
vst1q_s64(a, b);
}
-// CHECK-LABEL: @test_vst1q_f16(
-// CHECK: [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
-// CHECK: store <8 x half> [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1q_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: store <8 x half> [[B]], ptr [[A]], align 2
+// CHECK-NEXT: ret void
+//
void test_vst1q_f16(float16_t *a, float16x8_t b) {
vst1q_f16(a, b);
}
-// CHECK-LABEL: @test_vst1q_f32(
-// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
-// CHECK: store <4 x float> [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1q_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: store <4 x float> [[B]], ptr [[A]], align 4
+// CHECK-NEXT: ret void
+//
void test_vst1q_f32(float32_t *a, float32x4_t b) {
vst1q_f32(a, b);
}
-// CHECK-LABEL: @test_vst1q_f64(
-// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
-// CHECK: store <2 x double> [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1q_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: store <2 x double> [[B]], ptr [[A]], align 8
+// CHECK-NEXT: ret void
+//
void test_vst1q_f64(float64_t *a, float64x2_t b) {
vst1q_f64(a, b);
}
-// CHECK-LABEL: @test_vst1q_p8(
-// CHECK: store <16 x i8> %b, ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1q_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: store <16 x i8> [[B]], ptr [[A]], align 1
+// CHECK-NEXT: ret void
+//
void test_vst1q_p8(poly8_t *a, poly8x16_t b) {
vst1q_p8(a, b);
}
-// CHECK-LABEL: @test_vst1q_p16(
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: store <8 x i16> [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1q_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: store <8 x i16> [[B]], ptr [[A]], align 2
+// CHECK-NEXT: ret void
+//
void test_vst1q_p16(poly16_t *a, poly16x8_t b) {
vst1q_p16(a, b);
}
-// CHECK-LABEL: @test_vst1_u8(
-// CHECK: store <8 x i8> %b, ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: store <8 x i8> [[B]], ptr [[A]], align 1
+// CHECK-NEXT: ret void
+//
void test_vst1_u8(uint8_t *a, uint8x8_t b) {
vst1_u8(a, b);
}
-// CHECK-LABEL: @test_vst1_u16(
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: store <4 x i16> [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: store <4 x i16> [[B]], ptr [[A]], align 2
+// CHECK-NEXT: ret void
+//
void test_vst1_u16(uint16_t *a, uint16x4_t b) {
vst1_u16(a, b);
}
-// CHECK-LABEL: @test_vst1_u32(
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: store <2 x i32> [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: store <2 x i32> [[B]], ptr [[A]], align 4
+// CHECK-NEXT: ret void
+//
void test_vst1_u32(uint32_t *a, uint32x2_t b) {
vst1_u32(a, b);
}
-// CHECK-LABEL: @test_vst1_u64(
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: store <1 x i64> [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: store <1 x i64> [[B]], ptr [[A]], align 8
+// CHECK-NEXT: ret void
+//
void test_vst1_u64(uint64_t *a, uint64x1_t b) {
vst1_u64(a, b);
}
-// CHECK-LABEL: @test_vst1_s8(
-// CHECK: store <8 x i8> %b, ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: store <8 x i8> [[B]], ptr [[A]], align 1
+// CHECK-NEXT: ret void
+//
void test_vst1_s8(int8_t *a, int8x8_t b) {
vst1_s8(a, b);
}
-// CHECK-LABEL: @test_vst1_s16(
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: store <4 x i16> [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: store <4 x i16> [[B]], ptr [[A]], align 2
+// CHECK-NEXT: ret void
+//
void test_vst1_s16(int16_t *a, int16x4_t b) {
vst1_s16(a, b);
}
-// CHECK-LABEL: @test_vst1_s32(
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: store <2 x i32> [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: store <2 x i32> [[B]], ptr [[A]], align 4
+// CHECK-NEXT: ret void
+//
void test_vst1_s32(int32_t *a, int32x2_t b) {
vst1_s32(a, b);
}
-// CHECK-LABEL: @test_vst1_s64(
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: store <1 x i64> [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: store <1 x i64> [[B]], ptr [[A]], align 8
+// CHECK-NEXT: ret void
+//
void test_vst1_s64(int64_t *a, int64x1_t b) {
vst1_s64(a, b);
}
-// CHECK-LABEL: @test_vst1_f16(
-// CHECK: [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
-// CHECK: store <4 x half> [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: store <4 x half> [[B]], ptr [[A]], align 2
+// CHECK-NEXT: ret void
+//
void test_vst1_f16(float16_t *a, float16x4_t b) {
vst1_f16(a, b);
}
-// CHECK-LABEL: @test_vst1_f32(
-// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
-// CHECK: store <2 x float> [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: store <2 x float> [[B]], ptr [[A]], align 4
+// CHECK-NEXT: ret void
+//
void test_vst1_f32(float32_t *a, float32x2_t b) {
vst1_f32(a, b);
}
-// CHECK-LABEL: @test_vst1_f64(
-// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
-// CHECK: store <1 x double> [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: store <1 x double> [[B]], ptr [[A]], align 8
+// CHECK-NEXT: ret void
+//
void test_vst1_f64(float64_t *a, float64x1_t b) {
vst1_f64(a, b);
}
-// CHECK-LABEL: @test_vst1_p8(
-// CHECK: store <8 x i8> %b, ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: store <8 x i8> [[B]], ptr [[A]], align 1
+// CHECK-NEXT: ret void
+//
void test_vst1_p8(poly8_t *a, poly8x8_t b) {
vst1_p8(a, b);
}
-// CHECK-LABEL: @test_vst1_p16(
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: store <4 x i16> [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: store <4 x i16> [[B]], ptr [[A]], align 2
+// CHECK-NEXT: ret void
+//
void test_vst1_p16(poly16_t *a, poly16x4_t b) {
vst1_p16(a, b);
}
-// CHECK-LABEL: @test_vst2q_u8(
-// CHECK: [[B:%.*]] = alloca %struct.uint8x16x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x16x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x16x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x16x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK: call void @llvm.aarch64.neon.st2.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2q_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT8X16X2_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT8X16X2_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <16 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <16 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[__S1]], ptr noundef nonnull align 16 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2.v16i8.p0(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst2q_u8(uint8_t *a, uint8x16x2_t b) {
vst2q_u8(a, b);
}
-// CHECK-LABEL: @test_vst2q_u16(
-// CHECK: [[B:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK: call void @llvm.aarch64.neon.st2.v8i16.p0(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2q_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT16X8X2_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT16X8X2_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <8 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <8 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[__S1]], ptr noundef nonnull align 16 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2.v8i16.p0(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst2q_u16(uint16_t *a, uint16x8x2_t b) {
vst2q_u16(a, b);
}
-// CHECK-LABEL: @test_vst2q_u32(
-// CHECK: [[B:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x4x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
-// CHECK: call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2q_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <4 x i32>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT32X4X2_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT32X4X2_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <4 x i32>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <4 x i32>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[__S1]], ptr noundef nonnull align 16 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst2q_u32(uint32_t *a, uint32x4x2_t b) {
vst2q_u32(a, b);
}
-// CHECK-LABEL: @test_vst2q_u64(
-// CHECK: [[B:%.*]] = alloca %struct.uint64x2x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x2x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
-// CHECK: call void @llvm.aarch64.neon.st2.v2i64.p0(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2q_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT64X2X2_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT64X2X2_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <2 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <2 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[__S1]], ptr noundef nonnull align 16 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2.v2i64.p0(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst2q_u64(uint64_t *a, uint64x2x2_t b) {
vst2q_u64(a, b);
}
-// CHECK-LABEL: @test_vst2q_s8(
-// CHECK: [[B:%.*]] = alloca %struct.int8x16x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x16x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x16x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x16x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK: call void @llvm.aarch64.neon.st2.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2q_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT8X16X2_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT8X16X2_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <16 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <16 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[__S1]], ptr noundef nonnull align 16 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2.v16i8.p0(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst2q_s8(int8_t *a, int8x16x2_t b) {
vst2q_s8(a, b);
}
-// CHECK-LABEL: @test_vst2q_s16(
-// CHECK: [[B:%.*]] = alloca %struct.int16x8x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK: call void @llvm.aarch64.neon.st2.v8i16.p0(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2q_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT16X8X2_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT16X8X2_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <8 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <8 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[__S1]], ptr noundef nonnull align 16 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2.v8i16.p0(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst2q_s16(int16_t *a, int16x8x2_t b) {
vst2q_s16(a, b);
}
-// CHECK-LABEL: @test_vst2q_s32(
-// CHECK: [[B:%.*]] = alloca %struct.int32x4x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x4x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
-// CHECK: call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2q_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <4 x i32>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT32X4X2_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT32X4X2_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <4 x i32>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <4 x i32>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[__S1]], ptr noundef nonnull align 16 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst2q_s32(int32_t *a, int32x4x2_t b) {
vst2q_s32(a, b);
}
-// CHECK-LABEL: @test_vst2q_s64(
-// CHECK: [[B:%.*]] = alloca %struct.int64x2x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int64x2x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x2x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
-// CHECK: call void @llvm.aarch64.neon.st2.v2i64.p0(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2q_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT64X2X2_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT64X2X2_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <2 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <2 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[__S1]], ptr noundef nonnull align 16 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2.v2i64.p0(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst2q_s64(int64_t *a, int64x2x2_t b) {
vst2q_s64(a, b);
}
-// CHECK-LABEL: @test_vst2q_f16(
-// CHECK: [[B:%.*]] = alloca %struct.float16x8x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <8 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x half>
-// CHECK: call void @llvm.aarch64.neon.st2.v8f16.p0(<8 x half> [[TMP7]], <8 x half> [[TMP8]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2q_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <8 x half>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT16X8X2_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X8X2_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <8 x half>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x half> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <8 x half>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x half> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[__S1]], ptr noundef nonnull align 16 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x half>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2.v8f16.p0(<8 x half> [[TMP0]], <8 x half> [[TMP1]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst2q_f16(float16_t *a, float16x8x2_t b) {
vst2q_f16(a, b);
}
-// CHECK-LABEL: @test_vst2q_f32(
-// CHECK: [[B:%.*]] = alloca %struct.float32x4x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x4x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <4 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
-// CHECK: call void @llvm.aarch64.neon.st2.v4f32.p0(<4 x float> [[TMP7]], <4 x float> [[TMP8]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2q_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <4 x float>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT32X4X2_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X4X2_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <4 x float>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x float> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <4 x float>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x float> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[__S1]], ptr noundef nonnull align 16 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2.v4f32.p0(<4 x float> [[TMP0]], <4 x float> [[TMP1]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst2q_f32(float32_t *a, float32x4x2_t b) {
vst2q_f32(a, b);
}
-// CHECK-LABEL: @test_vst2q_f64(
-// CHECK: [[B:%.*]] = alloca %struct.float64x2x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float64x2x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x2x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <2 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x double>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x double>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x double>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <2 x double> [[TMP5]] to <16 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double>
-// CHECK: call void @llvm.aarch64.neon.st2.v2f64.p0(<2 x double> [[TMP7]], <2 x double> [[TMP8]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2q_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <2 x double>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT64X2X2_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT64X2X2_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <2 x double>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x double> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <2 x double>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x double> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[__S1]], ptr noundef nonnull align 16 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2.v2f64.p0(<2 x double> [[TMP0]], <2 x double> [[TMP1]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst2q_f64(float64_t *a, float64x2x2_t b) {
vst2q_f64(a, b);
}
-// CHECK-LABEL: @test_vst2q_p8(
-// CHECK: [[B:%.*]] = alloca %struct.poly8x16x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x16x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x16x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x16x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK: call void @llvm.aarch64.neon.st2.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2q_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY8X16X2_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY8X16X2_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <16 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <16 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[__S1]], ptr noundef nonnull align 16 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2.v16i8.p0(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst2q_p8(poly8_t *a, poly8x16x2_t b) {
vst2q_p8(a, b);
}
-// CHECK-LABEL: @test_vst2q_p16(
-// CHECK: [[B:%.*]] = alloca %struct.poly16x8x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK: call void @llvm.aarch64.neon.st2.v8i16.p0(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2q_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY16X8X2_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY16X8X2_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <8 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <8 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[__S1]], ptr noundef nonnull align 16 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2.v8i16.p0(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst2q_p16(poly16_t *a, poly16x8x2_t b) {
vst2q_p16(a, b);
}
-// CHECK-LABEL: @test_vst2_u8(
-// CHECK: [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: call void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT8X8X2_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT8X8X2_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <8 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <8 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[__S1]], ptr noundef nonnull align 8 dereferenceable(16) [[B]], i64 16, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst2_u8(uint8_t *a, uint8x8x2_t b) {
vst2_u8(a, b);
}
-// CHECK-LABEL: @test_vst2_u16(
-// CHECK: [[B:%.*]] = alloca %struct.uint16x4x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x4x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
-// CHECK: call void @llvm.aarch64.neon.st2.v4i16.p0(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT16X4X2_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT16X4X2_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <4 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <4 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[__S1]], ptr noundef nonnull align 8 dereferenceable(16) [[B]], i64 16, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2.v4i16.p0(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst2_u16(uint16_t *a, uint16x4x2_t b) {
vst2_u16(a, b);
}
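
A note for reviewers skimming these regenerated hunks: the `--version 5` output no longer spells out the struct-typed GEP/bitcast chains of the hand-maintained checks. It instead matches what the pipeline now emits: the `[2 x <N x iM>]` coerced argument is repacked with extractvalue/store pairs at byte offsets 0 and 8, memcpy'd into `__S1`, and re-loaded as two vectors feeding `@llvm.aarch64.neon.st2.*`. The semantics of the intrinsic are unchanged: vst2 stores its two vectors interleaved. A minimal usage sketch, not part of the patch (the helper name and data layout are illustrative):

  #include <arm_neon.h>

  /* Interleave two 4-lane vectors into dst:
     dst = { lo[0], hi[0], lo[1], hi[1], ... }  (8 elements total). */
  void interleave2_u16(uint16_t *dst,
                       const uint16_t lo[4], const uint16_t hi[4]) {
    uint16x4x2_t pair;
    pair.val[0] = vld1_u16(lo);  /* becomes the even output lanes */
    pair.val[1] = vld1_u16(hi);  /* becomes the odd output lanes  */
    vst2_u16(dst, pair);         /* typically a single ST2 on AArch64 */
  }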
-// CHECK-LABEL: @test_vst2_u32(
-// CHECK: [[B:%.*]] = alloca %struct.uint32x2x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x2x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
-// CHECK: call void @llvm.aarch64.neon.st2.v2i32.p0(<2 x i32> [[TMP7]], <2 x i32> [[TMP8]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <2 x i32>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT32X2X2_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT32X2X2_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <2 x i32>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <2 x i32>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[__S1]], ptr noundef nonnull align 8 dereferenceable(16) [[B]], i64 16, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2.v2i32.p0(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst2_u32(uint32_t *a, uint32x2x2_t b) {
vst2_u32(a, b);
}
-// CHECK-LABEL: @test_vst2_u64(
-// CHECK: [[B:%.*]] = alloca %struct.uint64x1x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x1x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x1x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x1x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
-// CHECK: call void @llvm.aarch64.neon.st2.v1i64.p0(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT64X1X2_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT64X1X2_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <1 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <1 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[__S1]], ptr noundef nonnull align 8 dereferenceable(16) [[B]], i64 16, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2.v1i64.p0(<1 x i64> [[TMP0]], <1 x i64> [[TMP1]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst2_u64(uint64_t *a, uint64x1x2_t b) {
vst2_u64(a, b);
}
-// CHECK-LABEL: @test_vst2_s8(
-// CHECK: [[B:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: call void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT8X8X2_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT8X8X2_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <8 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <8 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[__S1]], ptr noundef nonnull align 8 dereferenceable(16) [[B]], i64 16, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst2_s8(int8_t *a, int8x8x2_t b) {
vst2_s8(a, b);
}
-// CHECK-LABEL: @test_vst2_s16(
-// CHECK: [[B:%.*]] = alloca %struct.int16x4x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x4x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
-// CHECK: call void @llvm.aarch64.neon.st2.v4i16.p0(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT16X4X2_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT16X4X2_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <4 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <4 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[__S1]], ptr noundef nonnull align 8 dereferenceable(16) [[B]], i64 16, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2.v4i16.p0(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst2_s16(int16_t *a, int16x4x2_t b) {
vst2_s16(a, b);
}
-// CHECK-LABEL: @test_vst2_s32(
-// CHECK: [[B:%.*]] = alloca %struct.int32x2x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x2x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
-// CHECK: call void @llvm.aarch64.neon.st2.v2i32.p0(<2 x i32> [[TMP7]], <2 x i32> [[TMP8]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <2 x i32>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT32X2X2_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT32X2X2_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <2 x i32>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <2 x i32>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[__S1]], ptr noundef nonnull align 8 dereferenceable(16) [[B]], i64 16, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2.v2i32.p0(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst2_s32(int32_t *a, int32x2x2_t b) {
vst2_s32(a, b);
}
-// CHECK-LABEL: @test_vst2_s64(
-// CHECK: [[B:%.*]] = alloca %struct.int64x1x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x1x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x1x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x1x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
-// CHECK: call void @llvm.aarch64.neon.st2.v1i64.p0(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT64X1X2_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT64X1X2_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <1 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <1 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[__S1]], ptr noundef nonnull align 8 dereferenceable(16) [[B]], i64 16, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2.v1i64.p0(<1 x i64> [[TMP0]], <1 x i64> [[TMP1]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst2_s64(int64_t *a, int64x1x2_t b) {
vst2_s64(a, b);
}
-// CHECK-LABEL: @test_vst2_f16(
-// CHECK: [[B:%.*]] = alloca %struct.float16x4x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x4x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <4 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x half>
-// CHECK: call void @llvm.aarch64.neon.st2.v4f16.p0(<4 x half> [[TMP7]], <4 x half> [[TMP8]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <4 x half>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT16X4X2_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X4X2_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <4 x half>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x half> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <4 x half>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x half> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[__S1]], ptr noundef nonnull align 8 dereferenceable(16) [[B]], i64 16, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x half>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2.v4f16.p0(<4 x half> [[TMP0]], <4 x half> [[TMP1]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst2_f16(float16_t *a, float16x4x2_t b) {
vst2_f16(a, b);
}
-// CHECK-LABEL: @test_vst2_f32(
-// CHECK: [[B:%.*]] = alloca %struct.float32x2x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x2x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <2 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x float>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
-// CHECK: call void @llvm.aarch64.neon.st2.v2f32.p0(<2 x float> [[TMP7]], <2 x float> [[TMP8]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <2 x float>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT32X2X2_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X2X2_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <2 x float>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x float> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <2 x float>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x float> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[__S1]], ptr noundef nonnull align 8 dereferenceable(16) [[B]], i64 16, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2.v2f32.p0(<2 x float> [[TMP0]], <2 x float> [[TMP1]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst2_f32(float32_t *a, float32x2x2_t b) {
vst2_f32(a, b);
}
-// CHECK-LABEL: @test_vst2_f64(
-// CHECK: [[B:%.*]] = alloca %struct.float64x1x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float64x1x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x1x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <1 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x1x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x double>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <1 x double>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <1 x double> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x1x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x double>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <1 x double>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <1 x double> [[TMP5]] to <8 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double>
-// CHECK: call void @llvm.aarch64.neon.st2.v1f64.p0(<1 x double> [[TMP7]], <1 x double> [[TMP8]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <1 x double>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT64X1X2_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT64X1X2_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <1 x double>] [[B_COERCE]], 0
+// CHECK-NEXT: store <1 x double> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <1 x double>] [[B_COERCE]], 1
+// CHECK-NEXT: store <1 x double> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[__S1]], ptr noundef nonnull align 8 dereferenceable(16) [[B]], i64 16, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x double>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <1 x double>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2.v1f64.p0(<1 x double> [[TMP0]], <1 x double> [[TMP1]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst2_f64(float64_t *a, float64x1x2_t b) {
vst2_f64(a, b);
}
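
The float64x1x2_t variant is the degenerate case: each of the two vectors has a single lane, so the "interleaved" store simply writes the two doubles back to back. A sketch (the helper name is hypothetical):

  /* One-lane vst2 degenerates to consecutive stores:
     dst[0] = x, dst[1] = y. */
  void store_pair_f64(float64_t *dst, float64_t x, float64_t y) {
    float64x1x2_t pair;
    pair.val[0] = vdup_n_f64(x);
    pair.val[1] = vdup_n_f64(y);
    vst2_f64(dst, pair);
  }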
-// CHECK-LABEL: @test_vst2_p8(
-// CHECK: [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: call void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY8X8X2_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY8X8X2_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <8 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <8 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[__S1]], ptr noundef nonnull align 8 dereferenceable(16) [[B]], i64 16, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst2_p8(poly8_t *a, poly8x8x2_t b) {
vst2_p8(a, b);
}
-// CHECK-LABEL: @test_vst2_p16(
-// CHECK: [[B:%.*]] = alloca %struct.poly16x4x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x4x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
-// CHECK: call void @llvm.aarch64.neon.st2.v4i16.p0(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY16X4X2_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY16X4X2_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <4 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <4 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[__S1]], ptr noundef nonnull align 8 dereferenceable(16) [[B]], i64 16, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2.v4i16.p0(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst2_p16(poly16_t *a, poly16x4x2_t b) {
vst2_p16(a, b);
}
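
As the p8/p16 hunks above show, the polynomial variants lower to the same IR intrinsics as their integer counterparts (`@llvm.aarch64.neon.st2.v8i8.p0` and `.v4i16.p0`); only the C-level struct type differs, which is why these hunks match the u8/u16 ones apart from the alloca type name. For example (wrappers are illustrative):

  /* Both wrappers produce the same IR body, modulo the struct type. */
  void store2_p8(poly8_t *dst, poly8x8x2_t v) { vst2_p8(dst, v); }
  void store2_u8(uint8_t *dst, uint8x8x2_t v) { vst2_u8(dst, v); }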
-// CHECK-LABEL: @test_vst3q_u8(
-// CHECK: [[B:%.*]] = alloca %struct.uint8x16x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x16x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
-// CHECK: call void @llvm.aarch64.neon.st3.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3q_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT8X16X3_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT8X16X3_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <16 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <16 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <16 x i8>] [[B_COERCE]], 2
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[__S1]], ptr noundef nonnull align 16 dereferenceable(48) [[B]], i64 48, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3.v16i8.p0(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst3q_u8(uint8_t *a, uint8x16x3_t b) {
vst3q_u8(a, b);
}
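
The vst3q hunks follow the same repacking pattern extended to three elements: extractvalue/store at offsets 0, 16 and 32 (`B_REPACK1`/`B_REPACK3`), a 48-byte memcpy, then three vector loads into `@llvm.aarch64.neon.st3.*`. The classic use for vst3q is packing planar channel data; a sketch under that assumption (names are illustrative, not from the patch):

  /* Re-interleave planar R/G/B into packed RGB;
     dst must hold 16 pixels * 3 channels = 48 bytes. */
  void pack_rgb16(uint8_t *dst, const uint8_t r[16],
                  const uint8_t g[16], const uint8_t b[16]) {
    uint8x16x3_t rgb;
    rgb.val[0] = vld1q_u8(r);
    rgb.val[1] = vld1q_u8(g);
    rgb.val[2] = vld1q_u8(b);
    vst3q_u8(dst, rgb);  /* dst = r0,g0,b0, r1,g1,b1, ... */
  }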
-// CHECK-LABEL: @test_vst3q_u16(
-// CHECK: [[B:%.*]] = alloca %struct.uint16x8x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
-// CHECK: call void @llvm.aarch64.neon.st3.v8i16.p0(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3q_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT16X8X3_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT16X8X3_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <8 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <8 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <8 x i16>] [[B_COERCE]], 2
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[__S1]], ptr noundef nonnull align 16 dereferenceable(48) [[B]], i64 48, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3.v8i16.p0(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst3q_u16(uint16_t *a, uint16x8x3_t b) {
vst3q_u16(a, b);
}
-// CHECK-LABEL: @test_vst3q_u32(
-// CHECK: [[B:%.*]] = alloca %struct.uint32x4x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
-// CHECK: call void @llvm.aarch64.neon.st3.v4i32.p0(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3q_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <4 x i32>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT32X4X3_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT32X4X3_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <4 x i32>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <4 x i32>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <4 x i32>] [[B_COERCE]], 2
+// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[__S1]], ptr noundef nonnull align 16 dereferenceable(48) [[B]], i64 48, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3.v4i32.p0(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst3q_u32(uint32_t *a, uint32x4x3_t b) {
vst3q_u32(a, b);
}
-// CHECK-LABEL: @test_vst3q_u64(
-// CHECK: [[B:%.*]] = alloca %struct.uint64x2x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x2x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
-// CHECK: call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3q_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT64X2X3_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT64X2X3_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <2 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <2 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <2 x i64>] [[B_COERCE]], 2
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[__S1]], ptr noundef nonnull align 16 dereferenceable(48) [[B]], i64 48, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst3q_u64(uint64_t *a, uint64x2x3_t b) {
vst3q_u64(a, b);
}
-// CHECK-LABEL: @test_vst3q_s8(
-// CHECK: [[B:%.*]] = alloca %struct.int8x16x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x16x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
-// CHECK: call void @llvm.aarch64.neon.st3.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3q_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT8X16X3_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT8X16X3_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <16 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <16 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <16 x i8>] [[B_COERCE]], 2
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[__S1]], ptr noundef nonnull align 16 dereferenceable(48) [[B]], i64 48, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3.v16i8.p0(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst3q_s8(int8_t *a, int8x16x3_t b) {
vst3q_s8(a, b);
}
-// CHECK-LABEL: @test_vst3q_s16(
-// CHECK: [[B:%.*]] = alloca %struct.int16x8x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
-// CHECK: call void @llvm.aarch64.neon.st3.v8i16.p0(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3q_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT16X8X3_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT16X8X3_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <8 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <8 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <8 x i16>] [[B_COERCE]], 2
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[__S1]], ptr noundef nonnull align 16 dereferenceable(48) [[B]], i64 48, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3.v8i16.p0(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst3q_s16(int16_t *a, int16x8x3_t b) {
vst3q_s16(a, b);
}
-// CHECK-LABEL: @test_vst3q_s32(
-// CHECK: [[B:%.*]] = alloca %struct.int32x4x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
-// CHECK: call void @llvm.aarch64.neon.st3.v4i32.p0(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3q_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <4 x i32>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT32X4X3_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT32X4X3_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <4 x i32>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <4 x i32>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <4 x i32>] [[B_COERCE]], 2
+// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[__S1]], ptr noundef nonnull align 16 dereferenceable(48) [[B]], i64 48, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3.v4i32.p0(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst3q_s32(int32_t *a, int32x4x3_t b) {
vst3q_s32(a, b);
}
-// CHECK-LABEL: @test_vst3q_s64(
-// CHECK: [[B:%.*]] = alloca %struct.int64x2x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int64x2x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x2x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
-// CHECK: call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3q_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT64X2X3_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT64X2X3_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <2 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <2 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <2 x i64>] [[B_COERCE]], 2
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[__S1]], ptr noundef nonnull align 16 dereferenceable(48) [[B]], i64 48, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst3q_s64(int64_t *a, int64x2x3_t b) {
vst3q_s64(a, b);
}
-// CHECK-LABEL: @test_vst3q_f16(
-// CHECK: [[B:%.*]] = alloca %struct.float16x8x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <8 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <8 x half>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x half>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x half>
-// CHECK: call void @llvm.aarch64.neon.st3.v8f16.p0(<8 x half> [[TMP9]], <8 x half> [[TMP10]], <8 x half> [[TMP11]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3q_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <8 x half>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT16X8X3_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X8X3_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <8 x half>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x half> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <8 x half>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x half> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <8 x half>] [[B_COERCE]], 2
+// CHECK-NEXT: store <8 x half> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[__S1]], ptr noundef nonnull align 16 dereferenceable(48) [[B]], i64 48, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x half>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x half>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3.v8f16.p0(<8 x half> [[TMP0]], <8 x half> [[TMP1]], <8 x half> [[TMP2]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst3q_f16(float16_t *a, float16x8x3_t b) {
vst3q_f16(a, b);
}
-// CHECK-LABEL: @test_vst3q_f32(
-// CHECK: [[B:%.*]] = alloca %struct.float32x4x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <4 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <4 x float>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
-// CHECK: call void @llvm.aarch64.neon.st3.v4f32.p0(<4 x float> [[TMP9]], <4 x float> [[TMP10]], <4 x float> [[TMP11]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3q_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <4 x float>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT32X4X3_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X4X3_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <4 x float>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x float> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <4 x float>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x float> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <4 x float>] [[B_COERCE]], 2
+// CHECK-NEXT: store <4 x float> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[__S1]], ptr noundef nonnull align 16 dereferenceable(48) [[B]], i64 48, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3.v4f32.p0(<4 x float> [[TMP0]], <4 x float> [[TMP1]], <4 x float> [[TMP2]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst3q_f32(float32_t *a, float32x4x3_t b) {
vst3q_f32(a, b);
}
-// CHECK-LABEL: @test_vst3q_f64(
-// CHECK: [[B:%.*]] = alloca %struct.float64x2x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float64x2x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x2x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <2 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x double>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x double>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x double>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <2 x double> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x double>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <2 x double>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <2 x double> [[TMP7]] to <16 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x double>
-// CHECK: call void @llvm.aarch64.neon.st3.v2f64.p0(<2 x double> [[TMP9]], <2 x double> [[TMP10]], <2 x double> [[TMP11]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3q_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <2 x double>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT64X2X3_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT64X2X3_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <2 x double>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x double> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <2 x double>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x double> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <2 x double>] [[B_COERCE]], 2
+// CHECK-NEXT: store <2 x double> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[__S1]], ptr noundef nonnull align 16 dereferenceable(48) [[B]], i64 48, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3.v2f64.p0(<2 x double> [[TMP0]], <2 x double> [[TMP1]], <2 x double> [[TMP2]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst3q_f64(float64_t *a, float64x2x3_t b) {
vst3q_f64(a, b);
}
-// CHECK-LABEL: @test_vst3q_p8(
-// CHECK: [[B:%.*]] = alloca %struct.poly8x16x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x16x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
-// CHECK: call void @llvm.aarch64.neon.st3.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3q_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY8X16X3_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY8X16X3_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <16 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <16 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <16 x i8>] [[B_COERCE]], 2
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[__S1]], ptr noundef nonnull align 16 dereferenceable(48) [[B]], i64 48, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3.v16i8.p0(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst3q_p8(poly8_t *a, poly8x16x3_t b) {
vst3q_p8(a, b);
}
-// CHECK-LABEL: @test_vst3q_p16(
-// CHECK: [[B:%.*]] = alloca %struct.poly16x8x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
-// CHECK: call void @llvm.aarch64.neon.st3.v8i16.p0(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3q_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY16X8X3_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY16X8X3_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <8 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <8 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <8 x i16>] [[B_COERCE]], 2
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[__S1]], ptr noundef nonnull align 16 dereferenceable(48) [[B]], i64 48, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3.v8i16.p0(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst3q_p16(poly16_t *a, poly16x8x3_t b) {
vst3q_p16(a, b);
}
-// CHECK-LABEL: @test_vst3_u8(
-// CHECK: [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK: call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT8X8X3_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT8X8X3_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 2
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[__S1]], ptr noundef nonnull align 8 dereferenceable(24) [[B]], i64 24, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst3_u8(uint8_t *a, uint8x8x3_t b) {
vst3_u8(a, b);
}
-// CHECK-LABEL: @test_vst3_u16(
-// CHECK: [[B:%.*]] = alloca %struct.uint16x4x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
-// CHECK: call void @llvm.aarch64.neon.st3.v4i16.p0(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT16X4X3_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT16X4X3_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <4 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <4 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <4 x i16>] [[B_COERCE]], 2
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[__S1]], ptr noundef nonnull align 8 dereferenceable(24) [[B]], i64 24, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3.v4i16.p0(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst3_u16(uint16_t *a, uint16x4x3_t b) {
vst3_u16(a, b);
}
-// CHECK-LABEL: @test_vst3_u32(
-// CHECK: [[B:%.*]] = alloca %struct.uint32x2x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
-// CHECK: call void @llvm.aarch64.neon.st3.v2i32.p0(<2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <2 x i32>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT32X2X3_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT32X2X3_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <2 x i32>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <2 x i32>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <2 x i32>] [[B_COERCE]], 2
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[__S1]], ptr noundef nonnull align 8 dereferenceable(24) [[B]], i64 24, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3.v2i32.p0(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst3_u32(uint32_t *a, uint32x2x3_t b) {
vst3_u32(a, b);
}
-// CHECK-LABEL: @test_vst3_u64(
-// CHECK: [[B:%.*]] = alloca %struct.uint64x1x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x1x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x1x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x1x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint64x1x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
-// CHECK: call void @llvm.aarch64.neon.st3.v1i64.p0(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT64X1X3_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT64X1X3_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <1 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <1 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <1 x i64>] [[B_COERCE]], 2
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[__S1]], ptr noundef nonnull align 8 dereferenceable(24) [[B]], i64 24, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3.v1i64.p0(<1 x i64> [[TMP0]], <1 x i64> [[TMP1]], <1 x i64> [[TMP2]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst3_u64(uint64_t *a, uint64x1x3_t b) {
vst3_u64(a, b);
}
-// CHECK-LABEL: @test_vst3_s8(
-// CHECK: [[B:%.*]] = alloca %struct.int8x8x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK: call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT8X8X3_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT8X8X3_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 2
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[__S1]], ptr noundef nonnull align 8 dereferenceable(24) [[B]], i64 24, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst3_s8(int8_t *a, int8x8x3_t b) {
vst3_s8(a, b);
}
-// CHECK-LABEL: @test_vst3_s16(
-// CHECK: [[B:%.*]] = alloca %struct.int16x4x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
-// CHECK: call void @llvm.aarch64.neon.st3.v4i16.p0(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT16X4X3_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT16X4X3_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <4 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <4 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <4 x i16>] [[B_COERCE]], 2
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[__S1]], ptr noundef nonnull align 8 dereferenceable(24) [[B]], i64 24, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3.v4i16.p0(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst3_s16(int16_t *a, int16x4x3_t b) {
vst3_s16(a, b);
}
-// CHECK-LABEL: @test_vst3_s32(
-// CHECK: [[B:%.*]] = alloca %struct.int32x2x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
-// CHECK: call void @llvm.aarch64.neon.st3.v2i32.p0(<2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <2 x i32>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT32X2X3_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT32X2X3_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <2 x i32>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <2 x i32>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <2 x i32>] [[B_COERCE]], 2
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[__S1]], ptr noundef nonnull align 8 dereferenceable(24) [[B]], i64 24, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3.v2i32.p0(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst3_s32(int32_t *a, int32x2x3_t b) {
vst3_s32(a, b);
}
-// CHECK-LABEL: @test_vst3_s64(
-// CHECK: [[B:%.*]] = alloca %struct.int64x1x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x1x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x1x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x1x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int64x1x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
-// CHECK: call void @llvm.aarch64.neon.st3.v1i64.p0(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT64X1X3_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT64X1X3_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <1 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <1 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <1 x i64>] [[B_COERCE]], 2
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[__S1]], ptr noundef nonnull align 8 dereferenceable(24) [[B]], i64 24, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3.v1i64.p0(<1 x i64> [[TMP0]], <1 x i64> [[TMP1]], <1 x i64> [[TMP2]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst3_s64(int64_t *a, int64x1x3_t b) {
vst3_s64(a, b);
}
-// CHECK-LABEL: @test_vst3_f16(
-// CHECK: [[B:%.*]] = alloca %struct.float16x4x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <4 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <4 x half>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x half>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x half>
-// CHECK: call void @llvm.aarch64.neon.st3.v4f16.p0(<4 x half> [[TMP9]], <4 x half> [[TMP10]], <4 x half> [[TMP11]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <4 x half>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT16X4X3_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X4X3_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <4 x half>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x half> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <4 x half>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x half> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <4 x half>] [[B_COERCE]], 2
+// CHECK-NEXT: store <4 x half> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[__S1]], ptr noundef nonnull align 8 dereferenceable(24) [[B]], i64 24, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x half>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x half>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3.v4f16.p0(<4 x half> [[TMP0]], <4 x half> [[TMP1]], <4 x half> [[TMP2]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst3_f16(float16_t *a, float16x4x3_t b) {
vst3_f16(a, b);
}
-// CHECK-LABEL: @test_vst3_f32(
-// CHECK: [[B:%.*]] = alloca %struct.float32x2x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <2 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float>
-// CHECK: call void @llvm.aarch64.neon.st3.v2f32.p0(<2 x float> [[TMP9]], <2 x float> [[TMP10]], <2 x float> [[TMP11]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <2 x float>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT32X2X3_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X2X3_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <2 x float>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x float> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <2 x float>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x float> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <2 x float>] [[B_COERCE]], 2
+// CHECK-NEXT: store <2 x float> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[__S1]], ptr noundef nonnull align 8 dereferenceable(24) [[B]], i64 24, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3.v2f32.p0(<2 x float> [[TMP0]], <2 x float> [[TMP1]], <2 x float> [[TMP2]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst3_f32(float32_t *a, float32x2x3_t b) {
vst3_f32(a, b);
}
-// CHECK-LABEL: @test_vst3_f64(
-// CHECK: [[B:%.*]] = alloca %struct.float64x1x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float64x1x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x1x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <1 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x1x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x double>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <1 x double>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <1 x double> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x1x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x double>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <1 x double>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <1 x double> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float64x1x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x double>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <1 x double>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <1 x double> [[TMP7]] to <8 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x double>
-// CHECK: call void @llvm.aarch64.neon.st3.v1f64.p0(<1 x double> [[TMP9]], <1 x double> [[TMP10]], <1 x double> [[TMP11]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <1 x double>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT64X1X3_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT64X1X3_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <1 x double>] [[B_COERCE]], 0
+// CHECK-NEXT: store <1 x double> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <1 x double>] [[B_COERCE]], 1
+// CHECK-NEXT: store <1 x double> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <1 x double>] [[B_COERCE]], 2
+// CHECK-NEXT: store <1 x double> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[__S1]], ptr noundef nonnull align 8 dereferenceable(24) [[B]], i64 24, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x double>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <1 x double>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <1 x double>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3.v1f64.p0(<1 x double> [[TMP0]], <1 x double> [[TMP1]], <1 x double> [[TMP2]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst3_f64(float64_t *a, float64x1x3_t b) {
vst3_f64(a, b);
}
-// CHECK-LABEL: @test_vst3_p8(
-// CHECK: [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK: call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY8X8X3_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY8X8X3_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 2
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[__S1]], ptr noundef nonnull align 8 dereferenceable(24) [[B]], i64 24, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst3_p8(poly8_t *a, poly8x8x3_t b) {
vst3_p8(a, b);
}
-// CHECK-LABEL: @test_vst3_p16(
-// CHECK: [[B:%.*]] = alloca %struct.poly16x4x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
-// CHECK: call void @llvm.aarch64.neon.st3.v4i16.p0(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY16X4X3_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY16X4X3_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <4 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <4 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <4 x i16>] [[B_COERCE]], 2
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[__S1]], ptr noundef nonnull align 8 dereferenceable(24) [[B]], i64 24, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3.v4i16.p0(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst3_p16(poly16_t *a, poly16x4x3_t b) {
vst3_p16(a, b);
}
-// CHECK-LABEL: @test_vst4q_u8(
-// CHECK: [[B:%.*]] = alloca %struct.uint8x16x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP5:%.*]] = load <16 x i8>, ptr [[ARRAYIDX6]], align 16
-// CHECK: call void @llvm.aarch64.neon.st4.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4q_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT8X16X4_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT8X16X4_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 2
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 48
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 3
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[__S1]], ptr noundef nonnull align 16 dereferenceable(64) [[B]], i64 64, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 48
+// CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4.v16i8.p0(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst4q_u8(uint8_t *a, uint8x16x4_t b) {
vst4q_u8(a, b);
}
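
(Similarly, vst4q_u8 stores four 16-byte vectors lane-interleaved, 64 bytes in
total, so that dst[4*i + j] = b.val[j][i]. A minimal sketch under the same
assumptions as above; demo_vst4q_u8 is an illustrative name, not part of the
patch.)

#include <arm_neon.h>

/* Hypothetical helper, for illustration only. */
static void demo_vst4q_u8(uint8_t *dst /* room for 64 bytes */) {
  uint8x16x4_t v;
  v.val[0] = vdupq_n_u8(0);
  v.val[1] = vdupq_n_u8(1);
  v.val[2] = vdupq_n_u8(2);
  v.val[3] = vdupq_n_u8(3);
  vst4q_u8(dst, v); /* dst = {0,1,2,3} repeated 16 times */
}
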
-// CHECK-LABEL: @test_vst4q_u16(
-// CHECK: [[B:%.*]] = alloca %struct.uint16x8x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
-// CHECK: call void @llvm.aarch64.neon.st4.v8i16.p0(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4q_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT16X8X4_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT16X8X4_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 2
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 48
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 3
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[__S1]], ptr noundef nonnull align 16 dereferenceable(64) [[B]], i64 64, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 48
+// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8i16.p0(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst4q_u16(uint16_t *a, uint16x8x4_t b) {
vst4q_u16(a, b);
}
-// CHECK-LABEL: @test_vst4q_u32(
-// CHECK: [[B:%.*]] = alloca %struct.uint32x4x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <4 x i32>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
-// CHECK: call void @llvm.aarch64.neon.st4.v4i32.p0(<4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4q_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <4 x i32>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT32X4X4_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT32X4X4_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <4 x i32>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <4 x i32>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <4 x i32>] [[B_COERCE]], 2
+// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 48
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <4 x i32>] [[B_COERCE]], 3
+// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[__S1]], ptr noundef nonnull align 16 dereferenceable(64) [[B]], i64 64, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 48
+// CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4.v4i32.p0(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst4q_u32(uint32_t *a, uint32x4x4_t b) {
vst4q_u32(a, b);
}
-// CHECK-LABEL: @test_vst4q_u64(
-// CHECK: [[B:%.*]] = alloca %struct.uint64x2x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x2x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <2 x i64>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64>
-// CHECK: call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4q_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT64X2X4_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT64X2X4_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 2
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 48
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 3
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[__S1]], ptr noundef nonnull align 16 dereferenceable(64) [[B]], i64 64, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 48
+// CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst4q_u64(uint64_t *a, uint64x2x4_t b) {
vst4q_u64(a, b);
}
-// CHECK-LABEL: @test_vst4q_s8(
-// CHECK: [[B:%.*]] = alloca %struct.int8x16x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP5:%.*]] = load <16 x i8>, ptr [[ARRAYIDX6]], align 16
-// CHECK: call void @llvm.aarch64.neon.st4.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4q_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT8X16X4_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT8X16X4_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 2
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 48
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 3
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[__S1]], ptr noundef nonnull align 16 dereferenceable(64) [[B]], i64 64, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 48
+// CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4.v16i8.p0(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst4q_s8(int8_t *a, int8x16x4_t b) {
vst4q_s8(a, b);
}
-// CHECK-LABEL: @test_vst4q_s16(
-// CHECK: [[B:%.*]] = alloca %struct.int16x8x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
-// CHECK: call void @llvm.aarch64.neon.st4.v8i16.p0(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4q_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT16X8X4_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT16X8X4_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 2
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 48
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 3
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[__S1]], ptr noundef nonnull align 16 dereferenceable(64) [[B]], i64 64, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 48
+// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8i16.p0(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst4q_s16(int16_t *a, int16x8x4_t b) {
vst4q_s16(a, b);
}
-// CHECK-LABEL: @test_vst4q_s32(
-// CHECK: [[B:%.*]] = alloca %struct.int32x4x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <4 x i32>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
-// CHECK: call void @llvm.aarch64.neon.st4.v4i32.p0(<4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4q_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <4 x i32>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT32X4X4_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT32X4X4_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <4 x i32>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <4 x i32>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <4 x i32>] [[B_COERCE]], 2
+// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 48
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <4 x i32>] [[B_COERCE]], 3
+// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[__S1]], ptr noundef nonnull align 16 dereferenceable(64) [[B]], i64 64, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 48
+// CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4.v4i32.p0(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst4q_s32(int32_t *a, int32x4x4_t b) {
vst4q_s32(a, b);
}
-// CHECK-LABEL: @test_vst4q_s64(
-// CHECK: [[B:%.*]] = alloca %struct.int64x2x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int64x2x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x2x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <2 x i64>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64>
-// CHECK: call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4q_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT64X2X4_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT64X2X4_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 2
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 48
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 3
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[__S1]], ptr noundef nonnull align 16 dereferenceable(64) [[B]], i64 64, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 48
+// CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst4q_s64(int64_t *a, int64x2x4_t b) {
vst4q_s64(a, b);
}
-// CHECK-LABEL: @test_vst4q_f16(
-// CHECK: [[B:%.*]] = alloca %struct.float16x8x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <8 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <8 x half>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <8 x half>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP10:%.*]] = bitcast <8 x half> [[TMP9]] to <16 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x half>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x half>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x half>
-// CHECK: call void @llvm.aarch64.neon.st4.v8f16.p0(<8 x half> [[TMP11]], <8 x half> [[TMP12]], <8 x half> [[TMP13]], <8 x half> [[TMP14]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4q_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <8 x half>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT16X8X4_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X8X4_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <8 x half>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x half> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <8 x half>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x half> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <8 x half>] [[B_COERCE]], 2
+// CHECK-NEXT: store <8 x half> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 48
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <8 x half>] [[B_COERCE]], 3
+// CHECK-NEXT: store <8 x half> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[__S1]], ptr noundef nonnull align 16 dereferenceable(64) [[B]], i64 64, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x half>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x half>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 48
+// CHECK-NEXT: [[TMP3:%.*]] = load <8 x half>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8f16.p0(<8 x half> [[TMP0]], <8 x half> [[TMP1]], <8 x half> [[TMP2]], <8 x half> [[TMP3]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst4q_f16(float16_t *a, float16x8x4_t b) {
vst4q_f16(a, b);
}
-// CHECK-LABEL: @test_vst4q_f32(
-// CHECK: [[B:%.*]] = alloca %struct.float32x4x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <4 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <4 x float>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <4 x float>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP10:%.*]] = bitcast <4 x float> [[TMP9]] to <16 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x float>
-// CHECK: call void @llvm.aarch64.neon.st4.v4f32.p0(<4 x float> [[TMP11]], <4 x float> [[TMP12]], <4 x float> [[TMP13]], <4 x float> [[TMP14]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4q_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <4 x float>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT32X4X4_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X4X4_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <4 x float>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x float> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <4 x float>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x float> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <4 x float>] [[B_COERCE]], 2
+// CHECK-NEXT: store <4 x float> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 48
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <4 x float>] [[B_COERCE]], 3
+// CHECK-NEXT: store <4 x float> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[__S1]], ptr noundef nonnull align 16 dereferenceable(64) [[B]], i64 64, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 48
+// CHECK-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4.v4f32.p0(<4 x float> [[TMP0]], <4 x float> [[TMP1]], <4 x float> [[TMP2]], <4 x float> [[TMP3]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst4q_f32(float32_t *a, float32x4x4_t b) {
vst4q_f32(a, b);
}
-// CHECK-LABEL: @test_vst4q_f64(
-// CHECK: [[B:%.*]] = alloca %struct.float64x2x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float64x2x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x2x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <2 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x double>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x double>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x double>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <2 x double> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x double>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <2 x double>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <2 x double> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x double>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <2 x double>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP10:%.*]] = bitcast <2 x double> [[TMP9]] to <16 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x double>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x double>
-// CHECK: call void @llvm.aarch64.neon.st4.v2f64.p0(<2 x double> [[TMP11]], <2 x double> [[TMP12]], <2 x double> [[TMP13]], <2 x double> [[TMP14]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4q_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <2 x double>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT64X2X4_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT64X2X4_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <2 x double>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x double> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <2 x double>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x double> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <2 x double>] [[B_COERCE]], 2
+// CHECK-NEXT: store <2 x double> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 48
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <2 x double>] [[B_COERCE]], 3
+// CHECK-NEXT: store <2 x double> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[__S1]], ptr noundef nonnull align 16 dereferenceable(64) [[B]], i64 64, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 48
+// CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2f64.p0(<2 x double> [[TMP0]], <2 x double> [[TMP1]], <2 x double> [[TMP2]], <2 x double> [[TMP3]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst4q_f64(float64_t *a, float64x2x4_t b) {
vst4q_f64(a, b);
}
-// CHECK-LABEL: @test_vst4q_p8(
-// CHECK: [[B:%.*]] = alloca %struct.poly8x16x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP5:%.*]] = load <16 x i8>, ptr [[ARRAYIDX6]], align 16
-// CHECK: call void @llvm.aarch64.neon.st4.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4q_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY8X16X4_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY8X16X4_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 2
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 48
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 3
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[__S1]], ptr noundef nonnull align 16 dereferenceable(64) [[B]], i64 64, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 48
+// CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4.v16i8.p0(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst4q_p8(poly8_t *a, poly8x16x4_t b) {
vst4q_p8(a, b);
}
-// CHECK-LABEL: @test_vst4q_p16(
-// CHECK: [[B:%.*]] = alloca %struct.poly16x8x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
-// CHECK: call void @llvm.aarch64.neon.st4.v8i16.p0(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4q_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY16X8X4_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY16X8X4_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 2
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 48
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 3
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[__S1]], ptr noundef nonnull align 16 dereferenceable(64) [[B]], i64 64, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 48
+// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8i16.p0(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst4q_p16(poly16_t *a, poly16x8x4_t b) {
vst4q_p16(a, b);
}
-// CHECK-LABEL: @test_vst4_u8(
-// CHECK: [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
-// CHECK: call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT8X8X4_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT8X8X4_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 2
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 24
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 3
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[__S1]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 24
+// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst4_u8(uint8_t *a, uint8x8x4_t b) {
vst4_u8(a, b);
}
-// CHECK-LABEL: @test_vst4_u16(
-// CHECK: [[B:%.*]] = alloca %struct.uint16x4x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
-// CHECK: call void @llvm.aarch64.neon.st4.v4i16.p0(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT16X4X4_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT16X4X4_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 2
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 24
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 3
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[__S1]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 24
+// CHECK-NEXT: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4.v4i16.p0(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst4_u16(uint16_t *a, uint16x4x4_t b) {
vst4_u16(a, b);
}
-// CHECK-LABEL: @test_vst4_u32(
-// CHECK: [[B:%.*]] = alloca %struct.uint32x2x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
-// CHECK: call void @llvm.aarch64.neon.st4.v2i32.p0(<2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <2 x i32>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT32X2X4_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT32X2X4_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <2 x i32>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <2 x i32>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <2 x i32>] [[B_COERCE]], 2
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 24
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <2 x i32>] [[B_COERCE]], 3
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[__S1]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 24
+// CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i32.p0(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst4_u32(uint32_t *a, uint32x2x4_t b) {
vst4_u32(a, b);
}
-// CHECK-LABEL: @test_vst4_u64(
-// CHECK: [[B:%.*]] = alloca %struct.uint64x1x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <1 x i64>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x i64>
-// CHECK: call void @llvm.aarch64.neon.st4.v1i64.p0(<1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT64X1X4_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT64X1X4_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 2
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 24
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 3
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[__S1]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 24
+// CHECK-NEXT: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4.v1i64.p0(<1 x i64> [[TMP0]], <1 x i64> [[TMP1]], <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst4_u64(uint64_t *a, uint64x1x4_t b) {
vst4_u64(a, b);
}
-// CHECK-LABEL: @test_vst4_s8(
-// CHECK: [[B:%.*]] = alloca %struct.int8x8x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
-// CHECK: call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT8X8X4_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT8X8X4_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 2
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 24
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 3
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[__S1]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 24
+// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst4_s8(int8_t *a, int8x8x4_t b) {
vst4_s8(a, b);
}
-// CHECK-LABEL: @test_vst4_s16(
-// CHECK: [[B:%.*]] = alloca %struct.int16x4x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
-// CHECK: call void @llvm.aarch64.neon.st4.v4i16.p0(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT16X4X4_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT16X4X4_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 2
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 24
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 3
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[__S1]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 24
+// CHECK-NEXT: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4.v4i16.p0(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst4_s16(int16_t *a, int16x4x4_t b) {
vst4_s16(a, b);
}
-// CHECK-LABEL: @test_vst4_s32(
-// CHECK: [[B:%.*]] = alloca %struct.int32x2x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
-// CHECK: call void @llvm.aarch64.neon.st4.v2i32.p0(<2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <2 x i32>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT32X2X4_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT32X2X4_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <2 x i32>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <2 x i32>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <2 x i32>] [[B_COERCE]], 2
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 24
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <2 x i32>] [[B_COERCE]], 3
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[__S1]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 24
+// CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i32.p0(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst4_s32(int32_t *a, int32x2x4_t b) {
vst4_s32(a, b);
}
-// CHECK-LABEL: @test_vst4_s64(
-// CHECK: [[B:%.*]] = alloca %struct.int64x1x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <1 x i64>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x i64>
-// CHECK: call void @llvm.aarch64.neon.st4.v1i64.p0(<1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT64X1X4_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT64X1X4_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 2
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 24
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 3
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[__S1]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 24
+// CHECK-NEXT: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4.v1i64.p0(<1 x i64> [[TMP0]], <1 x i64> [[TMP1]], <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst4_s64(int64_t *a, int64x1x4_t b) {
vst4_s64(a, b);
}
-// CHECK-LABEL: @test_vst4_f16(
-// CHECK: [[B:%.*]] = alloca %struct.float16x4x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <4 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <4 x half>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <4 x half>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <4 x half> [[TMP9]] to <8 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x half>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x half>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x half>
-// CHECK: call void @llvm.aarch64.neon.st4.v4f16.p0(<4 x half> [[TMP11]], <4 x half> [[TMP12]], <4 x half> [[TMP13]], <4 x half> [[TMP14]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <4 x half>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT16X4X4_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X4X4_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <4 x half>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x half> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <4 x half>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x half> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <4 x half>] [[B_COERCE]], 2
+// CHECK-NEXT: store <4 x half> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 24
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <4 x half>] [[B_COERCE]], 3
+// CHECK-NEXT: store <4 x half> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[__S1]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x half>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x half>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 24
+// CHECK-NEXT: [[TMP3:%.*]] = load <4 x half>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4.v4f16.p0(<4 x half> [[TMP0]], <4 x half> [[TMP1]], <4 x half> [[TMP2]], <4 x half> [[TMP3]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst4_f16(float16_t *a, float16x4x4_t b) {
vst4_f16(a, b);
}
-// CHECK-LABEL: @test_vst4_f32(
-// CHECK: [[B:%.*]] = alloca %struct.float32x2x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <2 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <2 x float>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <2 x float> [[TMP9]] to <8 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x float>
-// CHECK: call void @llvm.aarch64.neon.st4.v2f32.p0(<2 x float> [[TMP11]], <2 x float> [[TMP12]], <2 x float> [[TMP13]], <2 x float> [[TMP14]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <2 x float>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT32X2X4_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X2X4_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <2 x float>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x float> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <2 x float>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x float> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <2 x float>] [[B_COERCE]], 2
+// CHECK-NEXT: store <2 x float> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 24
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <2 x float>] [[B_COERCE]], 3
+// CHECK-NEXT: store <2 x float> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[__S1]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 24
+// CHECK-NEXT: [[TMP3:%.*]] = load <2 x float>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2f32.p0(<2 x float> [[TMP0]], <2 x float> [[TMP1]], <2 x float> [[TMP2]], <2 x float> [[TMP3]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst4_f32(float32_t *a, float32x2x4_t b) {
vst4_f32(a, b);
}
-// CHECK-LABEL: @test_vst4_f64(
-// CHECK: [[B:%.*]] = alloca %struct.float64x1x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float64x1x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x1x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <1 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x double>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <1 x double>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <1 x double> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x double>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <1 x double>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <1 x double> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x double>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <1 x double>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <1 x double> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x double>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <1 x double>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <1 x double> [[TMP9]] to <8 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x double>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x double>
-// CHECK: call void @llvm.aarch64.neon.st4.v1f64.p0(<1 x double> [[TMP11]], <1 x double> [[TMP12]], <1 x double> [[TMP13]], <1 x double> [[TMP14]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <1 x double>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT64X1X4_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT64X1X4_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <1 x double>] [[B_COERCE]], 0
+// CHECK-NEXT: store <1 x double> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <1 x double>] [[B_COERCE]], 1
+// CHECK-NEXT: store <1 x double> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <1 x double>] [[B_COERCE]], 2
+// CHECK-NEXT: store <1 x double> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 24
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <1 x double>] [[B_COERCE]], 3
+// CHECK-NEXT: store <1 x double> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[__S1]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x double>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <1 x double>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <1 x double>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 24
+// CHECK-NEXT: [[TMP3:%.*]] = load <1 x double>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4.v1f64.p0(<1 x double> [[TMP0]], <1 x double> [[TMP1]], <1 x double> [[TMP2]], <1 x double> [[TMP3]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst4_f64(float64_t *a, float64x1x4_t b) {
vst4_f64(a, b);
}
-// CHECK-LABEL: @test_vst4_p8(
-// CHECK: [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
-// CHECK: call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY8X8X4_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY8X8X4_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 2
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 24
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 3
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[__S1]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 24
+// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst4_p8(poly8_t *a, poly8x8x4_t b) {
vst4_p8(a, b);
}
-// CHECK-LABEL: @test_vst4_p16(
-// CHECK: [[B:%.*]] = alloca %struct.poly16x4x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
-// CHECK: call void @llvm.aarch64.neon.st4.v4i16.p0(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY16X4X4_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY16X4X4_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 2
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 24
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 3
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[__S1]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 24
+// CHECK-NEXT: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4.v4i16.p0(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst4_p16(poly16_t *a, poly16x4x4_t b) {
vst4_p16(a, b);
}
-// CHECK-LABEL: @test_vld1q_f64_x2(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x2x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.float64x2x2_t, align 16
-// CHECK: [[VLD1XN:%.*]] = call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld1x2.v2f64.p0(ptr %a)
-// CHECK: store { <2 x double>, <2 x double> } [[VLD1XN]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.float64x2x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.float64x2x2_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.float64x2x2_t @test_vld1q_f64_x2(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT64X2X2_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT64X2X2_T]], align 16
+// CHECK-NEXT: [[VLD1XN:%.*]] = call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld1x2.v2f64.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD1XN_ELT:%.*]] = extractvalue { <2 x double>, <2 x double> } [[VLD1XN]], 0
+// CHECK-NEXT: store <2 x double> [[VLD1XN_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD1XN_ELT2:%.*]] = extractvalue { <2 x double>, <2 x double> } [[VLD1XN]], 1
+// CHECK-NEXT: store <2 x double> [[VLD1XN_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x double>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <2 x double>] poison, <2 x double> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT3:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK4:%.*]] = load <2 x double>, ptr [[DOTUNPACK_ELT3]], align 16
+// CHECK-NEXT: [[DOTUNPACK5:%.*]] = insertvalue [2 x <2 x double>] [[TMP0]], <2 x double> [[DOTUNPACK_UNPACK4]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_FLOAT64X2X2_T]] poison, [2 x <2 x double>] [[DOTUNPACK5]], 0
+// CHECK-NEXT: ret [[STRUCT_FLOAT64X2X2_T]] [[TMP1]]
+//
float64x2x2_t test_vld1q_f64_x2(float64_t const *a) {
return vld1q_f64_x2(a);
}
-// CHECK-LABEL: @test_vld1q_p64_x2(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x2x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.poly64x2x2_t, align 16
-// CHECK: [[VLD1XN:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x2.v2i64.p0(ptr %a)
-// CHECK: store { <2 x i64>, <2 x i64> } [[VLD1XN]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.poly64x2x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.poly64x2x2_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.poly64x2x2_t @test_vld1q_p64_x2(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY64X2X2_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY64X2X2_T]], align 16
+// CHECK-NEXT: [[VLD1XN:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x2.v2i64.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD1XN_ELT:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[VLD1XN]], 0
+// CHECK-NEXT: store <2 x i64> [[VLD1XN_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD1XN_ELT2:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[VLD1XN]], 1
+// CHECK-NEXT: store <2 x i64> [[VLD1XN_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <2 x i64>] poison, <2 x i64> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT3:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK4:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT3]], align 16
+// CHECK-NEXT: [[DOTUNPACK5:%.*]] = insertvalue [2 x <2 x i64>] [[TMP0]], <2 x i64> [[DOTUNPACK_UNPACK4]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_POLY64X2X2_T]] poison, [2 x <2 x i64>] [[DOTUNPACK5]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY64X2X2_T]] [[TMP1]]
+//
poly64x2x2_t test_vld1q_p64_x2(poly64_t const *a) {
return vld1q_p64_x2(a);
}
-// CHECK-LABEL: @test_vld1_f64_x2(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x1x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.float64x1x2_t, align 8
-// CHECK: [[VLD1XN:%.*]] = call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld1x2.v1f64.p0(ptr %a)
-// CHECK: store { <1 x double>, <1 x double> } [[VLD1XN]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.float64x1x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.float64x1x2_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.float64x1x2_t @test_vld1_f64_x2(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT64X1X2_T:%.*]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT64X1X2_T]], align 8
+// CHECK-NEXT: [[VLD1XN:%.*]] = call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld1x2.v1f64.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD1XN_ELT:%.*]] = extractvalue { <1 x double>, <1 x double> } [[VLD1XN]], 0
+// CHECK-NEXT: store <1 x double> [[VLD1XN_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD1XN_ELT2:%.*]] = extractvalue { <1 x double>, <1 x double> } [[VLD1XN]], 1
+// CHECK-NEXT: store <1 x double> [[VLD1XN_ELT2]], ptr [[__RET_REPACK1]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(16) [[__RET]], i64 16, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <1 x double>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <1 x double>] poison, <1 x double> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT3:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK4:%.*]] = load <1 x double>, ptr [[DOTUNPACK_ELT3]], align 8
+// CHECK-NEXT: [[DOTUNPACK5:%.*]] = insertvalue [2 x <1 x double>] [[TMP0]], <1 x double> [[DOTUNPACK_UNPACK4]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_FLOAT64X1X2_T]] poison, [2 x <1 x double>] [[DOTUNPACK5]], 0
+// CHECK-NEXT: ret [[STRUCT_FLOAT64X1X2_T]] [[TMP1]]
+//
float64x1x2_t test_vld1_f64_x2(float64_t const *a) {
return vld1_f64_x2(a);
}
-// CHECK-LABEL: @test_vld1_p64_x2(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x1x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.poly64x1x2_t, align 8
-// CHECK: [[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x2.v1i64.p0(ptr %a)
-// CHECK: store { <1 x i64>, <1 x i64> } [[VLD1XN]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.poly64x1x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.poly64x1x2_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.poly64x1x2_t @test_vld1_p64_x2(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY64X1X2_T:%.*]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY64X1X2_T]], align 8
+// CHECK-NEXT: [[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x2.v1i64.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD1XN_ELT:%.*]] = extractvalue { <1 x i64>, <1 x i64> } [[VLD1XN]], 0
+// CHECK-NEXT: store <1 x i64> [[VLD1XN_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD1XN_ELT2:%.*]] = extractvalue { <1 x i64>, <1 x i64> } [[VLD1XN]], 1
+// CHECK-NEXT: store <1 x i64> [[VLD1XN_ELT2]], ptr [[__RET_REPACK1]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(16) [[__RET]], i64 16, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <1 x i64>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <1 x i64>] poison, <1 x i64> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT3:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK4:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT3]], align 8
+// CHECK-NEXT: [[DOTUNPACK5:%.*]] = insertvalue [2 x <1 x i64>] [[TMP0]], <1 x i64> [[DOTUNPACK_UNPACK4]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_POLY64X1X2_T]] poison, [2 x <1 x i64>] [[DOTUNPACK5]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY64X1X2_T]] [[TMP1]]
+//
poly64x1x2_t test_vld1_p64_x2(poly64_t const *a) {
return vld1_p64_x2(a);
}
-// CHECK-LABEL: @test_vld1q_f64_x3(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x2x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.float64x2x3_t, align 16
-// CHECK: [[VLD1XN:%.*]] = call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld1x3.v2f64.p0(ptr %a)
-// CHECK: store { <2 x double>, <2 x double>, <2 x double> } [[VLD1XN]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.float64x2x3_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.float64x2x3_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.float64x2x3_t @test_vld1q_f64_x3(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT64X2X3_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT64X2X3_T]], align 16
+// CHECK-NEXT: [[VLD1XN:%.*]] = call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld1x3.v2f64.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD1XN_ELT:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double> } [[VLD1XN]], 0
+// CHECK-NEXT: store <2 x double> [[VLD1XN_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD1XN_ELT2:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double> } [[VLD1XN]], 1
+// CHECK-NEXT: store <2 x double> [[VLD1XN_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD1XN_ELT4:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double> } [[VLD1XN]], 2
+// CHECK-NEXT: store <2 x double> [[VLD1XN_ELT4]], ptr [[__RET_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(48) [[__RET]], i64 48, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x double>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [3 x <2 x double>] poison, <2 x double> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <2 x double>, ptr [[DOTUNPACK_ELT5]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [3 x <2 x double>] [[TMP0]], <2 x double> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <2 x double>, ptr [[DOTUNPACK_ELT7]], align 16
+// CHECK-NEXT: [[DOTUNPACK9:%.*]] = insertvalue [3 x <2 x double>] [[TMP1]], <2 x double> [[DOTUNPACK_UNPACK8]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[STRUCT_FLOAT64X2X3_T]] poison, [3 x <2 x double>] [[DOTUNPACK9]], 0
+// CHECK-NEXT: ret [[STRUCT_FLOAT64X2X3_T]] [[TMP2]]
+//
float64x2x3_t test_vld1q_f64_x3(float64_t const *a) {
return vld1q_f64_x3(a);
}
-// CHECK-LABEL: @test_vld1q_p64_x3(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x2x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.poly64x2x3_t, align 16
-// CHECK: [[VLD1XN:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x3.v2i64.p0(ptr %a)
-// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD1XN]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.poly64x2x3_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.poly64x2x3_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.poly64x2x3_t @test_vld1q_p64_x3(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY64X2X3_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY64X2X3_T]], align 16
+// CHECK-NEXT: [[VLD1XN:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x3.v2i64.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD1XN_ELT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD1XN]], 0
+// CHECK-NEXT: store <2 x i64> [[VLD1XN_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD1XN_ELT2:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD1XN]], 1
+// CHECK-NEXT: store <2 x i64> [[VLD1XN_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD1XN_ELT4:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD1XN]], 2
+// CHECK-NEXT: store <2 x i64> [[VLD1XN_ELT4]], ptr [[__RET_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(48) [[__RET]], i64 48, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [3 x <2 x i64>] poison, <2 x i64> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT5]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [3 x <2 x i64>] [[TMP0]], <2 x i64> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT7]], align 16
+// CHECK-NEXT: [[DOTUNPACK9:%.*]] = insertvalue [3 x <2 x i64>] [[TMP1]], <2 x i64> [[DOTUNPACK_UNPACK8]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[STRUCT_POLY64X2X3_T]] poison, [3 x <2 x i64>] [[DOTUNPACK9]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY64X2X3_T]] [[TMP2]]
+//
poly64x2x3_t test_vld1q_p64_x3(poly64_t const *a) {
return vld1q_p64_x3(a);
}
-// CHECK-LABEL: @test_vld1_f64_x3(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x1x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.float64x1x3_t, align 8
-// CHECK: [[VLD1XN:%.*]] = call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld1x3.v1f64.p0(ptr %a)
-// CHECK: store { <1 x double>, <1 x double>, <1 x double> } [[VLD1XN]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.float64x1x3_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.float64x1x3_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.float64x1x3_t @test_vld1_f64_x3(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT64X1X3_T:%.*]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT64X1X3_T]], align 8
+// CHECK-NEXT: [[VLD1XN:%.*]] = call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld1x3.v1f64.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD1XN_ELT:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double> } [[VLD1XN]], 0
+// CHECK-NEXT: store <1 x double> [[VLD1XN_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD1XN_ELT2:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double> } [[VLD1XN]], 1
+// CHECK-NEXT: store <1 x double> [[VLD1XN_ELT2]], ptr [[__RET_REPACK1]], align 8
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD1XN_ELT4:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double> } [[VLD1XN]], 2
+// CHECK-NEXT: store <1 x double> [[VLD1XN_ELT4]], ptr [[__RET_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(24) [[__RET]], i64 24, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <1 x double>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [3 x <1 x double>] poison, <1 x double> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <1 x double>, ptr [[DOTUNPACK_ELT5]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [3 x <1 x double>] [[TMP0]], <1 x double> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <1 x double>, ptr [[DOTUNPACK_ELT7]], align 8
+// CHECK-NEXT: [[DOTUNPACK9:%.*]] = insertvalue [3 x <1 x double>] [[TMP1]], <1 x double> [[DOTUNPACK_UNPACK8]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[STRUCT_FLOAT64X1X3_T]] poison, [3 x <1 x double>] [[DOTUNPACK9]], 0
+// CHECK-NEXT: ret [[STRUCT_FLOAT64X1X3_T]] [[TMP2]]
+//
float64x1x3_t test_vld1_f64_x3(float64_t const *a) {
return vld1_f64_x3(a);
}
-// CHECK-LABEL: @test_vld1_p64_x3(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x1x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.poly64x1x3_t, align 8
-// CHECK: [[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x3.v1i64.p0(ptr %a)
-// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD1XN]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.poly64x1x3_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.poly64x1x3_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.poly64x1x3_t @test_vld1_p64_x3(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY64X1X3_T:%.*]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY64X1X3_T]], align 8
+// CHECK-NEXT: [[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x3.v1i64.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD1XN_ELT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD1XN]], 0
+// CHECK-NEXT: store <1 x i64> [[VLD1XN_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD1XN_ELT2:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD1XN]], 1
+// CHECK-NEXT: store <1 x i64> [[VLD1XN_ELT2]], ptr [[__RET_REPACK1]], align 8
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD1XN_ELT4:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD1XN]], 2
+// CHECK-NEXT: store <1 x i64> [[VLD1XN_ELT4]], ptr [[__RET_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(24) [[__RET]], i64 24, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <1 x i64>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [3 x <1 x i64>] poison, <1 x i64> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT5]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [3 x <1 x i64>] [[TMP0]], <1 x i64> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT7]], align 8
+// CHECK-NEXT: [[DOTUNPACK9:%.*]] = insertvalue [3 x <1 x i64>] [[TMP1]], <1 x i64> [[DOTUNPACK_UNPACK8]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[STRUCT_POLY64X1X3_T]] poison, [3 x <1 x i64>] [[DOTUNPACK9]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY64X1X3_T]] [[TMP2]]
+//
poly64x1x3_t test_vld1_p64_x3(poly64_t const *a) {
return vld1_p64_x3(a);
}
-// CHECK-LABEL: @test_vld1q_f64_x4(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x2x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.float64x2x4_t, align 16
-// CHECK: [[VLD1XN:%.*]] = call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld1x4.v2f64.p0(ptr %a)
-// CHECK: store { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD1XN]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.float64x2x4_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.float64x2x4_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.float64x2x4_t @test_vld1q_f64_x4(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT64X2X4_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT64X2X4_T]], align 16
+// CHECK-NEXT: [[VLD1XN:%.*]] = call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld1x4.v2f64.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD1XN_ELT:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD1XN]], 0
+// CHECK-NEXT: store <2 x double> [[VLD1XN_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD1XN_ELT2:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD1XN]], 1
+// CHECK-NEXT: store <2 x double> [[VLD1XN_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD1XN_ELT4:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD1XN]], 2
+// CHECK-NEXT: store <2 x double> [[VLD1XN_ELT4]], ptr [[__RET_REPACK3]], align 16
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 48
+// CHECK-NEXT: [[VLD1XN_ELT6:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD1XN]], 3
+// CHECK-NEXT: store <2 x double> [[VLD1XN_ELT6]], ptr [[__RET_REPACK5]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(64) [[__RET]], i64 64, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x double>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [4 x <2 x double>] poison, <2 x double> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <2 x double>, ptr [[DOTUNPACK_ELT7]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [4 x <2 x double>] [[TMP0]], <2 x double> [[DOTUNPACK_UNPACK8]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <2 x double>, ptr [[DOTUNPACK_ELT9]], align 16
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [4 x <2 x double>] [[TMP1]], <2 x double> [[DOTUNPACK_UNPACK10]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 48
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <2 x double>, ptr [[DOTUNPACK_ELT11]], align 16
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [4 x <2 x double>] [[TMP2]], <2 x double> [[DOTUNPACK_UNPACK12]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_FLOAT64X2X4_T]] poison, [4 x <2 x double>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_FLOAT64X2X4_T]] [[TMP3]]
+//
float64x2x4_t test_vld1q_f64_x4(float64_t const *a) {
return vld1q_f64_x4(a);
}
-// CHECK-LABEL: @test_vld1q_p64_x4(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x2x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.poly64x2x4_t, align 16
-// CHECK: [[VLD1XN:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x4.v2i64.p0(ptr %a)
-// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD1XN]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.poly64x2x4_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.poly64x2x4_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.poly64x2x4_t @test_vld1q_p64_x4(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY64X2X4_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY64X2X4_T]], align 16
+// CHECK-NEXT: [[VLD1XN:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x4.v2i64.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD1XN_ELT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD1XN]], 0
+// CHECK-NEXT: store <2 x i64> [[VLD1XN_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD1XN_ELT2:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD1XN]], 1
+// CHECK-NEXT: store <2 x i64> [[VLD1XN_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD1XN_ELT4:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD1XN]], 2
+// CHECK-NEXT: store <2 x i64> [[VLD1XN_ELT4]], ptr [[__RET_REPACK3]], align 16
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 48
+// CHECK-NEXT: [[VLD1XN_ELT6:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD1XN]], 3
+// CHECK-NEXT: store <2 x i64> [[VLD1XN_ELT6]], ptr [[__RET_REPACK5]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(64) [[__RET]], i64 64, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [4 x <2 x i64>] poison, <2 x i64> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT7]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [4 x <2 x i64>] [[TMP0]], <2 x i64> [[DOTUNPACK_UNPACK8]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT9]], align 16
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [4 x <2 x i64>] [[TMP1]], <2 x i64> [[DOTUNPACK_UNPACK10]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 48
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT11]], align 16
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [4 x <2 x i64>] [[TMP2]], <2 x i64> [[DOTUNPACK_UNPACK12]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_POLY64X2X4_T]] poison, [4 x <2 x i64>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY64X2X4_T]] [[TMP3]]
+//
poly64x2x4_t test_vld1q_p64_x4(poly64_t const *a) {
return vld1q_p64_x4(a);
}
-// CHECK-LABEL: @test_vld1_f64_x4(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x1x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.float64x1x4_t, align 8
-// CHECK: [[VLD1XN:%.*]] = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld1x4.v1f64.p0(ptr %a)
-// CHECK: store { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD1XN]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.float64x1x4_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.float64x1x4_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.float64x1x4_t @test_vld1_f64_x4(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT64X1X4_T:%.*]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT64X1X4_T]], align 8
+// CHECK-NEXT: [[VLD1XN:%.*]] = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld1x4.v1f64.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD1XN_ELT:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD1XN]], 0
+// CHECK-NEXT: store <1 x double> [[VLD1XN_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD1XN_ELT2:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD1XN]], 1
+// CHECK-NEXT: store <1 x double> [[VLD1XN_ELT2]], ptr [[__RET_REPACK1]], align 8
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD1XN_ELT4:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD1XN]], 2
+// CHECK-NEXT: store <1 x double> [[VLD1XN_ELT4]], ptr [[__RET_REPACK3]], align 8
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 24
+// CHECK-NEXT: [[VLD1XN_ELT6:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD1XN]], 3
+// CHECK-NEXT: store <1 x double> [[VLD1XN_ELT6]], ptr [[__RET_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <1 x double>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [4 x <1 x double>] poison, <1 x double> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <1 x double>, ptr [[DOTUNPACK_ELT7]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [4 x <1 x double>] [[TMP0]], <1 x double> [[DOTUNPACK_UNPACK8]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <1 x double>, ptr [[DOTUNPACK_ELT9]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [4 x <1 x double>] [[TMP1]], <1 x double> [[DOTUNPACK_UNPACK10]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 24
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <1 x double>, ptr [[DOTUNPACK_ELT11]], align 8
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [4 x <1 x double>] [[TMP2]], <1 x double> [[DOTUNPACK_UNPACK12]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_FLOAT64X1X4_T]] poison, [4 x <1 x double>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_FLOAT64X1X4_T]] [[TMP3]]
+//
float64x1x4_t test_vld1_f64_x4(float64_t const *a) {
return vld1_f64_x4(a);
}
-// CHECK-LABEL: @test_vld1_p64_x4(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x1x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.poly64x1x4_t, align 8
-// CHECK: [[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x4.v1i64.p0(ptr %a)
-// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD1XN]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.poly64x1x4_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.poly64x1x4_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.poly64x1x4_t @test_vld1_p64_x4(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY64X1X4_T:%.*]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY64X1X4_T]], align 8
+// CHECK-NEXT: [[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x4.v1i64.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD1XN_ELT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD1XN]], 0
+// CHECK-NEXT: store <1 x i64> [[VLD1XN_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD1XN_ELT2:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD1XN]], 1
+// CHECK-NEXT: store <1 x i64> [[VLD1XN_ELT2]], ptr [[__RET_REPACK1]], align 8
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD1XN_ELT4:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD1XN]], 2
+// CHECK-NEXT: store <1 x i64> [[VLD1XN_ELT4]], ptr [[__RET_REPACK3]], align 8
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 24
+// CHECK-NEXT: [[VLD1XN_ELT6:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD1XN]], 3
+// CHECK-NEXT: store <1 x i64> [[VLD1XN_ELT6]], ptr [[__RET_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <1 x i64>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [4 x <1 x i64>] poison, <1 x i64> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT7]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [4 x <1 x i64>] [[TMP0]], <1 x i64> [[DOTUNPACK_UNPACK8]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT9]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [4 x <1 x i64>] [[TMP1]], <1 x i64> [[DOTUNPACK_UNPACK10]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 24
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT11]], align 8
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [4 x <1 x i64>] [[TMP2]], <1 x i64> [[DOTUNPACK_UNPACK12]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_POLY64X1X4_T]] poison, [4 x <1 x i64>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY64X1X4_T]] [[TMP3]]
+//
poly64x1x4_t test_vld1_p64_x4(poly64_t const *a) {
return vld1_p64_x4(a);
}
-// CHECK-LABEL: @test_vst1q_f64_x2(
-// CHECK: [[B:%.*]] = alloca %struct.float64x2x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float64x2x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x2x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <2 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x double>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x double>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x double>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <2 x double> [[TMP5]] to <16 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double>
-// CHECK: call void @llvm.aarch64.neon.st1x2.v2f64.p0(<2 x double> [[TMP7]], <2 x double> [[TMP8]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1q_f64_x2(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <2 x double>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT64X2X2_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT64X2X2_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <2 x double>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x double> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <2 x double>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x double> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[__S1]], ptr noundef nonnull align 16 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st1x2.v2f64.p0(<2 x double> [[TMP0]], <2 x double> [[TMP1]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst1q_f64_x2(float64_t *a, float64x2x2_t b) {
vst1q_f64_x2(a, b);
}
-// CHECK-LABEL: @test_vst1q_p64_x2(
-// CHECK: [[B:%.*]] = alloca %struct.poly64x2x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly64x2x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x2x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly64x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
-// CHECK: call void @llvm.aarch64.neon.st1x2.v2i64.p0(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1q_p64_x2(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY64X2X2_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY64X2X2_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <2 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <2 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[__S1]], ptr noundef nonnull align 16 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st1x2.v2i64.p0(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst1q_p64_x2(poly64_t *a, poly64x2x2_t b) {
vst1q_p64_x2(a, b);
}
-// CHECK-LABEL: @test_vst1_f64_x2(
-// CHECK: [[B:%.*]] = alloca %struct.float64x1x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float64x1x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x1x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <1 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x1x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x double>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <1 x double>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <1 x double> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x1x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x double>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <1 x double>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <1 x double> [[TMP5]] to <8 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double>
-// CHECK: call void @llvm.aarch64.neon.st1x2.v1f64.p0(<1 x double> [[TMP7]], <1 x double> [[TMP8]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1_f64_x2(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <1 x double>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT64X1X2_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT64X1X2_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <1 x double>] [[B_COERCE]], 0
+// CHECK-NEXT: store <1 x double> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <1 x double>] [[B_COERCE]], 1
+// CHECK-NEXT: store <1 x double> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[__S1]], ptr noundef nonnull align 8 dereferenceable(16) [[B]], i64 16, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x double>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <1 x double>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st1x2.v1f64.p0(<1 x double> [[TMP0]], <1 x double> [[TMP1]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst1_f64_x2(float64_t *a, float64x1x2_t b) {
vst1_f64_x2(a, b);
}
-// CHECK-LABEL: @test_vst1_p64_x2(
-// CHECK: [[B:%.*]] = alloca %struct.poly64x1x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly64x1x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x1x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly64x1x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x1x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
-// CHECK: call void @llvm.aarch64.neon.st1x2.v1i64.p0(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1_p64_x2(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY64X1X2_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY64X1X2_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <1 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <1 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[__S1]], ptr noundef nonnull align 8 dereferenceable(16) [[B]], i64 16, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st1x2.v1i64.p0(<1 x i64> [[TMP0]], <1 x i64> [[TMP1]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst1_p64_x2(poly64_t *a, poly64x1x2_t b) {
vst1_p64_x2(a, b);
}
-// CHECK-LABEL: @test_vst1q_f64_x3(
-// CHECK: [[B:%.*]] = alloca %struct.float64x2x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float64x2x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x2x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <2 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x double>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x double>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x double>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <2 x double> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x double>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <2 x double>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <2 x double> [[TMP7]] to <16 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x double>
-// CHECK: call void @llvm.aarch64.neon.st1x3.v2f64.p0(<2 x double> [[TMP9]], <2 x double> [[TMP10]], <2 x double> [[TMP11]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1q_f64_x3(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <2 x double>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT64X2X3_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT64X2X3_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <2 x double>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x double> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <2 x double>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x double> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <2 x double>] [[B_COERCE]], 2
+// CHECK-NEXT: store <2 x double> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[__S1]], ptr noundef nonnull align 16 dereferenceable(48) [[B]], i64 48, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st1x3.v2f64.p0(<2 x double> [[TMP0]], <2 x double> [[TMP1]], <2 x double> [[TMP2]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst1q_f64_x3(float64_t *a, float64x2x3_t b) {
vst1q_f64_x3(a, b);
}
-// CHECK-LABEL: @test_vst1q_p64_x3(
-// CHECK: [[B:%.*]] = alloca %struct.poly64x2x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly64x2x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x2x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
-// CHECK: call void @llvm.aarch64.neon.st1x3.v2i64.p0(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1q_p64_x3(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY64X2X3_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY64X2X3_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <2 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <2 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <2 x i64>] [[B_COERCE]], 2
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[__S1]], ptr noundef nonnull align 16 dereferenceable(48) [[B]], i64 48, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st1x3.v2i64.p0(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst1q_p64_x3(poly64_t *a, poly64x2x3_t b) {
vst1q_p64_x3(a, b);
}
-// CHECK-LABEL: @test_vst1_f64_x3(
-// CHECK: [[B:%.*]] = alloca %struct.float64x1x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float64x1x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x1x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <1 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x1x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x double>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <1 x double>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <1 x double> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x1x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x double>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <1 x double>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <1 x double> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float64x1x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x double>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <1 x double>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <1 x double> [[TMP7]] to <8 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x double>
-// CHECK: call void @llvm.aarch64.neon.st1x3.v1f64.p0(<1 x double> [[TMP9]], <1 x double> [[TMP10]], <1 x double> [[TMP11]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1_f64_x3(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <1 x double>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT64X1X3_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT64X1X3_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <1 x double>] [[B_COERCE]], 0
+// CHECK-NEXT: store <1 x double> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <1 x double>] [[B_COERCE]], 1
+// CHECK-NEXT: store <1 x double> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <1 x double>] [[B_COERCE]], 2
+// CHECK-NEXT: store <1 x double> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[__S1]], ptr noundef nonnull align 8 dereferenceable(24) [[B]], i64 24, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x double>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <1 x double>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <1 x double>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st1x3.v1f64.p0(<1 x double> [[TMP0]], <1 x double> [[TMP1]], <1 x double> [[TMP2]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst1_f64_x3(float64_t *a, float64x1x3_t b) {
vst1_f64_x3(a, b);
}
-// CHECK-LABEL: @test_vst1_p64_x3(
-// CHECK: [[B:%.*]] = alloca %struct.poly64x1x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly64x1x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x1x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly64x1x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x1x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly64x1x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
-// CHECK: call void @llvm.aarch64.neon.st1x3.v1i64.p0(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1_p64_x3(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY64X1X3_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY64X1X3_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <1 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <1 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <1 x i64>] [[B_COERCE]], 2
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[__S1]], ptr noundef nonnull align 8 dereferenceable(24) [[B]], i64 24, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st1x3.v1i64.p0(<1 x i64> [[TMP0]], <1 x i64> [[TMP1]], <1 x i64> [[TMP2]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst1_p64_x3(poly64_t *a, poly64x1x3_t b) {
vst1_p64_x3(a, b);
}
-// CHECK-LABEL: @test_vst1q_f64_x4(
-// CHECK: [[B:%.*]] = alloca %struct.float64x2x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float64x2x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x2x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <2 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x double>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x double>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x double>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <2 x double> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x double>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <2 x double>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <2 x double> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x double>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <2 x double>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP10:%.*]] = bitcast <2 x double> [[TMP9]] to <16 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x double>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x double>
-// CHECK: call void @llvm.aarch64.neon.st1x4.v2f64.p0(<2 x double> [[TMP11]], <2 x double> [[TMP12]], <2 x double> [[TMP13]], <2 x double> [[TMP14]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1q_f64_x4(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <2 x double>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT64X2X4_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT64X2X4_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <2 x double>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x double> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <2 x double>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x double> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <2 x double>] [[B_COERCE]], 2
+// CHECK-NEXT: store <2 x double> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 48
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <2 x double>] [[B_COERCE]], 3
+// CHECK-NEXT: store <2 x double> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[__S1]], ptr noundef nonnull align 16 dereferenceable(64) [[B]], i64 64, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 48
+// CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st1x4.v2f64.p0(<2 x double> [[TMP0]], <2 x double> [[TMP1]], <2 x double> [[TMP2]], <2 x double> [[TMP3]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst1q_f64_x4(float64_t *a, float64x2x4_t b) {
vst1q_f64_x4(a, b);
}
-// CHECK-LABEL: @test_vst1q_p64_x4(
-// CHECK: [[B:%.*]] = alloca %struct.poly64x2x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly64x2x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x2x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <2 x i64>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64>
-// CHECK: call void @llvm.aarch64.neon.st1x4.v2i64.p0(<2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1q_p64_x4(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY64X2X4_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY64X2X4_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 2
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 48
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 3
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[__S1]], ptr noundef nonnull align 16 dereferenceable(64) [[B]], i64 64, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 48
+// CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st1x4.v2i64.p0(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst1q_p64_x4(poly64_t *a, poly64x2x4_t b) {
vst1q_p64_x4(a, b);
}
-// CHECK-LABEL: @test_vst1_f64_x4(
-// CHECK: [[B:%.*]] = alloca %struct.float64x1x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float64x1x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x1x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <1 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x double>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <1 x double>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <1 x double> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x double>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <1 x double>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <1 x double> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x double>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <1 x double>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <1 x double> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x double>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <1 x double>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <1 x double> [[TMP9]] to <8 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x double>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x double>
-// CHECK: call void @llvm.aarch64.neon.st1x4.v1f64.p0(<1 x double> [[TMP11]], <1 x double> [[TMP12]], <1 x double> [[TMP13]], <1 x double> [[TMP14]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1_f64_x4(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <1 x double>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT64X1X4_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT64X1X4_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <1 x double>] [[B_COERCE]], 0
+// CHECK-NEXT: store <1 x double> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <1 x double>] [[B_COERCE]], 1
+// CHECK-NEXT: store <1 x double> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <1 x double>] [[B_COERCE]], 2
+// CHECK-NEXT: store <1 x double> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 24
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <1 x double>] [[B_COERCE]], 3
+// CHECK-NEXT: store <1 x double> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[__S1]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x double>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <1 x double>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <1 x double>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 24
+// CHECK-NEXT: [[TMP3:%.*]] = load <1 x double>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st1x4.v1f64.p0(<1 x double> [[TMP0]], <1 x double> [[TMP1]], <1 x double> [[TMP2]], <1 x double> [[TMP3]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst1_f64_x4(float64_t *a, float64x1x4_t b) {
vst1_f64_x4(a, b);
}
-// CHECK-LABEL: @test_vst1_p64_x4(
-// CHECK: [[B:%.*]] = alloca %struct.poly64x1x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly64x1x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x1x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <1 x i64>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x i64>
-// CHECK: call void @llvm.aarch64.neon.st1x4.v1i64.p0(<1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1_p64_x4(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY64X1X4_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY64X1X4_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 2
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 24
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 3
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[__S1]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 24
+// CHECK-NEXT: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st1x4.v1i64.p0(<1 x i64> [[TMP0]], <1 x i64> [[TMP1]], <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst1_p64_x4(poly64_t *a, poly64x1x4_t b) {
vst1_p64_x4(a, b);
}
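
// Editorial note, not part of the patch: for readers unfamiliar with the
// _x3/_x4 store forms exercised above, here is a minimal usage sketch
// (hypothetical helper name, assuming an AArch64 target with <arm_neon.h>).
// Unlike vst3q_f64, the _x3 form stores three q-registers to 48 contiguous
// bytes without interleaving.
#include <arm_neon.h>

void store3_f64(float64_t *dst, float64x2_t v0, float64x2_t v1, float64x2_t v2) {
  float64x2x3_t t = {{v0, v1, v2}};  // fills t.val[0], t.val[1], t.val[2]
  vst1q_f64_x3(dst, t);              // single non-interleaving multi-vector store
}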
-// CHECK-LABEL: @test_vceqd_s64(
-// CHECK: [[TMP0:%.*]] = icmp eq i64 %a, %b
-// CHECK: [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64
-// CHECK: ret i64 [[VCEQD_I]]
+// CHECK-LABEL: define dso_local i64 @test_vceqd_s64(
+// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = icmp eq i64 [[A]], [[B]]
+// CHECK-NEXT: [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64
+// CHECK-NEXT: ret i64 [[VCEQD_I]]
+//
uint64_t test_vceqd_s64(int64_t a, int64_t b) {
return (uint64_t)vceqd_s64(a, b);
}
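
// Editorial note, not part of the patch: the scalar compares in this group
// return a 64-bit mask rather than a bool -- all ones when the predicate
// holds, all zeros otherwise -- which is why the checks show sext of an i1.
// A hedged illustration (hypothetical helper name):
#include <arm_neon.h>

uint64_t is_equal_mask(int64_t x, int64_t y) {
  return vceqd_s64(x, y);  // 0xFFFFFFFFFFFFFFFF if x == y, else 0
}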
-// CHECK-LABEL: @test_vceqd_u64(
-// CHECK: [[TMP0:%.*]] = icmp eq i64 %a, %b
-// CHECK: [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64
-// CHECK: ret i64 [[VCEQD_I]]
+// CHECK-LABEL: define dso_local i64 @test_vceqd_u64(
+// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = icmp eq i64 [[A]], [[B]]
+// CHECK-NEXT: [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64
+// CHECK-NEXT: ret i64 [[VCEQD_I]]
+//
uint64_t test_vceqd_u64(uint64_t a, uint64_t b) {
return (int64_t)vceqd_u64(a, b);
}
-// CHECK-LABEL: @test_vceqzd_s64(
-// CHECK: [[TMP0:%.*]] = icmp eq i64 %a, 0
-// CHECK: [[VCEQZ_I:%.*]] = sext i1 [[TMP0]] to i64
-// CHECK: ret i64 [[VCEQZ_I]]
+// CHECK-LABEL: define dso_local i64 @test_vceqzd_s64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = icmp eq i64 [[A]], 0
+// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext i1 [[TMP0]] to i64
+// CHECK-NEXT: ret i64 [[VCEQZ_I]]
+//
uint64_t test_vceqzd_s64(int64_t a) {
return (uint64_t)vceqzd_s64(a);
}
-// CHECK-LABEL: @test_vceqzd_u64(
-// CHECK: [[TMP0:%.*]] = icmp eq i64 %a, 0
-// CHECK: [[VCEQZD_I:%.*]] = sext i1 [[TMP0]] to i64
-// CHECK: ret i64 [[VCEQZD_I]]
+// CHECK-LABEL: define dso_local i64 @test_vceqzd_u64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = icmp eq i64 [[A]], 0
+// CHECK-NEXT: [[VCEQZD_I:%.*]] = sext i1 [[TMP0]] to i64
+// CHECK-NEXT: ret i64 [[VCEQZD_I]]
+//
int64_t test_vceqzd_u64(int64_t a) {
return (int64_t)vceqzd_u64(a);
}
-// CHECK-LABEL: @test_vcged_s64(
-// CHECK: [[TMP0:%.*]] = icmp sge i64 %a, %b
-// CHECK: [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64
-// CHECK: ret i64 [[VCEQD_I]]
+// CHECK-LABEL: define dso_local i64 @test_vcged_s64(
+// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = icmp sge i64 [[A]], [[B]]
+// CHECK-NEXT: [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64
+// CHECK-NEXT: ret i64 [[VCEQD_I]]
+//
uint64_t test_vcged_s64(int64_t a, int64_t b) {
return (uint64_t)vcged_s64(a, b);
}
-// CHECK-LABEL: @test_vcged_u64(
-// CHECK: [[TMP0:%.*]] = icmp uge i64 %a, %b
-// CHECK: [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64
-// CHECK: ret i64 [[VCEQD_I]]
+// CHECK-LABEL: define dso_local i64 @test_vcged_u64(
+// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = icmp uge i64 [[A]], [[B]]
+// CHECK-NEXT: [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64
+// CHECK-NEXT: ret i64 [[VCEQD_I]]
+//
uint64_t test_vcged_u64(uint64_t a, uint64_t b) {
return (uint64_t)vcged_u64(a, b);
}
-// CHECK-LABEL: @test_vcgezd_s64(
-// CHECK: [[TMP0:%.*]] = icmp sge i64 %a, 0
-// CHECK: [[VCGEZ_I:%.*]] = sext i1 [[TMP0]] to i64
-// CHECK: ret i64 [[VCGEZ_I]]
+// CHECK-LABEL: define dso_local i64 @test_vcgezd_s64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = icmp sgt i64 [[A]], -1
+// CHECK-NEXT: [[VCGEZ_I:%.*]] = sext i1 [[TMP0]] to i64
+// CHECK-NEXT: ret i64 [[VCGEZ_I]]
+//
uint64_t test_vcgezd_s64(int64_t a) {
return (uint64_t)vcgezd_s64(a);
}
-// CHECK-LABEL: @test_vcgtd_s64(
-// CHECK: [[TMP0:%.*]] = icmp sgt i64 %a, %b
-// CHECK: [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64
-// CHECK: ret i64 [[VCEQD_I]]
+// CHECK-LABEL: define dso_local i64 @test_vcgtd_s64(
+// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = icmp sgt i64 [[A]], [[B]]
+// CHECK-NEXT: [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64
+// CHECK-NEXT: ret i64 [[VCEQD_I]]
+//
uint64_t test_vcgtd_s64(int64_t a, int64_t b) {
return (uint64_t)vcgtd_s64(a, b);
}
-// CHECK-LABEL: @test_vcgtd_u64(
-// CHECK: [[TMP0:%.*]] = icmp ugt i64 %a, %b
-// CHECK: [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64
-// CHECK: ret i64 [[VCEQD_I]]
+// CHECK-LABEL: define dso_local i64 @test_vcgtd_u64(
+// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = icmp ugt i64 [[A]], [[B]]
+// CHECK-NEXT: [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64
+// CHECK-NEXT: ret i64 [[VCEQD_I]]
+//
uint64_t test_vcgtd_u64(uint64_t a, uint64_t b) {
return (uint64_t)vcgtd_u64(a, b);
}
-// CHECK-LABEL: @test_vcgtzd_s64(
-// CHECK: [[TMP0:%.*]] = icmp sgt i64 %a, 0
-// CHECK: [[VCGTZ_I:%.*]] = sext i1 [[TMP0]] to i64
-// CHECK: ret i64 [[VCGTZ_I]]
+// CHECK-LABEL: define dso_local i64 @test_vcgtzd_s64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = icmp sgt i64 [[A]], 0
+// CHECK-NEXT: [[VCGTZ_I:%.*]] = sext i1 [[TMP0]] to i64
+// CHECK-NEXT: ret i64 [[VCGTZ_I]]
+//
uint64_t test_vcgtzd_s64(int64_t a) {
return (uint64_t)vcgtzd_s64(a);
}
-// CHECK-LABEL: @test_vcled_s64(
-// CHECK: [[TMP0:%.*]] = icmp sle i64 %a, %b
-// CHECK: [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64
-// CHECK: ret i64 [[VCEQD_I]]
+// CHECK-LABEL: define dso_local i64 @test_vcled_s64(
+// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = icmp sle i64 [[A]], [[B]]
+// CHECK-NEXT: [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64
+// CHECK-NEXT: ret i64 [[VCEQD_I]]
+//
uint64_t test_vcled_s64(int64_t a, int64_t b) {
return (uint64_t)vcled_s64(a, b);
}
-// CHECK-LABEL: @test_vcled_u64(
-// CHECK: [[TMP0:%.*]] = icmp ule i64 %a, %b
-// CHECK: [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64
-// CHECK: ret i64 [[VCEQD_I]]
+// CHECK-LABEL: define dso_local i64 @test_vcled_u64(
+// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = icmp ule i64 [[A]], [[B]]
+// CHECK-NEXT: [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64
+// CHECK-NEXT: ret i64 [[VCEQD_I]]
+//
uint64_t test_vcled_u64(uint64_t a, uint64_t b) {
return (uint64_t)vcled_u64(a, b);
}
-// CHECK-LABEL: @test_vclezd_s64(
-// CHECK: [[TMP0:%.*]] = icmp sle i64 %a, 0
-// CHECK: [[VCLEZ_I:%.*]] = sext i1 [[TMP0]] to i64
-// CHECK: ret i64 [[VCLEZ_I]]
+// CHECK-LABEL: define dso_local i64 @test_vclezd_s64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = icmp slt i64 [[A]], 1
+// CHECK-NEXT: [[VCLEZ_I:%.*]] = sext i1 [[TMP0]] to i64
+// CHECK-NEXT: ret i64 [[VCLEZ_I]]
+//
uint64_t test_vclezd_s64(int64_t a) {
return (uint64_t)vclezd_s64(a);
}
-// CHECK-LABEL: @test_vcltd_s64(
-// CHECK: [[TMP0:%.*]] = icmp slt i64 %a, %b
-// CHECK: [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64
-// CHECK: ret i64 [[VCEQD_I]]
+// CHECK-LABEL: define dso_local i64 @test_vcltd_s64(
+// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = icmp slt i64 [[A]], [[B]]
+// CHECK-NEXT: [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64
+// CHECK-NEXT: ret i64 [[VCEQD_I]]
+//
uint64_t test_vcltd_s64(int64_t a, int64_t b) {
return (uint64_t)vcltd_s64(a, b);
}
-// CHECK-LABEL: @test_vcltd_u64(
-// CHECK: [[TMP0:%.*]] = icmp ult i64 %a, %b
-// CHECK: [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64
-// CHECK: ret i64 [[VCEQD_I]]
+// CHECK-LABEL: define dso_local i64 @test_vcltd_u64(
+// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = icmp ult i64 [[A]], [[B]]
+// CHECK-NEXT: [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64
+// CHECK-NEXT: ret i64 [[VCEQD_I]]
+//
uint64_t test_vcltd_u64(uint64_t a, uint64_t b) {
return (uint64_t)vcltd_u64(a, b);
}
-// CHECK-LABEL: @test_vcltzd_s64(
-// CHECK: [[TMP0:%.*]] = icmp slt i64 %a, 0
-// CHECK: [[VCLTZ_I:%.*]] = sext i1 [[TMP0]] to i64
-// CHECK: ret i64 [[VCLTZ_I]]
+// CHECK-LABEL: define dso_local i64 @test_vcltzd_s64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[A_LOBIT:%.*]] = ashr i64 [[A]], 63
+// CHECK-NEXT: ret i64 [[A_LOBIT]]
+//
uint64_t test_vcltzd_s64(int64_t a) {
return (uint64_t)vcltzd_s64(a);
}
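
// Editorial note, not part of the patch: the regenerated checks above pick up
// the optimizer's canonical forms -- `a >= 0` is matched as `icmp sgt a, -1`,
// `a <= 0` as `icmp slt a, 1`, and vcltzd's sext(a < 0) becomes an arithmetic
// shift of the sign bit. A minimal C sketch of that last identity (assuming
// the usual arithmetic right shift on negative signed values):
#include <stdint.h>

uint64_t ltz_mask(int64_t a) {
  uint64_t via_cmp  = (uint64_t)-(int64_t)(a < 0);  // sext(i1) form in the old checks
  uint64_t via_ashr = (uint64_t)(a >> 63);          // ashr form in the new checks
  return via_cmp & via_ashr;                        // the two are equal for every a
}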
-// CHECK-LABEL: @test_vtstd_s64(
-// CHECK: [[TMP0:%.*]] = and i64 %a, %b
-// CHECK: [[TMP1:%.*]] = icmp ne i64 [[TMP0]], 0
-// CHECK: [[VTSTD_I:%.*]] = sext i1 [[TMP1]] to i64
-// CHECK: ret i64 [[VTSTD_I]]
+// CHECK-LABEL: define dso_local i64 @test_vtstd_s64(
+// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = and i64 [[A]], [[B]]
+// CHECK-NEXT: [[TMP1:%.*]] = icmp ne i64 [[TMP0]], 0
+// CHECK-NEXT: [[VTSTD_I:%.*]] = sext i1 [[TMP1]] to i64
+// CHECK-NEXT: ret i64 [[VTSTD_I]]
+//
uint64_t test_vtstd_s64(int64_t a, int64_t b) {
return (uint64_t)vtstd_s64(a, b);
}
-// CHECK-LABEL: @test_vtstd_u64(
-// CHECK: [[TMP0:%.*]] = and i64 %a, %b
-// CHECK: [[TMP1:%.*]] = icmp ne i64 [[TMP0]], 0
-// CHECK: [[VTSTD_I:%.*]] = sext i1 [[TMP1]] to i64
-// CHECK: ret i64 [[VTSTD_I]]
+// CHECK-LABEL: define dso_local i64 @test_vtstd_u64(
+// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = and i64 [[A]], [[B]]
+// CHECK-NEXT: [[TMP1:%.*]] = icmp ne i64 [[TMP0]], 0
+// CHECK-NEXT: [[VTSTD_I:%.*]] = sext i1 [[TMP1]] to i64
+// CHECK-NEXT: ret i64 [[VTSTD_I]]
+//
uint64_t test_vtstd_u64(uint64_t a, uint64_t b) {
return (uint64_t)vtstd_u64(a, b);
}
-// CHECK-LABEL: @test_vabsd_s64(
-// CHECK: [[VABSD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.abs.i64(i64 %a)
-// CHECK: ret i64 [[VABSD_S64_I]]
+// CHECK-LABEL: define dso_local i64 @test_vabsd_s64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABSD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.abs.i64(i64 [[A]])
+// CHECK-NEXT: ret i64 [[VABSD_S64_I]]
+//
int64_t test_vabsd_s64(int64_t a) {
return (int64_t)vabsd_s64(a);
}
-// CHECK-LABEL: @test_vqabsb_s8(
-// CHECK: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 %a, i64 0
-// CHECK: [[VQABSB_S8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqabs.v8i8(<8 x i8> [[TMP0]])
-// CHECK: [[TMP1:%.*]] = extractelement <8 x i8> [[VQABSB_S8_I]], i64 0
-// CHECK: ret i8 [[TMP1]]
+// CHECK-LABEL: define dso_local i8 @test_vqabsb_s8(
+// CHECK-SAME: i8 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 [[A]], i64 0
+// CHECK-NEXT: [[VQABSB_S8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqabs.v8i8(<8 x i8> [[TMP0]])
+// CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[VQABSB_S8_I]], i64 0
+// CHECK-NEXT: ret i8 [[TMP1]]
+//
int8_t test_vqabsb_s8(int8_t a) {
return (int8_t)vqabsb_s8(a);
}
-// CHECK-LABEL: @test_vqabsh_s16(
-// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 %a, i64 0
-// CHECK: [[VQABSH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqabs.v4i16(<4 x i16> [[TMP0]])
-// CHECK: [[TMP1:%.*]] = extractelement <4 x i16> [[VQABSH_S16_I]], i64 0
-// CHECK: ret i16 [[TMP1]]
+// CHECK-LABEL: define dso_local i16 @test_vqabsh_s16(
+// CHECK-SAME: i16 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[VQABSH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqabs.v4i16(<4 x i16> [[TMP0]])
+// CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[VQABSH_S16_I]], i64 0
+// CHECK-NEXT: ret i16 [[TMP1]]
+//
int16_t test_vqabsh_s16(int16_t a) {
return (int16_t)vqabsh_s16(a);
}
-// CHECK-LABEL: @test_vqabss_s32(
-// CHECK: [[VQABSS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqabs.i32(i32 %a)
-// CHECK: ret i32 [[VQABSS_S32_I]]
+// CHECK-LABEL: define dso_local i32 @test_vqabss_s32(
+// CHECK-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQABSS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqabs.i32(i32 [[A]])
+// CHECK-NEXT: ret i32 [[VQABSS_S32_I]]
+//
int32_t test_vqabss_s32(int32_t a) {
return (int32_t)vqabss_s32(a);
}
-// CHECK-LABEL: @test_vqabsd_s64(
-// CHECK: [[VQABSD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.sqabs.i64(i64 %a)
-// CHECK: ret i64 [[VQABSD_S64_I]]
+// CHECK-LABEL: define dso_local i64 @test_vqabsd_s64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQABSD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.sqabs.i64(i64 [[A]])
+// CHECK-NEXT: ret i64 [[VQABSD_S64_I]]
+//
int64_t test_vqabsd_s64(int64_t a) {
return (int64_t)vqabsd_s64(a);
}
-// CHECK-LABEL: @test_vnegd_s64(
-// CHECK: [[VNEGD_I:%.*]] = sub i64 0, %a
-// CHECK: ret i64 [[VNEGD_I]]
+// CHECK-LABEL: define dso_local i64 @test_vnegd_s64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VNEGD_I:%.*]] = sub i64 0, [[A]]
+// CHECK-NEXT: ret i64 [[VNEGD_I]]
+//
int64_t test_vnegd_s64(int64_t a) {
return (int64_t)vnegd_s64(a);
}
-// CHECK-LABEL: @test_vqnegb_s8(
-// CHECK: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 %a, i64 0
-// CHECK: [[VQNEGB_S8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqneg.v8i8(<8 x i8> [[TMP0]])
-// CHECK: [[TMP1:%.*]] = extractelement <8 x i8> [[VQNEGB_S8_I]], i64 0
-// CHECK: ret i8 [[TMP1]]
+// CHECK-LABEL: define dso_local i8 @test_vqnegb_s8(
+// CHECK-SAME: i8 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 [[A]], i64 0
+// CHECK-NEXT: [[VQNEGB_S8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqneg.v8i8(<8 x i8> [[TMP0]])
+// CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[VQNEGB_S8_I]], i64 0
+// CHECK-NEXT: ret i8 [[TMP1]]
+//
int8_t test_vqnegb_s8(int8_t a) {
return (int8_t)vqnegb_s8(a);
}
-// CHECK-LABEL: @test_vqnegh_s16(
-// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 %a, i64 0
-// CHECK: [[VQNEGH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqneg.v4i16(<4 x i16> [[TMP0]])
-// CHECK: [[TMP1:%.*]] = extractelement <4 x i16> [[VQNEGH_S16_I]], i64 0
-// CHECK: ret i16 [[TMP1]]
+// CHECK-LABEL: define dso_local i16 @test_vqnegh_s16(
+// CHECK-SAME: i16 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[VQNEGH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqneg.v4i16(<4 x i16> [[TMP0]])
+// CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[VQNEGH_S16_I]], i64 0
+// CHECK-NEXT: ret i16 [[TMP1]]
+//
int16_t test_vqnegh_s16(int16_t a) {
return (int16_t)vqnegh_s16(a);
}
-// CHECK-LABEL: @test_vqnegs_s32(
-// CHECK: [[VQNEGS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqneg.i32(i32 %a)
-// CHECK: ret i32 [[VQNEGS_S32_I]]
+// CHECK-LABEL: define dso_local i32 @test_vqnegs_s32(
+// CHECK-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQNEGS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqneg.i32(i32 [[A]])
+// CHECK-NEXT: ret i32 [[VQNEGS_S32_I]]
+//
int32_t test_vqnegs_s32(int32_t a) {
return (int32_t)vqnegs_s32(a);
}
-// CHECK-LABEL: @test_vqnegd_s64(
-// CHECK: [[VQNEGD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.sqneg.i64(i64 %a)
-// CHECK: ret i64 [[VQNEGD_S64_I]]
+// CHECK-LABEL: define dso_local i64 @test_vqnegd_s64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQNEGD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.sqneg.i64(i64 [[A]])
+// CHECK-NEXT: ret i64 [[VQNEGD_S64_I]]
+//
int64_t test_vqnegd_s64(int64_t a) {
return (int64_t)vqnegd_s64(a);
}
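
// Editorial note, not part of the patch: an illustrative corner case showing
// why the saturating "q" forms exist alongside plain vnegd/vabsd (hypothetical
// helper name; the INT64_MIN input is the one value where they differ):
#include <arm_neon.h>
#include <stdint.h>

int64_t demo_qneg(void) {
  return vqnegd_s64(INT64_MIN);  // saturates to INT64_MAX; vnegd_s64 would wrap
}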
-// CHECK-LABEL: @test_vuqaddb_s8(
-// CHECK: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 %a, i64 0
-// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 %b, i64 0
-// CHECK: [[VUQADDB_S8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.suqadd.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
-// CHECK: [[TMP2:%.*]] = extractelement <8 x i8> [[VUQADDB_S8_I]], i64 0
-// CHECK: ret i8 [[TMP2]]
+// CHECK-LABEL: define dso_local i8 @test_vuqaddb_s8(
+// CHECK-SAME: i8 noundef [[A:%.*]], i8 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 [[A]], i64 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[B]], i64 0
+// CHECK-NEXT: [[VUQADDB_S8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.suqadd.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i8> [[VUQADDB_S8_I]], i64 0
+// CHECK-NEXT: ret i8 [[TMP2]]
+//
int8_t test_vuqaddb_s8(int8_t a, uint8_t b) {
return (int8_t)vuqaddb_s8(a, b);
}
-// CHECK-LABEL: @test_vuqaddh_s16(
-// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 %a, i64 0
-// CHECK: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 %b, i64 0
-// CHECK: [[VUQADDH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.suqadd.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
-// CHECK: [[TMP2:%.*]] = extractelement <4 x i16> [[VUQADDH_S16_I]], i64 0
-// CHECK: ret i16 [[TMP2]]
+// CHECK-LABEL: define dso_local i16 @test_vuqaddh_s16(
+// CHECK-SAME: i16 noundef [[A:%.*]], i16 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i64 0
+// CHECK-NEXT: [[VUQADDH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.suqadd.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[VUQADDH_S16_I]], i64 0
+// CHECK-NEXT: ret i16 [[TMP2]]
+//
int16_t test_vuqaddh_s16(int16_t a, uint16_t b) {
return (int16_t)vuqaddh_s16(a, b);
}
-// CHECK-LABEL: @test_vuqadds_s32(
-// CHECK: [[VUQADDS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.suqadd.i32(i32 %a, i32 %b)
-// CHECK: ret i32 [[VUQADDS_S32_I]]
+// CHECK-LABEL: define dso_local i32 @test_vuqadds_s32(
+// CHECK-SAME: i32 noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VUQADDS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.suqadd.i32(i32 [[A]], i32 [[B]])
+// CHECK-NEXT: ret i32 [[VUQADDS_S32_I]]
+//
int32_t test_vuqadds_s32(int32_t a, uint32_t b) {
return (int32_t)vuqadds_s32(a, b);
}
-// CHECK-LABEL: @test_vuqaddd_s64(
-// CHECK: [[VUQADDD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.suqadd.i64(i64 %a, i64 %b)
-// CHECK: ret i64 [[VUQADDD_S64_I]]
+// CHECK-LABEL: define dso_local i64 @test_vuqaddd_s64(
+// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VUQADDD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.suqadd.i64(i64 [[A]], i64 [[B]])
+// CHECK-NEXT: ret i64 [[VUQADDD_S64_I]]
+//
int64_t test_vuqaddd_s64(int64_t a, uint64_t b) {
return (int64_t)vuqaddd_s64(a, b);
}
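
// Editorial note, not part of the patch: a hedged sketch of the mixed-sign
// saturating add tested above -- suqadd accumulates an unsigned addend into a
// signed value with signed saturation (hypothetical helper name):
#include <arm_neon.h>
#include <stdint.h>

int8_t demo_suqadd(void) {
  return vuqaddb_s8(INT8_MAX, (uint8_t)200);  // 127 + 200 pins at 127 instead of wrapping
}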
-// CHECK-LABEL: @test_vsqaddb_u8(
-// CHECK: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 %a, i64 0
-// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 %b, i64 0
-// CHECK: [[VSQADDB_U8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.usqadd.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
-// CHECK: [[TMP2:%.*]] = extractelement <8 x i8> [[VSQADDB_U8_I]], i64 0
-// CHECK: ret i8 [[TMP2]]
+// CHECK-LABEL: define dso_local i8 @test_vsqaddb_u8(
+// CHECK-SAME: i8 noundef [[A:%.*]], i8 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 [[A]], i64 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[B]], i64 0
+// CHECK-NEXT: [[VSQADDB_U8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.usqadd.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i8> [[VSQADDB_U8_I]], i64 0
+// CHECK-NEXT: ret i8 [[TMP2]]
+//
uint8_t test_vsqaddb_u8(uint8_t a, int8_t b) {
return (uint8_t)vsqaddb_u8(a, b);
}
-// CHECK-LABEL: @test_vsqaddh_u16(
-// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 %a, i64 0
-// CHECK: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 %b, i64 0
-// CHECK: [[VSQADDH_U16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.usqadd.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
-// CHECK: [[TMP2:%.*]] = extractelement <4 x i16> [[VSQADDH_U16_I]], i64 0
-// CHECK: ret i16 [[TMP2]]
+// CHECK-LABEL: define dso_local i16 @test_vsqaddh_u16(
+// CHECK-SAME: i16 noundef [[A:%.*]], i16 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i64 0
+// CHECK-NEXT: [[VSQADDH_U16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.usqadd.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[VSQADDH_U16_I]], i64 0
+// CHECK-NEXT: ret i16 [[TMP2]]
+//
uint16_t test_vsqaddh_u16(uint16_t a, int16_t b) {
return (uint16_t)vsqaddh_u16(a, b);
}
-// CHECK-LABEL: @test_vsqadds_u32(
-// CHECK: [[VSQADDS_U32_I:%.*]] = call i32 @llvm.aarch64.neon.usqadd.i32(i32 %a, i32 %b)
-// CHECK: ret i32 [[VSQADDS_U32_I]]
+// CHECK-LABEL: define dso_local i32 @test_vsqadds_u32(
+// CHECK-SAME: i32 noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSQADDS_U32_I:%.*]] = call i32 @llvm.aarch64.neon.usqadd.i32(i32 [[A]], i32 [[B]])
+// CHECK-NEXT: ret i32 [[VSQADDS_U32_I]]
+//
uint32_t test_vsqadds_u32(uint32_t a, int32_t b) {
return (uint32_t)vsqadds_u32(a, b);
}
-// CHECK-LABEL: @test_vsqaddd_u64(
-// CHECK: [[VSQADDD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.usqadd.i64(i64 %a, i64 %b)
-// CHECK: ret i64 [[VSQADDD_U64_I]]
+// CHECK-LABEL: define dso_local i64 @test_vsqaddd_u64(
+// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSQADDD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.usqadd.i64(i64 [[A]], i64 [[B]])
+// CHECK-NEXT: ret i64 [[VSQADDD_U64_I]]
+//
uint64_t test_vsqaddd_u64(uint64_t a, int64_t b) {
return (uint64_t)vsqaddd_u64(a, b);
}
-// CHECK-LABEL: @test_vqdmlalh_s16(
-// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 %b, i64 0
-// CHECK: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 %c, i64 0
-// CHECK: [[VQDMLXL_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
-// CHECK: [[LANE0_I:%.*]] = extractelement <4 x i32> [[VQDMLXL_I]], i64 0
-// CHECK: [[VQDMLXL1_I:%.*]] = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %a, i32 [[LANE0_I]])
-// CHECK: ret i32 [[VQDMLXL1_I]]
+// CHECK-LABEL: define dso_local i32 @test_vqdmlalh_s16(
+// CHECK-SAME: i32 noundef [[A:%.*]], i16 noundef [[B:%.*]], i16 noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i64 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[C]], i64 0
+// CHECK-NEXT: [[VQDMLXL_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// CHECK-NEXT: [[LANE0_I:%.*]] = extractelement <4 x i32> [[VQDMLXL_I]], i64 0
+// CHECK-NEXT: [[VQDMLXL1_I:%.*]] = call i32 @llvm.aarch64.neon.sqadd.i32(i32 [[A]], i32 [[LANE0_I]])
+// CHECK-NEXT: ret i32 [[VQDMLXL1_I]]
+//
int32_t test_vqdmlalh_s16(int32_t a, int16_t b, int16_t c) {
return (int32_t)vqdmlalh_s16(a, b, c);
}
-// CHECK-LABEL: @test_vqdmlals_s32(
-// CHECK: [[VQDMLXL_I:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %b, i32 %c)
-// CHECK: [[VQDMLXL1_I:%.*]] = call i64 @llvm.aarch64.neon.sqadd.i64(i64 %a, i64 [[VQDMLXL_I]])
-// CHECK: ret i64 [[VQDMLXL1_I]]
+// CHECK-LABEL: define dso_local i64 @test_vqdmlals_s32(
+// CHECK-SAME: i64 noundef [[A:%.*]], i32 noundef [[B:%.*]], i32 noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQDMLXL_I:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 [[B]], i32 [[C]])
+// CHECK-NEXT: [[VQDMLXL1_I:%.*]] = call i64 @llvm.aarch64.neon.sqadd.i64(i64 [[A]], i64 [[VQDMLXL_I]])
+// CHECK-NEXT: ret i64 [[VQDMLXL1_I]]
+//
int64_t test_vqdmlals_s32(int64_t a, int32_t b, int32_t c) {
return (int64_t)vqdmlals_s32(a, b, c);
}
-// CHECK-LABEL: @test_vqdmlslh_s16(
-// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 %b, i64 0
-// CHECK: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 %c, i64 0
-// CHECK: [[VQDMLXL_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
-// CHECK: [[LANE0_I:%.*]] = extractelement <4 x i32> [[VQDMLXL_I]], i64 0
-// CHECK: [[VQDMLXL1_I:%.*]] = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %a, i32 [[LANE0_I]])
-// CHECK: ret i32 [[VQDMLXL1_I]]
+// CHECK-LABEL: define dso_local i32 @test_vqdmlslh_s16(
+// CHECK-SAME: i32 noundef [[A:%.*]], i16 noundef [[B:%.*]], i16 noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i64 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[C]], i64 0
+// CHECK-NEXT: [[VQDMLXL_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// CHECK-NEXT: [[LANE0_I:%.*]] = extractelement <4 x i32> [[VQDMLXL_I]], i64 0
+// CHECK-NEXT: [[VQDMLXL1_I:%.*]] = call i32 @llvm.aarch64.neon.sqsub.i32(i32 [[A]], i32 [[LANE0_I]])
+// CHECK-NEXT: ret i32 [[VQDMLXL1_I]]
+//
int32_t test_vqdmlslh_s16(int32_t a, int16_t b, int16_t c) {
return (int32_t)vqdmlslh_s16(a, b, c);
}
-// CHECK-LABEL: @test_vqdmlsls_s32(
-// CHECK: [[VQDMLXL_I:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %b, i32 %c)
-// CHECK: [[VQDMLXL1_I:%.*]] = call i64 @llvm.aarch64.neon.sqsub.i64(i64 %a, i64 [[VQDMLXL_I]])
-// CHECK: ret i64 [[VQDMLXL1_I]]
+// CHECK-LABEL: define dso_local i64 @test_vqdmlsls_s32(
+// CHECK-SAME: i64 noundef [[A:%.*]], i32 noundef [[B:%.*]], i32 noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQDMLXL_I:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 [[B]], i32 [[C]])
+// CHECK-NEXT: [[VQDMLXL1_I:%.*]] = call i64 @llvm.aarch64.neon.sqsub.i64(i64 [[A]], i64 [[VQDMLXL_I]])
+// CHECK-NEXT: ret i64 [[VQDMLXL1_I]]
+//
int64_t test_vqdmlsls_s32(int64_t a, int32_t b, int32_t c) {
return (int64_t)vqdmlsls_s32(a, b, c);
}
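
// Editorial note, not part of the patch: hand-written reference semantics for
// the scalar saturating doubling multiply-accumulate (a sketch, not the
// compiler's lowering). Both steps saturate independently, matching the
// sqdmull + sqadd pair in the checks above:
#include <stdint.h>

static int32_t sat32(int64_t v) {
  return v > INT32_MAX ? INT32_MAX : v < INT32_MIN ? INT32_MIN : (int32_t)v;
}

int32_t qdmlalh_ref(int32_t a, int16_t b, int16_t c) {
  int32_t prod = sat32(2 * (int64_t)b * (int64_t)c);  // sqdmull: saturating doubling multiply
  return sat32((int64_t)a + prod);                    // sqadd: saturating accumulate
}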
-// CHECK-LABEL: @test_vqdmullh_s16(
-// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 %a, i64 0
-// CHECK: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 %b, i64 0
-// CHECK: [[VQDMULLH_S16_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
-// CHECK: [[TMP2:%.*]] = extractelement <4 x i32> [[VQDMULLH_S16_I]], i64 0
-// CHECK: ret i32 [[TMP2]]
+// CHECK-LABEL: define dso_local i32 @test_vqdmullh_s16(
+// CHECK-SAME: i16 noundef [[A:%.*]], i16 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i64 0
+// CHECK-NEXT: [[VQDMULLH_S16_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[VQDMULLH_S16_I]], i64 0
+// CHECK-NEXT: ret i32 [[TMP2]]
+//
int32_t test_vqdmullh_s16(int16_t a, int16_t b) {
return (int32_t)vqdmullh_s16(a, b);
}
-// CHECK-LABEL: @test_vqdmulls_s32(
-// CHECK: [[VQDMULLS_S32_I:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %a, i32 %b)
-// CHECK: ret i64 [[VQDMULLS_S32_I]]
+// CHECK-LABEL: define dso_local i64 @test_vqdmulls_s32(
+// CHECK-SAME: i32 noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQDMULLS_S32_I:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 [[A]], i32 [[B]])
+// CHECK-NEXT: ret i64 [[VQDMULLS_S32_I]]
+//
int64_t test_vqdmulls_s32(int32_t a, int32_t b) {
return (int64_t)vqdmulls_s32(a, b);
}
-// CHECK-LABEL: @test_vqmovunh_s16(
-// CHECK: [[TMP0:%.*]] = insertelement <8 x i16> poison, i16 %a, i64 0
-// CHECK: [[VQMOVUNH_S16_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqxtun.v8i8(<8 x i16> [[TMP0]])
-// CHECK: [[TMP1:%.*]] = extractelement <8 x i8> [[VQMOVUNH_S16_I]], i64 0
-// CHECK: ret i8 [[TMP1]]
+// CHECK-LABEL: define dso_local i8 @test_vqmovunh_s16(
+// CHECK-SAME: i16 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[VQMOVUNH_S16_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqxtun.v8i8(<8 x i16> [[TMP0]])
+// CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[VQMOVUNH_S16_I]], i64 0
+// CHECK-NEXT: ret i8 [[TMP1]]
+//
uint8_t test_vqmovunh_s16(int16_t a) {
return (uint8_t)vqmovunh_s16(a);
}
-// CHECK-LABEL: @test_vqmovuns_s32(
-// CHECK: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 %a, i64 0
-// CHECK: [[VQMOVUNS_S32_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqxtun.v4i16(<4 x i32> [[TMP0]])
-// CHECK: [[TMP1:%.*]] = extractelement <4 x i16> [[VQMOVUNS_S32_I]], i64 0
-// CHECK: ret i16 [[TMP1]]
+// CHECK-LABEL: define dso_local i16 @test_vqmovuns_s32(
+// CHECK-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i64 0
+// CHECK-NEXT: [[VQMOVUNS_S32_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqxtun.v4i16(<4 x i32> [[TMP0]])
+// CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[VQMOVUNS_S32_I]], i64 0
+// CHECK-NEXT: ret i16 [[TMP1]]
+//
uint16_t test_vqmovuns_s32(int32_t a) {
return (uint16_t)vqmovuns_s32(a);
}
-// CHECK-LABEL: @test_vqmovund_s64(
-// CHECK: [[VQMOVUND_S64_I:%.*]] = call i32 @llvm.aarch64.neon.scalar.sqxtun.i32.i64(i64 %a)
-// CHECK: ret i32 [[VQMOVUND_S64_I]]
+// CHECK-LABEL: define dso_local i32 @test_vqmovund_s64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQMOVUND_S64_I:%.*]] = call i32 @llvm.aarch64.neon.scalar.sqxtun.i32.i64(i64 [[A]])
+// CHECK-NEXT: ret i32 [[VQMOVUND_S64_I]]
+//
uint32_t test_vqmovund_s64(int64_t a) {
return (uint32_t)vqmovund_s64(a);
}
-// CHECK-LABEL: @test_vqmovnh_s16(
-// CHECK: [[TMP0:%.*]] = insertelement <8 x i16> poison, i16 %a, i64 0
-// CHECK: [[VQMOVNH_S16_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqxtn.v8i8(<8 x i16> [[TMP0]])
-// CHECK: [[TMP1:%.*]] = extractelement <8 x i8> [[VQMOVNH_S16_I]], i64 0
-// CHECK: ret i8 [[TMP1]]
+// CHECK-LABEL: define dso_local i8 @test_vqmovnh_s16(
+// CHECK-SAME: i16 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[VQMOVNH_S16_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqxtn.v8i8(<8 x i16> [[TMP0]])
+// CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[VQMOVNH_S16_I]], i64 0
+// CHECK-NEXT: ret i8 [[TMP1]]
+//
int8_t test_vqmovnh_s16(int16_t a) {
return (int8_t)vqmovnh_s16(a);
}
-// CHECK-LABEL: @test_vqmovns_s32(
-// CHECK: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 %a, i64 0
-// CHECK: [[VQMOVNS_S32_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqxtn.v4i16(<4 x i32> [[TMP0]])
-// CHECK: [[TMP1:%.*]] = extractelement <4 x i16> [[VQMOVNS_S32_I]], i64 0
-// CHECK: ret i16 [[TMP1]]
+// CHECK-LABEL: define dso_local i16 @test_vqmovns_s32(
+// CHECK-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i64 0
+// CHECK-NEXT: [[VQMOVNS_S32_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqxtn.v4i16(<4 x i32> [[TMP0]])
+// CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[VQMOVNS_S32_I]], i64 0
+// CHECK-NEXT: ret i16 [[TMP1]]
+//
int16_t test_vqmovns_s32(int32_t a) {
return (int16_t)vqmovns_s32(a);
}
-// CHECK-LABEL: @test_vqmovnd_s64(
-// CHECK: [[VQMOVND_S64_I:%.*]] = call i32 @llvm.aarch64.neon.scalar.sqxtn.i32.i64(i64 %a)
-// CHECK: ret i32 [[VQMOVND_S64_I]]
+// CHECK-LABEL: define dso_local i32 @test_vqmovnd_s64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQMOVND_S64_I:%.*]] = call i32 @llvm.aarch64.neon.scalar.sqxtn.i32.i64(i64 [[A]])
+// CHECK-NEXT: ret i32 [[VQMOVND_S64_I]]
+//
int32_t test_vqmovnd_s64(int64_t a) {
return (int32_t)vqmovnd_s64(a);
}
-// CHECK-LABEL: @test_vqmovnh_u16(
-// CHECK: [[TMP0:%.*]] = insertelement <8 x i16> poison, i16 %a, i64 0
-// CHECK: [[VQMOVNH_U16_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqxtn.v8i8(<8 x i16> [[TMP0]])
-// CHECK: [[TMP1:%.*]] = extractelement <8 x i8> [[VQMOVNH_U16_I]], i64 0
-// CHECK: ret i8 [[TMP1]]
+// CHECK-LABEL: define dso_local i8 @test_vqmovnh_u16(
+// CHECK-SAME: i16 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[VQMOVNH_U16_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqxtn.v8i8(<8 x i16> [[TMP0]])
+// CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[VQMOVNH_U16_I]], i64 0
+// CHECK-NEXT: ret i8 [[TMP1]]
+//
int8_t test_vqmovnh_u16(int16_t a) {
return (int8_t)vqmovnh_u16(a);
}
-// CHECK-LABEL: @test_vqmovns_u32(
-// CHECK: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 %a, i64 0
-// CHECK: [[VQMOVNS_U32_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqxtn.v4i16(<4 x i32> [[TMP0]])
-// CHECK: [[TMP1:%.*]] = extractelement <4 x i16> [[VQMOVNS_U32_I]], i64 0
-// CHECK: ret i16 [[TMP1]]
+// CHECK-LABEL: define dso_local i16 @test_vqmovns_u32(
+// CHECK-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i64 0
+// CHECK-NEXT: [[VQMOVNS_U32_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqxtn.v4i16(<4 x i32> [[TMP0]])
+// CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[VQMOVNS_U32_I]], i64 0
+// CHECK-NEXT: ret i16 [[TMP1]]
+//
int16_t test_vqmovns_u32(int32_t a) {
return (int16_t)vqmovns_u32(a);
}
-// CHECK-LABEL: @test_vqmovnd_u64(
-// CHECK: [[VQMOVND_U64_I:%.*]] = call i32 @llvm.aarch64.neon.scalar.uqxtn.i32.i64(i64 %a)
-// CHECK: ret i32 [[VQMOVND_U64_I]]
+// CHECK-LABEL: define dso_local i32 @test_vqmovnd_u64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQMOVND_U64_I:%.*]] = call i32 @llvm.aarch64.neon.scalar.uqxtn.i32.i64(i64 [[A]])
+// CHECK-NEXT: ret i32 [[VQMOVND_U64_I]]
+//
int32_t test_vqmovnd_u64(int64_t a) {
return (int32_t)vqmovnd_u64(a);
}
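
The sqxtn/sqxtun/uqxtn checks above correspond to the saturating narrows: vqmovn* clamps to the range of the narrower type, while vqmovun* takes a signed input and clamps to the unsigned range. A small illustration, assuming an AArch64 target (example values, not from the test):

    #include <arm_neon.h>

    void qmovn_demo(void) {
      int16_t a = vqmovns_s32(70000);     /* > INT16_MAX  -> 32767 */
      int16_t b = vqmovns_s32(-70000);    /* < INT16_MIN  -> -32768 */
      uint16_t c = vqmovuns_s32(-1);      /* negative     -> 0 */
      uint16_t d = vqmovuns_s32(70000);   /* > UINT16_MAX -> 65535 */
      (void)a; (void)b; (void)c; (void)d;
    }
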
-// CHECK-LABEL: @test_vceqs_f32(
-// CHECK: [[TMP0:%.*]] = fcmp oeq float %a, %b
-// CHECK: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32
-// CHECK: ret i32 [[VCMPD_I]]
+// CHECK-LABEL: define dso_local i32 @test_vceqs_f32(
+// CHECK-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = fcmp oeq float [[A]], [[B]]
+// CHECK-NEXT: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32
+// CHECK-NEXT: ret i32 [[VCMPD_I]]
+//
uint32_t test_vceqs_f32(float32_t a, float32_t b) {
return (uint32_t)vceqs_f32(a, b);
}
-// CHECK-LABEL: @test_vceqd_f64(
-// CHECK: [[TMP0:%.*]] = fcmp oeq double %a, %b
-// CHECK: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64
-// CHECK: ret i64 [[VCMPD_I]]
+// CHECK-LABEL: define dso_local i64 @test_vceqd_f64(
+// CHECK-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = fcmp oeq double [[A]], [[B]]
+// CHECK-NEXT: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64
+// CHECK-NEXT: ret i64 [[VCMPD_I]]
+//
uint64_t test_vceqd_f64(float64_t a, float64_t b) {
return (uint64_t)vceqd_f64(a, b);
}
-// CHECK-LABEL: @test_vceqzs_f32(
-// CHECK: [[TMP0:%.*]] = fcmp oeq float %a, 0.000000e+00
-// CHECK: [[VCEQZ_I:%.*]] = sext i1 [[TMP0]] to i32
-// CHECK: ret i32 [[VCEQZ_I]]
+// CHECK-LABEL: define dso_local i32 @test_vceqzs_f32(
+// CHECK-SAME: float noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = fcmp oeq float [[A]], 0.000000e+00
+// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext i1 [[TMP0]] to i32
+// CHECK-NEXT: ret i32 [[VCEQZ_I]]
+//
uint32_t test_vceqzs_f32(float32_t a) {
return (uint32_t)vceqzs_f32(a);
}
-// CHECK-LABEL: @test_vceqzd_f64(
-// CHECK: [[TMP0:%.*]] = fcmp oeq double %a, 0.000000e+00
-// CHECK: [[VCEQZ_I:%.*]] = sext i1 [[TMP0]] to i64
-// CHECK: ret i64 [[VCEQZ_I]]
+// CHECK-LABEL: define dso_local i64 @test_vceqzd_f64(
+// CHECK-SAME: double noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = fcmp oeq double [[A]], 0.000000e+00
+// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext i1 [[TMP0]] to i64
+// CHECK-NEXT: ret i64 [[VCEQZ_I]]
+//
uint64_t test_vceqzd_f64(float64_t a) {
return (uint64_t)vceqzd_f64(a);
}
-// CHECK-LABEL: @test_vcges_f32(
-// CHECK: [[TMP0:%.*]] = fcmp oge float %a, %b
-// CHECK: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32
-// CHECK: ret i32 [[VCMPD_I]]
+// CHECK-LABEL: define dso_local i32 @test_vcges_f32(
+// CHECK-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = fcmp oge float [[A]], [[B]]
+// CHECK-NEXT: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32
+// CHECK-NEXT: ret i32 [[VCMPD_I]]
+//
uint32_t test_vcges_f32(float32_t a, float32_t b) {
return (uint32_t)vcges_f32(a, b);
}
-// CHECK-LABEL: @test_vcged_f64(
-// CHECK: [[TMP0:%.*]] = fcmp oge double %a, %b
-// CHECK: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64
-// CHECK: ret i64 [[VCMPD_I]]
+// CHECK-LABEL: define dso_local i64 @test_vcged_f64(
+// CHECK-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = fcmp oge double [[A]], [[B]]
+// CHECK-NEXT: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64
+// CHECK-NEXT: ret i64 [[VCMPD_I]]
+//
uint64_t test_vcged_f64(float64_t a, float64_t b) {
return (uint64_t)vcged_f64(a, b);
}
-// CHECK-LABEL: @test_vcgezs_f32(
-// CHECK: [[TMP0:%.*]] = fcmp oge float %a, 0.000000e+00
-// CHECK: [[VCGEZ_I:%.*]] = sext i1 [[TMP0]] to i32
-// CHECK: ret i32 [[VCGEZ_I]]
+// CHECK-LABEL: define dso_local i32 @test_vcgezs_f32(
+// CHECK-SAME: float noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = fcmp oge float [[A]], 0.000000e+00
+// CHECK-NEXT: [[VCGEZ_I:%.*]] = sext i1 [[TMP0]] to i32
+// CHECK-NEXT: ret i32 [[VCGEZ_I]]
+//
uint32_t test_vcgezs_f32(float32_t a) {
return (uint32_t)vcgezs_f32(a);
}
-// CHECK-LABEL: @test_vcgezd_f64(
-// CHECK: [[TMP0:%.*]] = fcmp oge double %a, 0.000000e+00
-// CHECK: [[VCGEZ_I:%.*]] = sext i1 [[TMP0]] to i64
-// CHECK: ret i64 [[VCGEZ_I]]
+// CHECK-LABEL: define dso_local i64 @test_vcgezd_f64(
+// CHECK-SAME: double noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = fcmp oge double [[A]], 0.000000e+00
+// CHECK-NEXT: [[VCGEZ_I:%.*]] = sext i1 [[TMP0]] to i64
+// CHECK-NEXT: ret i64 [[VCGEZ_I]]
+//
uint64_t test_vcgezd_f64(float64_t a) {
return (uint64_t)vcgezd_f64(a);
}
-// CHECK-LABEL: @test_vcgts_f32(
-// CHECK: [[TMP0:%.*]] = fcmp ogt float %a, %b
-// CHECK: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32
-// CHECK: ret i32 [[VCMPD_I]]
+// CHECK-LABEL: define dso_local i32 @test_vcgts_f32(
+// CHECK-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = fcmp ogt float [[A]], [[B]]
+// CHECK-NEXT: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32
+// CHECK-NEXT: ret i32 [[VCMPD_I]]
+//
uint32_t test_vcgts_f32(float32_t a, float32_t b) {
return (uint32_t)vcgts_f32(a, b);
}
-// CHECK-LABEL: @test_vcgtd_f64(
-// CHECK: [[TMP0:%.*]] = fcmp ogt double %a, %b
-// CHECK: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64
-// CHECK: ret i64 [[VCMPD_I]]
+// CHECK-LABEL: define dso_local i64 @test_vcgtd_f64(
+// CHECK-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = fcmp ogt double [[A]], [[B]]
+// CHECK-NEXT: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64
+// CHECK-NEXT: ret i64 [[VCMPD_I]]
+//
uint64_t test_vcgtd_f64(float64_t a, float64_t b) {
return (uint64_t)vcgtd_f64(a, b);
}
-// CHECK-LABEL: @test_vcgtzs_f32(
-// CHECK: [[TMP0:%.*]] = fcmp ogt float %a, 0.000000e+00
-// CHECK: [[VCGTZ_I:%.*]] = sext i1 [[TMP0]] to i32
-// CHECK: ret i32 [[VCGTZ_I]]
+// CHECK-LABEL: define dso_local i32 @test_vcgtzs_f32(
+// CHECK-SAME: float noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = fcmp ogt float [[A]], 0.000000e+00
+// CHECK-NEXT: [[VCGTZ_I:%.*]] = sext i1 [[TMP0]] to i32
+// CHECK-NEXT: ret i32 [[VCGTZ_I]]
+//
uint32_t test_vcgtzs_f32(float32_t a) {
return (uint32_t)vcgtzs_f32(a);
}
-// CHECK-LABEL: @test_vcgtzd_f64(
-// CHECK: [[TMP0:%.*]] = fcmp ogt double %a, 0.000000e+00
-// CHECK: [[VCGTZ_I:%.*]] = sext i1 [[TMP0]] to i64
-// CHECK: ret i64 [[VCGTZ_I]]
+// CHECK-LABEL: define dso_local i64 @test_vcgtzd_f64(
+// CHECK-SAME: double noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = fcmp ogt double [[A]], 0.000000e+00
+// CHECK-NEXT: [[VCGTZ_I:%.*]] = sext i1 [[TMP0]] to i64
+// CHECK-NEXT: ret i64 [[VCGTZ_I]]
+//
uint64_t test_vcgtzd_f64(float64_t a) {
return (uint64_t)vcgtzd_f64(a);
}
-// CHECK-LABEL: @test_vcles_f32(
-// CHECK: [[TMP0:%.*]] = fcmp ole float %a, %b
-// CHECK: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32
-// CHECK: ret i32 [[VCMPD_I]]
+// CHECK-LABEL: define dso_local i32 @test_vcles_f32(
+// CHECK-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = fcmp ole float [[A]], [[B]]
+// CHECK-NEXT: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32
+// CHECK-NEXT: ret i32 [[VCMPD_I]]
+//
uint32_t test_vcles_f32(float32_t a, float32_t b) {
return (uint32_t)vcles_f32(a, b);
}
-// CHECK-LABEL: @test_vcled_f64(
-// CHECK: [[TMP0:%.*]] = fcmp ole double %a, %b
-// CHECK: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64
-// CHECK: ret i64 [[VCMPD_I]]
+// CHECK-LABEL: define dso_local i64 @test_vcled_f64(
+// CHECK-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = fcmp ole double [[A]], [[B]]
+// CHECK-NEXT: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64
+// CHECK-NEXT: ret i64 [[VCMPD_I]]
+//
uint64_t test_vcled_f64(float64_t a, float64_t b) {
return (uint64_t)vcled_f64(a, b);
}
-// CHECK-LABEL: @test_vclezs_f32(
-// CHECK: [[TMP0:%.*]] = fcmp ole float %a, 0.000000e+00
-// CHECK: [[VCLEZ_I:%.*]] = sext i1 [[TMP0]] to i32
-// CHECK: ret i32 [[VCLEZ_I]]
+// CHECK-LABEL: define dso_local i32 @test_vclezs_f32(
+// CHECK-SAME: float noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = fcmp ole float [[A]], 0.000000e+00
+// CHECK-NEXT: [[VCLEZ_I:%.*]] = sext i1 [[TMP0]] to i32
+// CHECK-NEXT: ret i32 [[VCLEZ_I]]
+//
uint32_t test_vclezs_f32(float32_t a) {
return (uint32_t)vclezs_f32(a);
}
-// CHECK-LABEL: @test_vclezd_f64(
-// CHECK: [[TMP0:%.*]] = fcmp ole double %a, 0.000000e+00
-// CHECK: [[VCLEZ_I:%.*]] = sext i1 [[TMP0]] to i64
-// CHECK: ret i64 [[VCLEZ_I]]
+// CHECK-LABEL: define dso_local i64 @test_vclezd_f64(
+// CHECK-SAME: double noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = fcmp ole double [[A]], 0.000000e+00
+// CHECK-NEXT: [[VCLEZ_I:%.*]] = sext i1 [[TMP0]] to i64
+// CHECK-NEXT: ret i64 [[VCLEZ_I]]
+//
uint64_t test_vclezd_f64(float64_t a) {
return (uint64_t)vclezd_f64(a);
}
-// CHECK-LABEL: @test_vclts_f32(
-// CHECK: [[TMP0:%.*]] = fcmp olt float %a, %b
-// CHECK: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32
-// CHECK: ret i32 [[VCMPD_I]]
+// CHECK-LABEL: define dso_local i32 @test_vclts_f32(
+// CHECK-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = fcmp olt float [[A]], [[B]]
+// CHECK-NEXT: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32
+// CHECK-NEXT: ret i32 [[VCMPD_I]]
+//
uint32_t test_vclts_f32(float32_t a, float32_t b) {
return (uint32_t)vclts_f32(a, b);
}
-// CHECK-LABEL: @test_vcltd_f64(
-// CHECK: [[TMP0:%.*]] = fcmp olt double %a, %b
-// CHECK: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64
-// CHECK: ret i64 [[VCMPD_I]]
+// CHECK-LABEL: define dso_local i64 @test_vcltd_f64(
+// CHECK-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = fcmp olt double [[A]], [[B]]
+// CHECK-NEXT: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64
+// CHECK-NEXT: ret i64 [[VCMPD_I]]
+//
uint64_t test_vcltd_f64(float64_t a, float64_t b) {
return (uint64_t)vcltd_f64(a, b);
}
-// CHECK-LABEL: @test_vcltzs_f32(
-// CHECK: [[TMP0:%.*]] = fcmp olt float %a, 0.000000e+00
-// CHECK: [[VCLTZ_I:%.*]] = sext i1 [[TMP0]] to i32
-// CHECK: ret i32 [[VCLTZ_I]]
+// CHECK-LABEL: define dso_local i32 @test_vcltzs_f32(
+// CHECK-SAME: float noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = fcmp olt float [[A]], 0.000000e+00
+// CHECK-NEXT: [[VCLTZ_I:%.*]] = sext i1 [[TMP0]] to i32
+// CHECK-NEXT: ret i32 [[VCLTZ_I]]
+//
uint32_t test_vcltzs_f32(float32_t a) {
return (uint32_t)vcltzs_f32(a);
}
-// CHECK-LABEL: @test_vcltzd_f64(
-// CHECK: [[TMP0:%.*]] = fcmp olt double %a, 0.000000e+00
-// CHECK: [[VCLTZ_I:%.*]] = sext i1 [[TMP0]] to i64
-// CHECK: ret i64 [[VCLTZ_I]]
+// CHECK-LABEL: define dso_local i64 @test_vcltzd_f64(
+// CHECK-SAME: double noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = fcmp olt double [[A]], 0.000000e+00
+// CHECK-NEXT: [[VCLTZ_I:%.*]] = sext i1 [[TMP0]] to i64
+// CHECK-NEXT: ret i64 [[VCLTZ_I]]
+//
uint64_t test_vcltzd_f64(float64_t a) {
return (uint64_t)vcltzd_f64(a);
}
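
All of the scalar FP comparisons above lower to a plain fcmp plus sext i1, so the result is an all-ones mask when the comparison holds and 0 otherwise; because the predicates are ordered (oeq, oge, ...), any NaN operand yields 0. Sketch, assuming an AArch64 target:

    #include <arm_neon.h>
    #include <math.h>

    void fcmp_demo(void) {
      uint32_t t = vceqs_f32(1.0f, 1.0f);   /* 0xffffffff */
      uint32_t f = vcgts_f32(1.0f, 2.0f);   /* 0 */
      uint64_t z = vcltzd_f64(-0.5);        /* 0xffffffffffffffff */
      uint32_t n = vceqs_f32(NAN, NAN);     /* 0: ordered compare, NaN != NaN */
      (void)t; (void)f; (void)z; (void)n;
    }
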
-// CHECK-LABEL: @test_vcages_f32(
-// CHECK: [[VCAGES_F32_I:%.*]] = call i32 @llvm.aarch64.neon.facge.i32.f32(float %a, float %b)
-// CHECK: ret i32 [[VCAGES_F32_I]]
+// CHECK-LABEL: define dso_local i32 @test_vcages_f32(
+// CHECK-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCAGES_F32_I:%.*]] = call i32 @llvm.aarch64.neon.facge.i32.f32(float [[A]], float [[B]])
+// CHECK-NEXT: ret i32 [[VCAGES_F32_I]]
+//
uint32_t test_vcages_f32(float32_t a, float32_t b) {
return (uint32_t)vcages_f32(a, b);
}
-// CHECK-LABEL: @test_vcaged_f64(
-// CHECK: [[VCAGED_F64_I:%.*]] = call i64 @llvm.aarch64.neon.facge.i64.f64(double %a, double %b)
-// CHECK: ret i64 [[VCAGED_F64_I]]
+// CHECK-LABEL: define dso_local i64 @test_vcaged_f64(
+// CHECK-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCAGED_F64_I:%.*]] = call i64 @llvm.aarch64.neon.facge.i64.f64(double [[A]], double [[B]])
+// CHECK-NEXT: ret i64 [[VCAGED_F64_I]]
+//
uint64_t test_vcaged_f64(float64_t a, float64_t b) {
return (uint64_t)vcaged_f64(a, b);
}
-// CHECK-LABEL: @test_vcagts_f32(
-// CHECK: [[VCAGTS_F32_I:%.*]] = call i32 @llvm.aarch64.neon.facgt.i32.f32(float %a, float %b)
-// CHECK: ret i32 [[VCAGTS_F32_I]]
+// CHECK-LABEL: define dso_local i32 @test_vcagts_f32(
+// CHECK-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCAGTS_F32_I:%.*]] = call i32 @llvm.aarch64.neon.facgt.i32.f32(float [[A]], float [[B]])
+// CHECK-NEXT: ret i32 [[VCAGTS_F32_I]]
+//
uint32_t test_vcagts_f32(float32_t a, float32_t b) {
return (uint32_t)vcagts_f32(a, b);
}
-// CHECK-LABEL: @test_vcagtd_f64(
-// CHECK: [[VCAGTD_F64_I:%.*]] = call i64 @llvm.aarch64.neon.facgt.i64.f64(double %a, double %b)
-// CHECK: ret i64 [[VCAGTD_F64_I]]
+// CHECK-LABEL: define dso_local i64 @test_vcagtd_f64(
+// CHECK-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCAGTD_F64_I:%.*]] = call i64 @llvm.aarch64.neon.facgt.i64.f64(double [[A]], double [[B]])
+// CHECK-NEXT: ret i64 [[VCAGTD_F64_I]]
+//
uint64_t test_vcagtd_f64(float64_t a, float64_t b) {
return (uint64_t)vcagtd_f64(a, b);
}
-// CHECK-LABEL: @test_vcales_f32(
-// CHECK: [[VCALES_F32_I:%.*]] = call i32 @llvm.aarch64.neon.facge.i32.f32(float %b, float %a)
-// CHECK: ret i32 [[VCALES_F32_I]]
+// CHECK-LABEL: define dso_local i32 @test_vcales_f32(
+// CHECK-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCALES_F32_I:%.*]] = call i32 @llvm.aarch64.neon.facge.i32.f32(float [[B]], float [[A]])
+// CHECK-NEXT: ret i32 [[VCALES_F32_I]]
+//
uint32_t test_vcales_f32(float32_t a, float32_t b) {
return (uint32_t)vcales_f32(a, b);
}
-// CHECK-LABEL: @test_vcaled_f64(
-// CHECK: [[VCALED_F64_I:%.*]] = call i64 @llvm.aarch64.neon.facge.i64.f64(double %b, double %a)
-// CHECK: ret i64 [[VCALED_F64_I]]
+// CHECK-LABEL: define dso_local i64 @test_vcaled_f64(
+// CHECK-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCALED_F64_I:%.*]] = call i64 @llvm.aarch64.neon.facge.i64.f64(double [[B]], double [[A]])
+// CHECK-NEXT: ret i64 [[VCALED_F64_I]]
+//
uint64_t test_vcaled_f64(float64_t a, float64_t b) {
return (uint64_t)vcaled_f64(a, b);
}
-// CHECK-LABEL: @test_vcalts_f32(
-// CHECK: [[VCALTS_F32_I:%.*]] = call i32 @llvm.aarch64.neon.facgt.i32.f32(float %b, float %a)
-// CHECK: ret i32 [[VCALTS_F32_I]]
+// CHECK-LABEL: define dso_local i32 @test_vcalts_f32(
+// CHECK-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCALTS_F32_I:%.*]] = call i32 @llvm.aarch64.neon.facgt.i32.f32(float [[B]], float [[A]])
+// CHECK-NEXT: ret i32 [[VCALTS_F32_I]]
+//
uint32_t test_vcalts_f32(float32_t a, float32_t b) {
return (uint32_t)vcalts_f32(a, b);
}
-// CHECK-LABEL: @test_vcaltd_f64(
-// CHECK: [[VCALTD_F64_I:%.*]] = call i64 @llvm.aarch64.neon.facgt.i64.f64(double %b, double %a)
-// CHECK: ret i64 [[VCALTD_F64_I]]
+// CHECK-LABEL: define dso_local i64 @test_vcaltd_f64(
+// CHECK-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCALTD_F64_I:%.*]] = call i64 @llvm.aarch64.neon.facgt.i64.f64(double [[B]], double [[A]])
+// CHECK-NEXT: ret i64 [[VCALTD_F64_I]]
+//
uint64_t test_vcaltd_f64(float64_t a, float64_t b) {
return (uint64_t)vcaltd_f64(a, b);
}
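
The facge/facgt checks are the absolute comparisons: vcage*/vcagt* test |a| >= |b| and |a| > |b|, and the vcale*/vcalt* variants reuse the same intrinsics with the operands swapped, which is why the new CHECK lines show (float [[B]], float [[A]]). For example, assuming an AArch64 target:

    #include <arm_neon.h>

    void facge_demo(void) {
      uint32_t r1 = vcages_f32(-3.0f, 2.0f);  /* |-3| >= |2| -> 0xffffffff */
      uint32_t r2 = vcales_f32(-3.0f, 2.0f);  /* |-3| <= |2| -> 0 (facge, operands swapped) */
      (void)r1; (void)r2;
    }
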
-// CHECK-LABEL: @test_vshrd_n_s64(
-// CHECK: [[SHRD_N:%.*]] = ashr i64 %a, 1
-// CHECK: ret i64 [[SHRD_N]]
+// CHECK-LABEL: define dso_local i64 @test_vshrd_n_s64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHRD_N:%.*]] = ashr i64 [[A]], 1
+// CHECK-NEXT: ret i64 [[SHRD_N]]
+//
int64_t test_vshrd_n_s64(int64_t a) {
return (int64_t)vshrd_n_s64(a, 1);
}
-// CHECK-LABEL: @test_vshr_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VSHR_N:%.*]] = ashr <1 x i64> [[TMP1]], splat (i64 1)
-// CHECK: ret <1 x i64> [[VSHR_N]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vshr_n_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHR_N:%.*]] = ashr <1 x i64> [[A]], splat (i64 1)
+// CHECK-NEXT: ret <1 x i64> [[VSHR_N]]
+//
int64x1_t test_vshr_n_s64(int64x1_t a) {
return vshr_n_s64(a, 1);
}
-// CHECK-LABEL: @test_vshrd_n_u64(
-// CHECK: ret i64 0
+// CHECK-LABEL: define dso_local i64 @test_vshrd_n_u64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret i64 0
+//
uint64_t test_vshrd_n_u64(uint64_t a) {
return (uint64_t)vshrd_n_u64(a, 64);
}
-// CHECK-LABEL: @test_vshrd_n_u64_2(
-// CHECK: ret i64 0
+// CHECK-LABEL: define dso_local i64 @test_vshrd_n_u64_2(
+// CHECK-SAME: ) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret i64 0
+//
uint64_t test_vshrd_n_u64_2() {
uint64_t a = UINT64_C(0xf000000000000000);
return vshrd_n_u64(a, 64);
}
-// CHECK-LABEL: @test_vshr_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VSHR_N:%.*]] = lshr <1 x i64> [[TMP1]], splat (i64 1)
-// CHECK: ret <1 x i64> [[VSHR_N]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vshr_n_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHR_N:%.*]] = lshr <1 x i64> [[A]], splat (i64 1)
+// CHECK-NEXT: ret <1 x i64> [[VSHR_N]]
+//
uint64x1_t test_vshr_n_u64(uint64x1_t a) {
return vshr_n_u64(a, 1);
}
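
Note the constant-folded bodies above: ACLE allows shift counts up to the element width here, and vshrd_n_u64(a, 64) shifts in zeros across the full width, so the IR is just ret i64 0, while the signed form keeps an ashr. Roughly, assuming an AArch64 target:

    #include <arm_neon.h>

    void shr_demo(void) {
      int64_t s = vshrd_n_s64(-8, 1);                       /* ashr: -4 */
      uint64_t u = vshrd_n_u64(0xf000000000000000ULL, 64);  /* folds to 0 */
      (void)s; (void)u;
    }
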
-// CHECK-LABEL: @test_vrshrd_n_s64(
-// CHECK: [[VRSHR_N:%.*]] = call i64 @llvm.aarch64.neon.srshl.i64(i64 %a, i64 -63)
-// CHECK: ret i64 [[VRSHR_N]]
+// CHECK-LABEL: define dso_local i64 @test_vrshrd_n_s64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHR_N:%.*]] = call i64 @llvm.aarch64.neon.srshl.i64(i64 [[A]], i64 -63)
+// CHECK-NEXT: ret i64 [[VRSHR_N]]
+//
int64_t test_vrshrd_n_s64(int64_t a) {
return (int64_t)vrshrd_n_s64(a, 63);
}
-// CHECK-LABEL: @test_vrshr_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VRSHR_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.srshl.v1i64(<1 x i64> [[VRSHR_N]], <1 x i64> splat (i64 -1))
-// CHECK: ret <1 x i64> [[VRSHR_N1]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vrshr_n_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.srshl.v1i64(<1 x i64> [[A]], <1 x i64> splat (i64 -1))
+// CHECK-NEXT: ret <1 x i64> [[VRSHR_N1]]
+//
int64x1_t test_vrshr_n_s64(int64x1_t a) {
return vrshr_n_s64(a, 1);
}
-// CHECK-LABEL: @test_vrshrd_n_u64(
-// CHECK: [[VRSHR_N:%.*]] = call i64 @llvm.aarch64.neon.urshl.i64(i64 %a, i64 -63)
-// CHECK: ret i64 [[VRSHR_N]]
+// CHECK-LABEL: define dso_local i64 @test_vrshrd_n_u64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHR_N:%.*]] = call i64 @llvm.aarch64.neon.urshl.i64(i64 [[A]], i64 -63)
+// CHECK-NEXT: ret i64 [[VRSHR_N]]
+//
uint64_t test_vrshrd_n_u64(uint64_t a) {
return (uint64_t)vrshrd_n_u64(a, 63);
}
-// CHECK-LABEL: @test_vrshr_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VRSHR_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.urshl.v1i64(<1 x i64> [[VRSHR_N]], <1 x i64> splat (i64 -1))
-// CHECK: ret <1 x i64> [[VRSHR_N1]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vrshr_n_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.urshl.v1i64(<1 x i64> [[A]], <1 x i64> splat (i64 -1))
+// CHECK-NEXT: ret <1 x i64> [[VRSHR_N1]]
+//
uint64x1_t test_vrshr_n_u64(uint64x1_t a) {
return vrshr_n_u64(a, 1);
}
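
The rounding shifts have no dedicated IR shift, so vrshr*_n lowers to srshl/urshl with a negated shift amount, as the checks show; semantically the result adds the rounding constant 2^(n-1) before shifting. A sketch, assuming an AArch64 target:

    #include <arm_neon.h>

    void rshr_demo(void) {
      int64_t a = vrshrd_n_s64(5, 1);    /* (5 + 1) >> 1 = 3 (plain vshrd_n_s64 gives 2) */
      int64_t b = vrshrd_n_s64(-5, 1);   /* (-5 + 1) >> 1 = -2 */
      (void)a; (void)b;
    }
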
-// CHECK-LABEL: @test_vsrad_n_s64(
-// CHECK: [[SHRD_N:%.*]] = ashr i64 %b, 63
-// CHECK: [[TMP0:%.*]] = add i64 %a, [[SHRD_N]]
-// CHECK: ret i64 [[TMP0]]
+// CHECK-LABEL: define dso_local i64 @test_vsrad_n_s64(
+// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHRD_N:%.*]] = ashr i64 [[B]], 63
+// CHECK-NEXT: [[TMP0:%.*]] = add i64 [[A]], [[SHRD_N]]
+// CHECK-NEXT: ret i64 [[TMP0]]
+//
int64_t test_vsrad_n_s64(int64_t a, int64_t b) {
return (int64_t)vsrad_n_s64(a, b, 63);
}
-// CHECK-LABEL: @test_vsra_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[VSRA_N:%.*]] = ashr <1 x i64> [[TMP3]], splat (i64 1)
-// CHECK: [[TMP4:%.*]] = add <1 x i64> [[TMP2]], [[VSRA_N]]
-// CHECK: ret <1 x i64> [[TMP4]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vsra_n_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRA_N:%.*]] = ashr <1 x i64> [[B]], splat (i64 1)
+// CHECK-NEXT: [[TMP0:%.*]] = add <1 x i64> [[A]], [[VSRA_N]]
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
int64x1_t test_vsra_n_s64(int64x1_t a, int64x1_t b) {
return vsra_n_s64(a, b, 1);
}
-// CHECK-LABEL: @test_vsrad_n_u64(
-// CHECK: [[SHRD_N:%.*]] = lshr i64 %b, 63
-// CHECK: [[TMP0:%.*]] = add i64 %a, [[SHRD_N]]
-// CHECK: ret i64 [[TMP0]]
+// CHECK-LABEL: define dso_local i64 @test_vsrad_n_u64(
+// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHRD_N:%.*]] = lshr i64 [[B]], 63
+// CHECK-NEXT: [[TMP0:%.*]] = add i64 [[A]], [[SHRD_N]]
+// CHECK-NEXT: ret i64 [[TMP0]]
+//
uint64_t test_vsrad_n_u64(uint64_t a, uint64_t b) {
return (uint64_t)vsrad_n_u64(a, b, 63);
}
-// CHECK-LABEL: @test_vsrad_n_u64_2(
-// CHECK: ret i64 %a
+// CHECK-LABEL: define dso_local i64 @test_vsrad_n_u64_2(
+// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret i64 [[A]]
+//
uint64_t test_vsrad_n_u64_2(uint64_t a, uint64_t b) {
return (uint64_t)vsrad_n_u64(a, b, 64);
}
-// CHECK-LABEL: @test_vsra_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[VSRA_N:%.*]] = lshr <1 x i64> [[TMP3]], splat (i64 1)
-// CHECK: [[TMP4:%.*]] = add <1 x i64> [[TMP2]], [[VSRA_N]]
-// CHECK: ret <1 x i64> [[TMP4]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vsra_n_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRA_N:%.*]] = lshr <1 x i64> [[B]], splat (i64 1)
+// CHECK-NEXT: [[TMP0:%.*]] = add <1 x i64> [[A]], [[VSRA_N]]
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
uint64x1_t test_vsra_n_u64(uint64x1_t a, uint64x1_t b) {
return vsra_n_u64(a, b, 1);
}
-// CHECK-LABEL: @test_vrsrad_n_s64(
-// CHECK: [[TMP0:%.*]] = call i64 @llvm.aarch64.neon.srshl.i64(i64 %b, i64 -63)
-// CHECK: [[TMP1:%.*]] = add i64 %a, [[TMP0]]
-// CHECK: ret i64 [[TMP1]]
+// CHECK-LABEL: define dso_local i64 @test_vrsrad_n_s64(
+// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.aarch64.neon.srshl.i64(i64 [[B]], i64 -63)
+// CHECK-NEXT: [[TMP1:%.*]] = add i64 [[A]], [[TMP0]]
+// CHECK-NEXT: ret i64 [[TMP1]]
+//
int64_t test_vrsrad_n_s64(int64_t a, int64_t b) {
return (int64_t)vrsrad_n_s64(a, b, 63);
}
-// CHECK-LABEL: @test_vrsra_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[VRSHR_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.srshl.v1i64(<1 x i64> [[VRSHR_N]], <1 x i64> splat (i64 -1))
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[TMP3:%.*]] = add <1 x i64> [[TMP2]], [[VRSHR_N1]]
-// CHECK: ret <1 x i64> [[TMP3]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vrsra_n_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.srshl.v1i64(<1 x i64> [[B]], <1 x i64> splat (i64 -1))
+// CHECK-NEXT: [[TMP0:%.*]] = add <1 x i64> [[A]], [[VRSHR_N1]]
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
int64x1_t test_vrsra_n_s64(int64x1_t a, int64x1_t b) {
return vrsra_n_s64(a, b, 1);
}
-// CHECK-LABEL: @test_vrsrad_n_u64(
-// CHECK: [[TMP0:%.*]] = call i64 @llvm.aarch64.neon.urshl.i64(i64 %b, i64 -63)
-// CHECK: [[TMP1:%.*]] = add i64 %a, [[TMP0]]
-// CHECK: ret i64 [[TMP1]]
+// CHECK-LABEL: define dso_local i64 @test_vrsrad_n_u64(
+// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.aarch64.neon.urshl.i64(i64 [[B]], i64 -63)
+// CHECK-NEXT: [[TMP1:%.*]] = add i64 [[A]], [[TMP0]]
+// CHECK-NEXT: ret i64 [[TMP1]]
+//
uint64_t test_vrsrad_n_u64(uint64_t a, uint64_t b) {
return (uint64_t)vrsrad_n_u64(a, b, 63);
}
-// CHECK-LABEL: @test_vrsra_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[VRSHR_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.urshl.v1i64(<1 x i64> [[VRSHR_N]], <1 x i64> splat (i64 -1))
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[TMP3:%.*]] = add <1 x i64> [[TMP2]], [[VRSHR_N1]]
-// CHECK: ret <1 x i64> [[TMP3]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vrsra_n_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.urshl.v1i64(<1 x i64> [[B]], <1 x i64> splat (i64 -1))
+// CHECK-NEXT: [[TMP0:%.*]] = add <1 x i64> [[A]], [[VRSHR_N1]]
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
uint64x1_t test_vrsra_n_u64(uint64x1_t a, uint64x1_t b) {
return vrsra_n_u64(a, b, 1);
}
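
vsra*/vrsra* are the accumulate-after-shift forms: shift the second operand (plain or rounding) and add it to the first. The test_vsrad_n_u64_2 check reduces to ret i64 [[A]] because an unsigned shift by 64 contributes nothing. Sketch, assuming an AArch64 target:

    #include <arm_neon.h>

    void sra_demo(void) {
      uint64_t r1 = vsrad_n_u64(100, 32, 3);   /* 100 + (32 >> 3) = 104 */
      int64_t  r2 = vrsrad_n_s64(100, 5, 1);   /* 100 + ((5 + 1) >> 1) = 103 */
      (void)r1; (void)r2;
    }
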
-// CHECK-LABEL: @test_vshld_n_s64(
-// CHECK: [[SHLD_N:%.*]] = shl i64 %a, 1
-// CHECK: ret i64 [[SHLD_N]]
+// CHECK-LABEL: define dso_local i64 @test_vshld_n_s64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHLD_N:%.*]] = shl i64 [[A]], 1
+// CHECK-NEXT: ret i64 [[SHLD_N]]
+//
int64_t test_vshld_n_s64(int64_t a) {
return (int64_t)vshld_n_s64(a, 1);
}
-// CHECK-LABEL: @test_vshl_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VSHL_N:%.*]] = shl <1 x i64> [[TMP1]], splat (i64 1)
-// CHECK: ret <1 x i64> [[VSHL_N]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vshl_n_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHL_N:%.*]] = shl <1 x i64> [[A]], splat (i64 1)
+// CHECK-NEXT: ret <1 x i64> [[VSHL_N]]
+//
int64x1_t test_vshl_n_s64(int64x1_t a) {
return vshl_n_s64(a, 1);
}
-// CHECK-LABEL: @test_vshld_n_u64(
-// CHECK: [[SHLD_N:%.*]] = shl i64 %a, 63
-// CHECK: ret i64 [[SHLD_N]]
+// CHECK-LABEL: define dso_local i64 @test_vshld_n_u64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHLD_N:%.*]] = shl i64 [[A]], 63
+// CHECK-NEXT: ret i64 [[SHLD_N]]
+//
uint64_t test_vshld_n_u64(uint64_t a) {
return (uint64_t)vshld_n_u64(a, 63);
}
-// CHECK-LABEL: @test_vshl_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VSHL_N:%.*]] = shl <1 x i64> [[TMP1]], splat (i64 1)
-// CHECK: ret <1 x i64> [[VSHL_N]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vshl_n_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHL_N:%.*]] = shl <1 x i64> [[A]], splat (i64 1)
+// CHECK-NEXT: ret <1 x i64> [[VSHL_N]]
+//
uint64x1_t test_vshl_n_u64(uint64x1_t a) {
return vshl_n_u64(a, 1);
}
-// CHECK-LABEL: @test_vqshlb_n_s8(
-// CHECK: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 %a, i64 0
-// CHECK: [[VQSHLB_N_S8:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshl.v8i8(<8 x i8> [[TMP0]], <8 x i8> <i8 7, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>)
-// CHECK: [[TMP1:%.*]] = extractelement <8 x i8> [[VQSHLB_N_S8]], i64 0
-// CHECK: ret i8 [[TMP1]]
+// CHECK-LABEL: define dso_local i8 @test_vqshlb_n_s8(
+// CHECK-SAME: i8 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 [[A]], i64 0
+// CHECK-NEXT: [[VQSHLB_N_S8:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshl.v8i8(<8 x i8> [[TMP0]], <8 x i8> <i8 7, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>)
+// CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[VQSHLB_N_S8]], i64 0
+// CHECK-NEXT: ret i8 [[TMP1]]
+//
int8_t test_vqshlb_n_s8(int8_t a) {
return (int8_t)vqshlb_n_s8(a, 7);
}
-// CHECK-LABEL: @test_vqshlh_n_s16(
-// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 %a, i64 0
-// CHECK: [[VQSHLH_N_S16:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshl.v4i16(<4 x i16> [[TMP0]], <4 x i16> <i16 15, i16 poison, i16 poison, i16 poison>)
-// CHECK: [[TMP1:%.*]] = extractelement <4 x i16> [[VQSHLH_N_S16]], i64 0
-// CHECK: ret i16 [[TMP1]]
+// CHECK-LABEL: define dso_local i16 @test_vqshlh_n_s16(
+// CHECK-SAME: i16 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[VQSHLH_N_S16:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshl.v4i16(<4 x i16> [[TMP0]], <4 x i16> <i16 15, i16 poison, i16 poison, i16 poison>)
+// CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[VQSHLH_N_S16]], i64 0
+// CHECK-NEXT: ret i16 [[TMP1]]
+//
int16_t test_vqshlh_n_s16(int16_t a) {
return (int16_t)vqshlh_n_s16(a, 15);
}
-// CHECK-LABEL: @test_vqshls_n_s32(
-// CHECK: [[VQSHLS_N_S32:%.*]] = call i32 @llvm.aarch64.neon.sqshl.i32(i32 %a, i32 31)
-// CHECK: ret i32 [[VQSHLS_N_S32]]
+// CHECK-LABEL: define dso_local i32 @test_vqshls_n_s32(
+// CHECK-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHLS_N_S32:%.*]] = call i32 @llvm.aarch64.neon.sqshl.i32(i32 [[A]], i32 31)
+// CHECK-NEXT: ret i32 [[VQSHLS_N_S32]]
+//
int32_t test_vqshls_n_s32(int32_t a) {
return (int32_t)vqshls_n_s32(a, 31);
}
-// CHECK-LABEL: @test_vqshld_n_s64(
-// CHECK: [[VQSHL_N:%.*]] = call i64 @llvm.aarch64.neon.sqshl.i64(i64 %a, i64 63)
-// CHECK: ret i64 [[VQSHL_N]]
+// CHECK-LABEL: define dso_local i64 @test_vqshld_n_s64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHL_N:%.*]] = call i64 @llvm.aarch64.neon.sqshl.i64(i64 [[A]], i64 63)
+// CHECK-NEXT: ret i64 [[VQSHL_N]]
+//
int64_t test_vqshld_n_s64(int64_t a) {
return (int64_t)vqshld_n_s64(a, 63);
}
-// CHECK-LABEL: @test_vqshl_n_s8(
-// CHECK: [[VQSHL_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshl.v8i8(<8 x i8> %a, <8 x i8> zeroinitializer)
-// CHECK: ret <8 x i8> [[VQSHL_N]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vqshl_n_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHL_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshl.v8i8(<8 x i8> [[A]], <8 x i8> zeroinitializer)
+// CHECK-NEXT: ret <8 x i8> [[VQSHL_N]]
+//
int8x8_t test_vqshl_n_s8(int8x8_t a) {
return vqshl_n_s8(a, 0);
}
-// CHECK-LABEL: @test_vqshlq_n_s8(
-// CHECK: [[VQSHL_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqshl.v16i8(<16 x i8> %a, <16 x i8> zeroinitializer)
-// CHECK: ret <16 x i8> [[VQSHL_N]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vqshlq_n_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHL_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqshl.v16i8(<16 x i8> [[A]], <16 x i8> zeroinitializer)
+// CHECK-NEXT: ret <16 x i8> [[VQSHL_N]]
+//
int8x16_t test_vqshlq_n_s8(int8x16_t a) {
return vqshlq_n_s8(a, 0);
}
-// CHECK-LABEL: @test_vqshl_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VQSHL_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshl.v4i16(<4 x i16> [[VQSHL_N]], <4 x i16> zeroinitializer)
-// CHECK: ret <4 x i16> [[VQSHL_N1]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vqshl_n_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshl.v4i16(<4 x i16> [[A]], <4 x i16> zeroinitializer)
+// CHECK-NEXT: ret <4 x i16> [[VQSHL_N1]]
+//
int16x4_t test_vqshl_n_s16(int16x4_t a) {
return vqshl_n_s16(a, 0);
}
-// CHECK-LABEL: @test_vqshlq_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VQSHL_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqshl.v8i16(<8 x i16> [[VQSHL_N]], <8 x i16> zeroinitializer)
-// CHECK: ret <8 x i16> [[VQSHL_N1]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vqshlq_n_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqshl.v8i16(<8 x i16> [[A]], <8 x i16> zeroinitializer)
+// CHECK-NEXT: ret <8 x i16> [[VQSHL_N1]]
+//
int16x8_t test_vqshlq_n_s16(int16x8_t a) {
return vqshlq_n_s16(a, 0);
}
-// CHECK-LABEL: @test_vqshl_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VQSHL_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshl.v2i32(<2 x i32> [[VQSHL_N]], <2 x i32> zeroinitializer)
-// CHECK: ret <2 x i32> [[VQSHL_N1]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vqshl_n_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshl.v2i32(<2 x i32> [[A]], <2 x i32> zeroinitializer)
+// CHECK-NEXT: ret <2 x i32> [[VQSHL_N1]]
+//
int32x2_t test_vqshl_n_s32(int32x2_t a) {
return vqshl_n_s32(a, 0);
}
-// CHECK-LABEL: @test_vqshlq_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQSHL_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqshl.v4i32(<4 x i32> [[VQSHL_N]], <4 x i32> zeroinitializer)
-// CHECK: ret <4 x i32> [[VQSHL_N1]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vqshlq_n_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqshl.v4i32(<4 x i32> [[A]], <4 x i32> zeroinitializer)
+// CHECK-NEXT: ret <4 x i32> [[VQSHL_N1]]
+//
int32x4_t test_vqshlq_n_s32(int32x4_t a) {
return vqshlq_n_s32(a, 0);
}
-// CHECK-LABEL: @test_vqshlq_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VQSHL_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqshl.v2i64(<2 x i64> [[VQSHL_N]], <2 x i64> zeroinitializer)
-// CHECK: ret <2 x i64> [[VQSHL_N1]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vqshlq_n_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqshl.v2i64(<2 x i64> [[A]], <2 x i64> zeroinitializer)
+// CHECK-NEXT: ret <2 x i64> [[VQSHL_N1]]
+//
int64x2_t test_vqshlq_n_s64(int64x2_t a) {
return vqshlq_n_s64(a, 0);
}
-// CHECK-LABEL: @test_vqshl_n_u8(
-// CHECK: [[VQSHL_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshl.v8i8(<8 x i8> %a, <8 x i8> zeroinitializer)
-// CHECK: ret <8 x i8> [[VQSHL_N]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vqshl_n_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHL_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshl.v8i8(<8 x i8> [[A]], <8 x i8> zeroinitializer)
+// CHECK-NEXT: ret <8 x i8> [[VQSHL_N]]
+//
uint8x8_t test_vqshl_n_u8(uint8x8_t a) {
return vqshl_n_u8(a, 0);
}
-// CHECK-LABEL: @test_vqshlq_n_u8(
-// CHECK: [[VQSHL_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.uqshl.v16i8(<16 x i8> %a, <16 x i8> zeroinitializer)
-// CHECK: ret <16 x i8> [[VQSHL_N]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vqshlq_n_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHL_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.uqshl.v16i8(<16 x i8> [[A]], <16 x i8> zeroinitializer)
+// CHECK-NEXT: ret <16 x i8> [[VQSHL_N]]
+//
uint8x16_t test_vqshlq_n_u8(uint8x16_t a) {
return vqshlq_n_u8(a, 0);
}
-// CHECK-LABEL: @test_vqshl_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VQSHL_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshl.v4i16(<4 x i16> [[VQSHL_N]], <4 x i16> zeroinitializer)
-// CHECK: ret <4 x i16> [[VQSHL_N1]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vqshl_n_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshl.v4i16(<4 x i16> [[A]], <4 x i16> zeroinitializer)
+// CHECK-NEXT: ret <4 x i16> [[VQSHL_N1]]
+//
uint16x4_t test_vqshl_n_u16(uint16x4_t a) {
return vqshl_n_u16(a, 0);
}
-// CHECK-LABEL: @test_vqshlq_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VQSHL_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.uqshl.v8i16(<8 x i16> [[VQSHL_N]], <8 x i16> zeroinitializer)
-// CHECK: ret <8 x i16> [[VQSHL_N1]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vqshlq_n_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.uqshl.v8i16(<8 x i16> [[A]], <8 x i16> zeroinitializer)
+// CHECK-NEXT: ret <8 x i16> [[VQSHL_N1]]
+//
uint16x8_t test_vqshlq_n_u16(uint16x8_t a) {
return vqshlq_n_u16(a, 0);
}
-// CHECK-LABEL: @test_vqshl_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VQSHL_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqshl.v2i32(<2 x i32> [[VQSHL_N]], <2 x i32> zeroinitializer)
-// CHECK: ret <2 x i32> [[VQSHL_N1]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vqshl_n_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqshl.v2i32(<2 x i32> [[A]], <2 x i32> zeroinitializer)
+// CHECK-NEXT: ret <2 x i32> [[VQSHL_N1]]
+//
uint32x2_t test_vqshl_n_u32(uint32x2_t a) {
return vqshl_n_u32(a, 0);
}
-// CHECK-LABEL: @test_vqshlq_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQSHL_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.uqshl.v4i32(<4 x i32> [[VQSHL_N]], <4 x i32> zeroinitializer)
-// CHECK: ret <4 x i32> [[VQSHL_N1]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vqshlq_n_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.uqshl.v4i32(<4 x i32> [[A]], <4 x i32> zeroinitializer)
+// CHECK-NEXT: ret <4 x i32> [[VQSHL_N1]]
+//
uint32x4_t test_vqshlq_n_u32(uint32x4_t a) {
return vqshlq_n_u32(a, 0);
}
-// CHECK-LABEL: @test_vqshlq_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VQSHL_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.uqshl.v2i64(<2 x i64> [[VQSHL_N]], <2 x i64> zeroinitializer)
-// CHECK: ret <2 x i64> [[VQSHL_N1]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vqshlq_n_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.uqshl.v2i64(<2 x i64> [[A]], <2 x i64> zeroinitializer)
+// CHECK-NEXT: ret <2 x i64> [[VQSHL_N1]]
+//
uint64x2_t test_vqshlq_n_u64(uint64x2_t a) {
return vqshlq_n_u64(a, 0);
}
-// CHECK-LABEL: @test_vqshl_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VQSHL_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqshl.v1i64(<1 x i64> [[VQSHL_N]], <1 x i64> splat (i64 1))
-// CHECK: ret <1 x i64> [[VQSHL_N1]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vqshl_n_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqshl.v1i64(<1 x i64> [[A]], <1 x i64> splat (i64 1))
+// CHECK-NEXT: ret <1 x i64> [[VQSHL_N1]]
+//
int64x1_t test_vqshl_n_s64(int64x1_t a) {
return vqshl_n_s64(a, 1);
}
-// CHECK-LABEL: @test_vqshlb_n_u8(
-// CHECK: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 %a, i64 0
-// CHECK: [[VQSHLB_N_U8:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshl.v8i8(<8 x i8> [[TMP0]], <8 x i8> <i8 7, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>)
-// CHECK: [[TMP1:%.*]] = extractelement <8 x i8> [[VQSHLB_N_U8]], i64 0
-// CHECK: ret i8 [[TMP1]]
+// CHECK-LABEL: define dso_local i8 @test_vqshlb_n_u8(
+// CHECK-SAME: i8 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 [[A]], i64 0
+// CHECK-NEXT: [[VQSHLB_N_U8:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshl.v8i8(<8 x i8> [[TMP0]], <8 x i8> <i8 7, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>)
+// CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[VQSHLB_N_U8]], i64 0
+// CHECK-NEXT: ret i8 [[TMP1]]
+//
uint8_t test_vqshlb_n_u8(uint8_t a) {
return (uint8_t)vqshlb_n_u8(a, 7);
}
-// CHECK-LABEL: @test_vqshlh_n_u16(
-// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 %a, i64 0
-// CHECK: [[VQSHLH_N_U16:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshl.v4i16(<4 x i16> [[TMP0]], <4 x i16> <i16 15, i16 poison, i16 poison, i16 poison>)
-// CHECK: [[TMP1:%.*]] = extractelement <4 x i16> [[VQSHLH_N_U16]], i64 0
-// CHECK: ret i16 [[TMP1]]
+// CHECK-LABEL: define dso_local i16 @test_vqshlh_n_u16(
+// CHECK-SAME: i16 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[VQSHLH_N_U16:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshl.v4i16(<4 x i16> [[TMP0]], <4 x i16> <i16 15, i16 poison, i16 poison, i16 poison>)
+// CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[VQSHLH_N_U16]], i64 0
+// CHECK-NEXT: ret i16 [[TMP1]]
+//
uint16_t test_vqshlh_n_u16(uint16_t a) {
return (uint16_t)vqshlh_n_u16(a, 15);
}
-// CHECK-LABEL: @test_vqshls_n_u32(
-// CHECK: [[VQSHLS_N_U32:%.*]] = call i32 @llvm.aarch64.neon.uqshl.i32(i32 %a, i32 31)
-// CHECK: ret i32 [[VQSHLS_N_U32]]
+// CHECK-LABEL: define dso_local i32 @test_vqshls_n_u32(
+// CHECK-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHLS_N_U32:%.*]] = call i32 @llvm.aarch64.neon.uqshl.i32(i32 [[A]], i32 31)
+// CHECK-NEXT: ret i32 [[VQSHLS_N_U32]]
+//
uint32_t test_vqshls_n_u32(uint32_t a) {
return (uint32_t)vqshls_n_u32(a, 31);
}
-// CHECK-LABEL: @test_vqshld_n_u64(
-// CHECK: [[VQSHL_N:%.*]] = call i64 @llvm.aarch64.neon.uqshl.i64(i64 %a, i64 63)
-// CHECK: ret i64 [[VQSHL_N]]
+// CHECK-LABEL: define dso_local i64 @test_vqshld_n_u64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHL_N:%.*]] = call i64 @llvm.aarch64.neon.uqshl.i64(i64 [[A]], i64 63)
+// CHECK-NEXT: ret i64 [[VQSHL_N]]
+//
uint64_t test_vqshld_n_u64(uint64_t a) {
return (uint64_t)vqshld_n_u64(a, 63);
}
-// CHECK-LABEL: @test_vqshl_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VQSHL_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqshl.v1i64(<1 x i64> [[VQSHL_N]], <1 x i64> splat (i64 1))
-// CHECK: ret <1 x i64> [[VQSHL_N1]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vqshl_n_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqshl.v1i64(<1 x i64> [[A]], <1 x i64> splat (i64 1))
+// CHECK-NEXT: ret <1 x i64> [[VQSHL_N1]]
+//
uint64x1_t test_vqshl_n_u64(uint64x1_t a) {
return vqshl_n_u64(a, 1);
}
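
The long run of sqshl/uqshl checks covers the saturating left shifts; a shift count of 0 (used by the vector tests above) is an identity, while a nonzero count clamps on overflow of the element type. For instance, assuming an AArch64 target (counts chosen for illustration):

    #include <arm_neon.h>

    void qshl_demo(void) {
      int8_t a = vqshlb_n_s8(1, 7);     /* 1 << 7 = 128 overflows int8 -> 127 */
      int8_t b = vqshlb_n_s8(-2, 7);    /* -256 underflows -> -128 */
      uint8_t c = vqshlb_n_u8(3, 7);    /* 384 > 255 -> 255 */
      (void)a; (void)b; (void)c;
    }
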
-// CHECK-LABEL: @test_vqshlub_n_s8(
-// CHECK: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 %a, i64 0
-// CHECK: [[VQSHLUB_N_S8:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshlu.v8i8(<8 x i8> [[TMP0]], <8 x i8> <i8 7, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>)
-// CHECK: [[TMP1:%.*]] = extractelement <8 x i8> [[VQSHLUB_N_S8]], i64 0
-// CHECK: ret i8 [[TMP1]]
+// CHECK-LABEL: define dso_local i8 @test_vqshlub_n_s8(
+// CHECK-SAME: i8 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 [[A]], i64 0
+// CHECK-NEXT: [[VQSHLUB_N_S8:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshlu.v8i8(<8 x i8> [[TMP0]], <8 x i8> <i8 7, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>)
+// CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[VQSHLUB_N_S8]], i64 0
+// CHECK-NEXT: ret i8 [[TMP1]]
+//
int8_t test_vqshlub_n_s8(int8_t a) {
return (int8_t)vqshlub_n_s8(a, 7);
}
-// CHECK-LABEL: @test_vqshluh_n_s16(
-// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 %a, i64 0
-// CHECK: [[VQSHLUH_N_S16:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshlu.v4i16(<4 x i16> [[TMP0]], <4 x i16> <i16 15, i16 poison, i16 poison, i16 poison>)
-// CHECK: [[TMP1:%.*]] = extractelement <4 x i16> [[VQSHLUH_N_S16]], i64 0
-// CHECK: ret i16 [[TMP1]]
+// CHECK-LABEL: define dso_local i16 @test_vqshluh_n_s16(
+// CHECK-SAME: i16 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[VQSHLUH_N_S16:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshlu.v4i16(<4 x i16> [[TMP0]], <4 x i16> <i16 15, i16 poison, i16 poison, i16 poison>)
+// CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[VQSHLUH_N_S16]], i64 0
+// CHECK-NEXT: ret i16 [[TMP1]]
+//
int16_t test_vqshluh_n_s16(int16_t a) {
return (int16_t)vqshluh_n_s16(a, 15);
}
-// CHECK-LABEL: @test_vqshlus_n_s32(
-// CHECK: [[VQSHLUS_N_S32:%.*]] = call i32 @llvm.aarch64.neon.sqshlu.i32(i32 %a, i32 31)
-// CHECK: ret i32 [[VQSHLUS_N_S32]]
+// CHECK-LABEL: define dso_local i32 @test_vqshlus_n_s32(
+// CHECK-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHLUS_N_S32:%.*]] = call i32 @llvm.aarch64.neon.sqshlu.i32(i32 [[A]], i32 31)
+// CHECK-NEXT: ret i32 [[VQSHLUS_N_S32]]
+//
int32_t test_vqshlus_n_s32(int32_t a) {
return (int32_t)vqshlus_n_s32(a, 31);
}
-// CHECK-LABEL: @test_vqshlud_n_s64(
-// CHECK: [[VQSHLU_N:%.*]] = call i64 @llvm.aarch64.neon.sqshlu.i64(i64 %a, i64 63)
-// CHECK: ret i64 [[VQSHLU_N]]
+// CHECK-LABEL: define dso_local i64 @test_vqshlud_n_s64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHLU_N:%.*]] = call i64 @llvm.aarch64.neon.sqshlu.i64(i64 [[A]], i64 63)
+// CHECK-NEXT: ret i64 [[VQSHLU_N]]
+//
int64_t test_vqshlud_n_s64(int64_t a) {
return (int64_t)vqshlud_n_s64(a, 63);
}
-// CHECK-LABEL: @test_vqshlu_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[VQSHLU_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VQSHLU_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqshlu.v1i64(<1 x i64> [[VQSHLU_N]], <1 x i64> splat (i64 1))
-// CHECK: ret <1 x i64> [[VQSHLU_N1]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vqshlu_n_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHLU_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqshlu.v1i64(<1 x i64> [[A]], <1 x i64> splat (i64 1))
+// CHECK-NEXT: ret <1 x i64> [[VQSHLU_N1]]
+//
uint64x1_t test_vqshlu_n_s64(int64x1_t a) {
return vqshlu_n_s64(a, 1);
}
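
vqshlu* (the sqshlu checks) takes a signed input, shifts left, and saturates to the unsigned range of the same width, so negative inputs clamp to 0. Briefly, assuming an AArch64 target:

    #include <arm_neon.h>

    void qshlu_demo(void) {
      uint8_t a = vqshlub_n_s8(-1, 7);   /* negative -> 0 */
      uint8_t b = vqshlub_n_s8(2, 7);    /* 2 << 7 = 256 > 255 -> 255 */
      (void)a; (void)b;
    }
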
-// CHECK-LABEL: @test_vsrid_n_s64(
-// CHECK: [[VSRID_N_S64:%.*]] = bitcast i64 %a to <1 x i64>
-// CHECK: [[VSRID_N_S641:%.*]] = bitcast i64 %b to <1 x i64>
-// CHECK: [[VSRID_N_S642:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsri.v1i64(<1 x i64> [[VSRID_N_S64]], <1 x i64> [[VSRID_N_S641]], i32 63)
-// CHECK: [[VSRID_N_S643:%.*]] = bitcast <1 x i64> [[VSRID_N_S642]] to i64
-// CHECK: ret i64 [[VSRID_N_S643]]
+// CHECK-LABEL: define dso_local i64 @test_vsrid_n_s64(
+// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRID_N_S64:%.*]] = insertelement <1 x i64> poison, i64 [[A]], i64 0
+// CHECK-NEXT: [[VSRID_N_S641:%.*]] = insertelement <1 x i64> poison, i64 [[B]], i64 0
+// CHECK-NEXT: [[VSRID_N_S642:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsri.v1i64(<1 x i64> [[VSRID_N_S64]], <1 x i64> [[VSRID_N_S641]], i32 63)
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i64> [[VSRID_N_S642]], i64 0
+// CHECK-NEXT: ret i64 [[TMP0]]
+//
int64_t test_vsrid_n_s64(int64_t a, int64_t b) {
return (int64_t)vsrid_n_s64(a, b, 63);
}
-// CHECK-LABEL: @test_vsri_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VSRI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[VSRI_N2:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsri.v1i64(<1 x i64> [[VSRI_N]], <1 x i64> [[VSRI_N1]], i32 1)
-// CHECK: ret <1 x i64> [[VSRI_N2]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vsri_n_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRI_N2:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsri.v1i64(<1 x i64> [[A]], <1 x i64> [[B]], i32 1)
+// CHECK-NEXT: ret <1 x i64> [[VSRI_N2]]
+//
int64x1_t test_vsri_n_s64(int64x1_t a, int64x1_t b) {
return vsri_n_s64(a, b, 1);
}
-// CHECK-LABEL: @test_vsrid_n_u64(
-// CHECK: [[VSRID_N_U64:%.*]] = bitcast i64 %a to <1 x i64>
-// CHECK: [[VSRID_N_U641:%.*]] = bitcast i64 %b to <1 x i64>
-// CHECK: [[VSRID_N_U642:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsri.v1i64(<1 x i64> [[VSRID_N_U64]], <1 x i64> [[VSRID_N_U641]], i32 63)
-// CHECK: [[VSRID_N_U643:%.*]] = bitcast <1 x i64> [[VSRID_N_U642]] to i64
-// CHECK: ret i64 [[VSRID_N_U643]]
+// CHECK-LABEL: define dso_local i64 @test_vsrid_n_u64(
+// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRID_N_U64:%.*]] = insertelement <1 x i64> poison, i64 [[A]], i64 0
+// CHECK-NEXT: [[VSRID_N_U641:%.*]] = insertelement <1 x i64> poison, i64 [[B]], i64 0
+// CHECK-NEXT: [[VSRID_N_U642:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsri.v1i64(<1 x i64> [[VSRID_N_U64]], <1 x i64> [[VSRID_N_U641]], i32 63)
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i64> [[VSRID_N_U642]], i64 0
+// CHECK-NEXT: ret i64 [[TMP0]]
+//
uint64_t test_vsrid_n_u64(uint64_t a, uint64_t b) {
return (uint64_t)vsrid_n_u64(a, b, 63);
}
-// CHECK-LABEL: @test_vsri_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VSRI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[VSRI_N2:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsri.v1i64(<1 x i64> [[VSRI_N]], <1 x i64> [[VSRI_N1]], i32 1)
-// CHECK: ret <1 x i64> [[VSRI_N2]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vsri_n_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRI_N2:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsri.v1i64(<1 x i64> [[A]], <1 x i64> [[B]], i32 1)
+// CHECK-NEXT: ret <1 x i64> [[VSRI_N2]]
+//
uint64x1_t test_vsri_n_u64(uint64x1_t a, uint64x1_t b) {
return vsri_n_u64(a, b, 1);
}
-// CHECK-LABEL: @test_vslid_n_s64(
-// CHECK: [[VSLID_N_S64:%.*]] = bitcast i64 %a to <1 x i64>
-// CHECK: [[VSLID_N_S641:%.*]] = bitcast i64 %b to <1 x i64>
-// CHECK: [[VSLID_N_S642:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64> [[VSLID_N_S64]], <1 x i64> [[VSLID_N_S641]], i32 63)
-// CHECK: [[VSLID_N_S643:%.*]] = bitcast <1 x i64> [[VSLID_N_S642]] to i64
-// CHECK: ret i64 [[VSLID_N_S643]]
+// CHECK-LABEL: define dso_local i64 @test_vslid_n_s64(
+// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLID_N_S64:%.*]] = insertelement <1 x i64> poison, i64 [[A]], i64 0
+// CHECK-NEXT: [[VSLID_N_S641:%.*]] = insertelement <1 x i64> poison, i64 [[B]], i64 0
+// CHECK-NEXT: [[VSLID_N_S642:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64> [[VSLID_N_S64]], <1 x i64> [[VSLID_N_S641]], i32 63)
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i64> [[VSLID_N_S642]], i64 0
+// CHECK-NEXT: ret i64 [[TMP0]]
+//
int64_t test_vslid_n_s64(int64_t a, int64_t b) {
return (int64_t)vslid_n_s64(a, b, 63);
}
-// CHECK-LABEL: @test_vsli_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[VSLI_N2:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64> [[VSLI_N]], <1 x i64> [[VSLI_N1]], i32 1)
-// CHECK: ret <1 x i64> [[VSLI_N2]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vsli_n_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64> [[A]], <1 x i64> [[B]], i32 1)
+// CHECK-NEXT: ret <1 x i64> [[VSLI_N2]]
+//
int64x1_t test_vsli_n_s64(int64x1_t a, int64x1_t b) {
return vsli_n_s64(a, b, 1);
}
-// CHECK-LABEL: @test_vslid_n_u64(
-// CHECK: [[VSLID_N_U64:%.*]] = bitcast i64 %a to <1 x i64>
-// CHECK: [[VSLID_N_U641:%.*]] = bitcast i64 %b to <1 x i64>
-// CHECK: [[VSLID_N_U642:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64> [[VSLID_N_U64]], <1 x i64> [[VSLID_N_U641]], i32 63)
-// CHECK: [[VSLID_N_U643:%.*]] = bitcast <1 x i64> [[VSLID_N_U642]] to i64
-// CHECK: ret i64 [[VSLID_N_U643]]
+// CHECK-LABEL: define dso_local i64 @test_vslid_n_u64(
+// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLID_N_U64:%.*]] = insertelement <1 x i64> poison, i64 [[A]], i64 0
+// CHECK-NEXT: [[VSLID_N_U641:%.*]] = insertelement <1 x i64> poison, i64 [[B]], i64 0
+// CHECK-NEXT: [[VSLID_N_U642:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64> [[VSLID_N_U64]], <1 x i64> [[VSLID_N_U641]], i32 63)
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i64> [[VSLID_N_U642]], i64 0
+// CHECK-NEXT: ret i64 [[TMP0]]
+//
uint64_t test_vslid_n_u64(uint64_t a, uint64_t b) {
return (uint64_t)vslid_n_u64(a, b, 63);
}
-// CHECK-LABEL: @test_vsli_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[VSLI_N2:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64> [[VSLI_N]], <1 x i64> [[VSLI_N1]], i32 1)
-// CHECK: ret <1 x i64> [[VSLI_N2]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vsli_n_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64> [[A]], <1 x i64> [[B]], i32 1)
+// CHECK-NEXT: ret <1 x i64> [[VSLI_N2]]
+//
uint64x1_t test_vsli_n_u64(uint64x1_t a, uint64x1_t b) {
return vsli_n_u64(a, b, 1);
}
-// CHECK-LABEL: @test_vqshrnh_n_s16(
-// CHECK: [[TMP0:%.*]] = insertelement <8 x i16> poison, i16 %a, i64 0
-// CHECK: [[VQSHRNH_N_S16:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrn.v8i8(<8 x i16> [[TMP0]], i32 8)
-// CHECK: [[TMP1:%.*]] = extractelement <8 x i8> [[VQSHRNH_N_S16]], i64 0
-// CHECK: ret i8 [[TMP1]]
+// CHECK-LABEL: define dso_local i8 @test_vqshrnh_n_s16(
+// CHECK-SAME: i16 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[VQSHRNH_N_S16:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrn.v8i8(<8 x i16> [[TMP0]], i32 8)
+// CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[VQSHRNH_N_S16]], i64 0
+// CHECK-NEXT: ret i8 [[TMP1]]
+//
int8_t test_vqshrnh_n_s16(int16_t a) {
return (int8_t)vqshrnh_n_s16(a, 8);
}
-// CHECK-LABEL: @test_vqshrns_n_s32(
-// CHECK: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 %a, i64 0
-// CHECK: [[VQSHRNS_N_S32:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrn.v4i16(<4 x i32> [[TMP0]], i32 16)
-// CHECK: [[TMP1:%.*]] = extractelement <4 x i16> [[VQSHRNS_N_S32]], i64 0
-// CHECK: ret i16 [[TMP1]]
+// CHECK-LABEL: define dso_local i16 @test_vqshrns_n_s32(
+// CHECK-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i64 0
+// CHECK-NEXT: [[VQSHRNS_N_S32:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrn.v4i16(<4 x i32> [[TMP0]], i32 16)
+// CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[VQSHRNS_N_S32]], i64 0
+// CHECK-NEXT: ret i16 [[TMP1]]
+//
int16_t test_vqshrns_n_s32(int32_t a) {
return (int16_t)vqshrns_n_s32(a, 16);
}
-// CHECK-LABEL: @test_vqshrnd_n_s64(
-// CHECK: [[VQSHRND_N_S64:%.*]] = call i32 @llvm.aarch64.neon.sqshrn.i32(i64 %a, i32 32)
-// CHECK: ret i32 [[VQSHRND_N_S64]]
+// CHECK-LABEL: define dso_local i32 @test_vqshrnd_n_s64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHRND_N_S64:%.*]] = call i32 @llvm.aarch64.neon.sqshrn.i32(i64 [[A]], i32 32)
+// CHECK-NEXT: ret i32 [[VQSHRND_N_S64]]
+//
int32_t test_vqshrnd_n_s64(int64_t a) {
return (int32_t)vqshrnd_n_s64(a, 32);
}
-// CHECK-LABEL: @test_vqshrnh_n_u16(
-// CHECK: [[TMP0:%.*]] = insertelement <8 x i16> poison, i16 %a, i64 0
-// CHECK: [[VQSHRNH_N_U16:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshrn.v8i8(<8 x i16> [[TMP0]], i32 8)
-// CHECK: [[TMP1:%.*]] = extractelement <8 x i8> [[VQSHRNH_N_U16]], i64 0
-// CHECK: ret i8 [[TMP1]]
+// CHECK-LABEL: define dso_local i8 @test_vqshrnh_n_u16(
+// CHECK-SAME: i16 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[VQSHRNH_N_U16:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshrn.v8i8(<8 x i16> [[TMP0]], i32 8)
+// CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[VQSHRNH_N_U16]], i64 0
+// CHECK-NEXT: ret i8 [[TMP1]]
+//
uint8_t test_vqshrnh_n_u16(uint16_t a) {
return (uint8_t)vqshrnh_n_u16(a, 8);
}
-// CHECK-LABEL: @test_vqshrns_n_u32(
-// CHECK: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 %a, i64 0
-// CHECK: [[VQSHRNS_N_U32:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshrn.v4i16(<4 x i32> [[TMP0]], i32 16)
-// CHECK: [[TMP1:%.*]] = extractelement <4 x i16> [[VQSHRNS_N_U32]], i64 0
-// CHECK: ret i16 [[TMP1]]
+// CHECK-LABEL: define dso_local i16 @test_vqshrns_n_u32(
+// CHECK-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i64 0
+// CHECK-NEXT: [[VQSHRNS_N_U32:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshrn.v4i16(<4 x i32> [[TMP0]], i32 16)
+// CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[VQSHRNS_N_U32]], i64 0
+// CHECK-NEXT: ret i16 [[TMP1]]
+//
uint16_t test_vqshrns_n_u32(uint32_t a) {
return (uint16_t)vqshrns_n_u32(a, 16);
}
-// CHECK-LABEL: @test_vqshrnd_n_u64(
-// CHECK: [[VQSHRND_N_U64:%.*]] = call i32 @llvm.aarch64.neon.uqshrn.i32(i64 %a, i32 32)
-// CHECK: ret i32 [[VQSHRND_N_U64]]
+// CHECK-LABEL: define dso_local i32 @test_vqshrnd_n_u64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHRND_N_U64:%.*]] = call i32 @llvm.aarch64.neon.uqshrn.i32(i64 [[A]], i32 32)
+// CHECK-NEXT: ret i32 [[VQSHRND_N_U64]]
+//
uint32_t test_vqshrnd_n_u64(uint64_t a) {
return (uint32_t)vqshrnd_n_u64(a, 32);
}
-// CHECK-LABEL: @test_vqrshrnh_n_s16(
-// CHECK: [[TMP0:%.*]] = insertelement <8 x i16> poison, i16 %a, i64 0
-// CHECK: [[VQRSHRNH_N_S16:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrn.v8i8(<8 x i16> [[TMP0]], i32 8)
-// CHECK: [[TMP1:%.*]] = extractelement <8 x i8> [[VQRSHRNH_N_S16]], i64 0
-// CHECK: ret i8 [[TMP1]]
+// CHECK-LABEL: define dso_local i8 @test_vqrshrnh_n_s16(
+// CHECK-SAME: i16 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[VQRSHRNH_N_S16:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrn.v8i8(<8 x i16> [[TMP0]], i32 8)
+// CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[VQRSHRNH_N_S16]], i64 0
+// CHECK-NEXT: ret i8 [[TMP1]]
+//
int8_t test_vqrshrnh_n_s16(int16_t a) {
return (int8_t)vqrshrnh_n_s16(a, 8);
}
-// CHECK-LABEL: @test_vqrshrns_n_s32(
-// CHECK: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 %a, i64 0
-// CHECK: [[VQRSHRNS_N_S32:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrn.v4i16(<4 x i32> [[TMP0]], i32 16)
-// CHECK: [[TMP1:%.*]] = extractelement <4 x i16> [[VQRSHRNS_N_S32]], i64 0
-// CHECK: ret i16 [[TMP1]]
+// CHECK-LABEL: define dso_local i16 @test_vqrshrns_n_s32(
+// CHECK-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i64 0
+// CHECK-NEXT: [[VQRSHRNS_N_S32:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrn.v4i16(<4 x i32> [[TMP0]], i32 16)
+// CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[VQRSHRNS_N_S32]], i64 0
+// CHECK-NEXT: ret i16 [[TMP1]]
+//
int16_t test_vqrshrns_n_s32(int32_t a) {
return (int16_t)vqrshrns_n_s32(a, 16);
}
-// CHECK-LABEL: @test_vqrshrnd_n_s64(
-// CHECK: [[VQRSHRND_N_S64:%.*]] = call i32 @llvm.aarch64.neon.sqrshrn.i32(i64 %a, i32 32)
-// CHECK: ret i32 [[VQRSHRND_N_S64]]
+// CHECK-LABEL: define dso_local i32 @test_vqrshrnd_n_s64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHRND_N_S64:%.*]] = call i32 @llvm.aarch64.neon.sqrshrn.i32(i64 [[A]], i32 32)
+// CHECK-NEXT: ret i32 [[VQRSHRND_N_S64]]
+//
int32_t test_vqrshrnd_n_s64(int64_t a) {
return (int32_t)vqrshrnd_n_s64(a, 32);
}
-// CHECK-LABEL: @test_vqrshrnh_n_u16(
-// CHECK: [[TMP0:%.*]] = insertelement <8 x i16> poison, i16 %a, i64 0
-// CHECK: [[VQRSHRNH_N_U16:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqrshrn.v8i8(<8 x i16> [[TMP0]], i32 8)
-// CHECK: [[TMP1:%.*]] = extractelement <8 x i8> [[VQRSHRNH_N_U16]], i64 0
-// CHECK: ret i8 [[TMP1]]
+// CHECK-LABEL: define dso_local i8 @test_vqrshrnh_n_u16(
+// CHECK-SAME: i16 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[VQRSHRNH_N_U16:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqrshrn.v8i8(<8 x i16> [[TMP0]], i32 8)
+// CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[VQRSHRNH_N_U16]], i64 0
+// CHECK-NEXT: ret i8 [[TMP1]]
+//
uint8_t test_vqrshrnh_n_u16(uint16_t a) {
return (uint8_t)vqrshrnh_n_u16(a, 8);
}
-// CHECK-LABEL: @test_vqrshrns_n_u32(
-// CHECK: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 %a, i64 0
-// CHECK: [[VQRSHRNS_N_U32:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqrshrn.v4i16(<4 x i32> [[TMP0]], i32 16)
-// CHECK: [[TMP1:%.*]] = extractelement <4 x i16> [[VQRSHRNS_N_U32]], i64 0
-// CHECK: ret i16 [[TMP1]]
+// CHECK-LABEL: define dso_local i16 @test_vqrshrns_n_u32(
+// CHECK-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i64 0
+// CHECK-NEXT: [[VQRSHRNS_N_U32:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqrshrn.v4i16(<4 x i32> [[TMP0]], i32 16)
+// CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[VQRSHRNS_N_U32]], i64 0
+// CHECK-NEXT: ret i16 [[TMP1]]
+//
uint16_t test_vqrshrns_n_u32(uint32_t a) {
return (uint16_t)vqrshrns_n_u32(a, 16);
}
-// CHECK-LABEL: @test_vqrshrnd_n_u64(
-// CHECK: [[VQRSHRND_N_U64:%.*]] = call i32 @llvm.aarch64.neon.uqrshrn.i32(i64 %a, i32 32)
-// CHECK: ret i32 [[VQRSHRND_N_U64]]
+// CHECK-LABEL: define dso_local i32 @test_vqrshrnd_n_u64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHRND_N_U64:%.*]] = call i32 @llvm.aarch64.neon.uqrshrn.i32(i64 [[A]], i32 32)
+// CHECK-NEXT: ret i32 [[VQRSHRND_N_U64]]
+//
uint32_t test_vqrshrnd_n_u64(uint64_t a) {
return (uint32_t)vqrshrnd_n_u64(a, 32);
}
-// CHECK-LABEL: @test_vqshrunh_n_s16(
-// CHECK: [[TMP0:%.*]] = insertelement <8 x i16> poison, i16 %a, i64 0
-// CHECK: [[VQSHRUNH_N_S16:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrun.v8i8(<8 x i16> [[TMP0]], i32 8)
-// CHECK: [[TMP1:%.*]] = extractelement <8 x i8> [[VQSHRUNH_N_S16]], i64 0
-// CHECK: ret i8 [[TMP1]]
+// CHECK-LABEL: define dso_local i8 @test_vqshrunh_n_s16(
+// CHECK-SAME: i16 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[VQSHRUNH_N_S16:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrun.v8i8(<8 x i16> [[TMP0]], i32 8)
+// CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[VQSHRUNH_N_S16]], i64 0
+// CHECK-NEXT: ret i8 [[TMP1]]
+//
int8_t test_vqshrunh_n_s16(int16_t a) {
return (int8_t)vqshrunh_n_s16(a, 8);
}
-// CHECK-LABEL: @test_vqshruns_n_s32(
-// CHECK: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 %a, i64 0
-// CHECK: [[VQSHRUNS_N_S32:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrun.v4i16(<4 x i32> [[TMP0]], i32 16)
-// CHECK: [[TMP1:%.*]] = extractelement <4 x i16> [[VQSHRUNS_N_S32]], i64 0
-// CHECK: ret i16 [[TMP1]]
+// CHECK-LABEL: define dso_local i16 @test_vqshruns_n_s32(
+// CHECK-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i64 0
+// CHECK-NEXT: [[VQSHRUNS_N_S32:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrun.v4i16(<4 x i32> [[TMP0]], i32 16)
+// CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[VQSHRUNS_N_S32]], i64 0
+// CHECK-NEXT: ret i16 [[TMP1]]
+//
int16_t test_vqshruns_n_s32(int32_t a) {
return (int16_t)vqshruns_n_s32(a, 16);
}
-// CHECK-LABEL: @test_vqshrund_n_s64(
-// CHECK: [[VQSHRUND_N_S64:%.*]] = call i32 @llvm.aarch64.neon.sqshrun.i32(i64 %a, i32 32)
-// CHECK: ret i32 [[VQSHRUND_N_S64]]
+// CHECK-LABEL: define dso_local i32 @test_vqshrund_n_s64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHRUND_N_S64:%.*]] = call i32 @llvm.aarch64.neon.sqshrun.i32(i64 [[A]], i32 32)
+// CHECK-NEXT: ret i32 [[VQSHRUND_N_S64]]
+//
int32_t test_vqshrund_n_s64(int64_t a) {
return (int32_t)vqshrund_n_s64(a, 32);
}
-// CHECK-LABEL: @test_vqrshrunh_n_s16(
-// CHECK: [[TMP0:%.*]] = insertelement <8 x i16> poison, i16 %a, i64 0
-// CHECK: [[VQRSHRUNH_N_S16:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrun.v8i8(<8 x i16> [[TMP0]], i32 8)
-// CHECK: [[TMP1:%.*]] = extractelement <8 x i8> [[VQRSHRUNH_N_S16]], i64 0
-// CHECK: ret i8 [[TMP1]]
+// CHECK-LABEL: define dso_local i8 @test_vqrshrunh_n_s16(
+// CHECK-SAME: i16 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[VQRSHRUNH_N_S16:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrun.v8i8(<8 x i16> [[TMP0]], i32 8)
+// CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[VQRSHRUNH_N_S16]], i64 0
+// CHECK-NEXT: ret i8 [[TMP1]]
+//
uint8_t test_vqrshrunh_n_s16(int16_t a) {
return (uint8_t)vqrshrunh_n_s16(a, 8);
}
-// CHECK-LABEL: @test_vqrshruns_n_s32(
-// CHECK: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 %a, i64 0
-// CHECK: [[VQRSHRUNS_N_S32:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrun.v4i16(<4 x i32> [[TMP0]], i32 16)
-// CHECK: [[TMP1:%.*]] = extractelement <4 x i16> [[VQRSHRUNS_N_S32]], i64 0
-// CHECK: ret i16 [[TMP1]]
+// CHECK-LABEL: define dso_local i16 @test_vqrshruns_n_s32(
+// CHECK-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i64 0
+// CHECK-NEXT: [[VQRSHRUNS_N_S32:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrun.v4i16(<4 x i32> [[TMP0]], i32 16)
+// CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[VQRSHRUNS_N_S32]], i64 0
+// CHECK-NEXT: ret i16 [[TMP1]]
+//
uint16_t test_vqrshruns_n_s32(int32_t a) {
return (uint16_t)vqrshruns_n_s32(a, 16);
}
-// CHECK-LABEL: @test_vqrshrund_n_s64(
-// CHECK: [[VQRSHRUND_N_S64:%.*]] = call i32 @llvm.aarch64.neon.sqrshrun.i32(i64 %a, i32 32)
-// CHECK: ret i32 [[VQRSHRUND_N_S64]]
+// CHECK-LABEL: define dso_local i32 @test_vqrshrund_n_s64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHRUND_N_S64:%.*]] = call i32 @llvm.aarch64.neon.sqrshrun.i32(i64 [[A]], i32 32)
+// CHECK-NEXT: ret i32 [[VQRSHRUND_N_S64]]
+//
uint32_t test_vqrshrund_n_s64(int64_t a) {
return (uint32_t)vqrshrund_n_s64(a, 32);
}
-// CHECK-LABEL: @test_vcvts_n_f32_s32(
-// CHECK: [[VCVTS_N_F32_S32:%.*]] = call float @llvm.aarch64.neon.vcvtfxs2fp.f32.i32(i32 %a, i32 1)
-// CHECK: ret float [[VCVTS_N_F32_S32]]
+// CHECK-LABEL: define dso_local float @test_vcvts_n_f32_s32(
+// CHECK-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTS_N_F32_S32:%.*]] = call float @llvm.aarch64.neon.vcvtfxs2fp.f32.i32(i32 [[A]], i32 1)
+// CHECK-NEXT: ret float [[VCVTS_N_F32_S32]]
+//
float32_t test_vcvts_n_f32_s32(int32_t a) {
return vcvts_n_f32_s32(a, 1);
}
-// CHECK-LABEL: @test_vcvtd_n_f64_s64(
-// CHECK: [[VCVTD_N_F64_S64:%.*]] = call double @llvm.aarch64.neon.vcvtfxs2fp.f64.i64(i64 %a, i32 1)
-// CHECK: ret double [[VCVTD_N_F64_S64]]
+// CHECK-LABEL: define dso_local double @test_vcvtd_n_f64_s64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTD_N_F64_S64:%.*]] = call double @llvm.aarch64.neon.vcvtfxs2fp.f64.i64(i64 [[A]], i32 1)
+// CHECK-NEXT: ret double [[VCVTD_N_F64_S64]]
+//
float64_t test_vcvtd_n_f64_s64(int64_t a) {
return vcvtd_n_f64_s64(a, 1);
}
-// CHECK-LABEL: @test_vcvts_n_f32_u32(
-// CHECK: [[VCVTS_N_F32_U32:%.*]] = call float @llvm.aarch64.neon.vcvtfxu2fp.f32.i32(i32 %a, i32 32)
-// CHECK: ret float [[VCVTS_N_F32_U32]]
+// CHECK-LABEL: define dso_local float @test_vcvts_n_f32_u32(
+// CHECK-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTS_N_F32_U32:%.*]] = call float @llvm.aarch64.neon.vcvtfxu2fp.f32.i32(i32 [[A]], i32 32)
+// CHECK-NEXT: ret float [[VCVTS_N_F32_U32]]
+//
float32_t test_vcvts_n_f32_u32(uint32_t a) {
return vcvts_n_f32_u32(a, 32);
}
-// CHECK-LABEL: @test_vcvtd_n_f64_u64(
-// CHECK: [[VCVTD_N_F64_U64:%.*]] = call double @llvm.aarch64.neon.vcvtfxu2fp.f64.i64(i64 %a, i32 64)
-// CHECK: ret double [[VCVTD_N_F64_U64]]
+// CHECK-LABEL: define dso_local double @test_vcvtd_n_f64_u64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTD_N_F64_U64:%.*]] = call double @llvm.aarch64.neon.vcvtfxu2fp.f64.i64(i64 [[A]], i32 64)
+// CHECK-NEXT: ret double [[VCVTD_N_F64_U64]]
+//
float64_t test_vcvtd_n_f64_u64(uint64_t a) {
return vcvtd_n_f64_u64(a, 64);
}
-// CHECK-LABEL: @test_vcvts_n_s32_f32(
-// CHECK: [[VCVTS_N_S32_F32:%.*]] = call i32 @llvm.aarch64.neon.vcvtfp2fxs.i32.f32(float %a, i32 1)
-// CHECK: ret i32 [[VCVTS_N_S32_F32]]
+// CHECK-LABEL: define dso_local i32 @test_vcvts_n_s32_f32(
+// CHECK-SAME: float noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTS_N_S32_F32:%.*]] = call i32 @llvm.aarch64.neon.vcvtfp2fxs.i32.f32(float [[A]], i32 1)
+// CHECK-NEXT: ret i32 [[VCVTS_N_S32_F32]]
+//
int32_t test_vcvts_n_s32_f32(float32_t a) {
return (int32_t)vcvts_n_s32_f32(a, 1);
}
-// CHECK-LABEL: @test_vcvtd_n_s64_f64(
-// CHECK: [[VCVTD_N_S64_F64:%.*]] = call i64 @llvm.aarch64.neon.vcvtfp2fxs.i64.f64(double %a, i32 1)
-// CHECK: ret i64 [[VCVTD_N_S64_F64]]
+// CHECK-LABEL: define dso_local i64 @test_vcvtd_n_s64_f64(
+// CHECK-SAME: double noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTD_N_S64_F64:%.*]] = call i64 @llvm.aarch64.neon.vcvtfp2fxs.i64.f64(double [[A]], i32 1)
+// CHECK-NEXT: ret i64 [[VCVTD_N_S64_F64]]
+//
int64_t test_vcvtd_n_s64_f64(float64_t a) {
return (int64_t)vcvtd_n_s64_f64(a, 1);
}
-// CHECK-LABEL: @test_vcvts_n_u32_f32(
-// CHECK: [[VCVTS_N_U32_F32:%.*]] = call i32 @llvm.aarch64.neon.vcvtfp2fxu.i32.f32(float %a, i32 32)
-// CHECK: ret i32 [[VCVTS_N_U32_F32]]
+// CHECK-LABEL: define dso_local i32 @test_vcvts_n_u32_f32(
+// CHECK-SAME: float noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTS_N_U32_F32:%.*]] = call i32 @llvm.aarch64.neon.vcvtfp2fxu.i32.f32(float [[A]], i32 32)
+// CHECK-NEXT: ret i32 [[VCVTS_N_U32_F32]]
+//
uint32_t test_vcvts_n_u32_f32(float32_t a) {
return (uint32_t)vcvts_n_u32_f32(a, 32);
}
-// CHECK-LABEL: @test_vcvtd_n_u64_f64(
-// CHECK: [[VCVTD_N_U64_F64:%.*]] = call i64 @llvm.aarch64.neon.vcvtfp2fxu.i64.f64(double %a, i32 64)
-// CHECK: ret i64 [[VCVTD_N_U64_F64]]
+// CHECK-LABEL: define dso_local i64 @test_vcvtd_n_u64_f64(
+// CHECK-SAME: double noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTD_N_U64_F64:%.*]] = call i64 @llvm.aarch64.neon.vcvtfp2fxu.i64.f64(double [[A]], i32 64)
+// CHECK-NEXT: ret i64 [[VCVTD_N_U64_F64]]
+//
uint64_t test_vcvtd_n_u64_f64(float64_t a) {
return (uint64_t)vcvtd_n_u64_f64(a, 64);
}
-// CHECK-LABEL: @test_vreinterpret_s8_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_s8_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
int8x8_t test_vreinterpret_s8_s16(int16x4_t a) {
return vreinterpret_s8_s16(a);
}
-// CHECK-LABEL: @test_vreinterpret_s8_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_s8_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
int8x8_t test_vreinterpret_s8_s32(int32x2_t a) {
return vreinterpret_s8_s32(a);
}
-// CHECK-LABEL: @test_vreinterpret_s8_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_s8_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
int8x8_t test_vreinterpret_s8_s64(int64x1_t a) {
return vreinterpret_s8_s64(a);
}
-// CHECK-LABEL: @test_vreinterpret_s8_u8(
-// CHECK: ret <8 x i8> %a
+// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_s8_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <8 x i8> [[A]]
+//
int8x8_t test_vreinterpret_s8_u8(uint8x8_t a) {
return vreinterpret_s8_u8(a);
}
-// CHECK-LABEL: @test_vreinterpret_s8_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_s8_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
int8x8_t test_vreinterpret_s8_u16(uint16x4_t a) {
return vreinterpret_s8_u16(a);
}
-// CHECK-LABEL: @test_vreinterpret_s8_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_s8_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
int8x8_t test_vreinterpret_s8_u32(uint32x2_t a) {
return vreinterpret_s8_u32(a);
}
-// CHECK-LABEL: @test_vreinterpret_s8_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_s8_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
int8x8_t test_vreinterpret_s8_u64(uint64x1_t a) {
return vreinterpret_s8_u64(a);
}
-// CHECK-LABEL: @test_vreinterpret_s8_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_s8_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
int8x8_t test_vreinterpret_s8_f16(float16x4_t a) {
return vreinterpret_s8_f16(a);
}
-// CHECK-LABEL: @test_vreinterpret_s8_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_s8_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
int8x8_t test_vreinterpret_s8_f32(float32x2_t a) {
return vreinterpret_s8_f32(a);
}
-// CHECK-LABEL: @test_vreinterpret_s8_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_s8_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
int8x8_t test_vreinterpret_s8_f64(float64x1_t a) {
return vreinterpret_s8_f64(a);
}
-// CHECK-LABEL: @test_vreinterpret_s8_p8(
-// CHECK: ret <8 x i8> %a
+// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_s8_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <8 x i8> [[A]]
+//
int8x8_t test_vreinterpret_s8_p8(poly8x8_t a) {
return vreinterpret_s8_p8(a);
}
-// CHECK-LABEL: @test_vreinterpret_s8_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_s8_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
int8x8_t test_vreinterpret_s8_p16(poly16x4_t a) {
return vreinterpret_s8_p16(a);
}
-// CHECK-LABEL: @test_vreinterpret_s8_p64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_s8_p64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
int8x8_t test_vreinterpret_s8_p64(poly64x1_t a) {
return vreinterpret_s8_p64(a);
}
-// CHECK-LABEL: @test_vreinterpret_s16_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_s16_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
int16x4_t test_vreinterpret_s16_s8(int8x8_t a) {
return vreinterpret_s16_s8(a);
}
-// CHECK-LABEL: @test_vreinterpret_s16_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_s16_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
int16x4_t test_vreinterpret_s16_s32(int32x2_t a) {
return vreinterpret_s16_s32(a);
}
-// CHECK-LABEL: @test_vreinterpret_s16_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_s16_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
int16x4_t test_vreinterpret_s16_s64(int64x1_t a) {
return vreinterpret_s16_s64(a);
}
-// CHECK-LABEL: @test_vreinterpret_s16_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_s16_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
int16x4_t test_vreinterpret_s16_u8(uint8x8_t a) {
return vreinterpret_s16_u8(a);
}
-// CHECK-LABEL: @test_vreinterpret_s16_u16(
-// CHECK: ret <4 x i16> %a
+// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_s16_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <4 x i16> [[A]]
+//
int16x4_t test_vreinterpret_s16_u16(uint16x4_t a) {
return vreinterpret_s16_u16(a);
}
-// CHECK-LABEL: @test_vreinterpret_s16_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_s16_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
int16x4_t test_vreinterpret_s16_u32(uint32x2_t a) {
return vreinterpret_s16_u32(a);
}
-// CHECK-LABEL: @test_vreinterpret_s16_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_s16_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
int16x4_t test_vreinterpret_s16_u64(uint64x1_t a) {
return vreinterpret_s16_u64(a);
}
-// CHECK-LABEL: @test_vreinterpret_s16_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_s16_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
int16x4_t test_vreinterpret_s16_f16(float16x4_t a) {
return vreinterpret_s16_f16(a);
}
-// CHECK-LABEL: @test_vreinterpret_s16_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_s16_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
int16x4_t test_vreinterpret_s16_f32(float32x2_t a) {
return vreinterpret_s16_f32(a);
}
-// CHECK-LABEL: @test_vreinterpret_s16_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_s16_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
int16x4_t test_vreinterpret_s16_f64(float64x1_t a) {
return vreinterpret_s16_f64(a);
}
-// CHECK-LABEL: @test_vreinterpret_s16_p8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_s16_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
int16x4_t test_vreinterpret_s16_p8(poly8x8_t a) {
return vreinterpret_s16_p8(a);
}
-// CHECK-LABEL: @test_vreinterpret_s16_p16(
-// CHECK: ret <4 x i16> %a
+// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_s16_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <4 x i16> [[A]]
+//
int16x4_t test_vreinterpret_s16_p16(poly16x4_t a) {
return vreinterpret_s16_p16(a);
}
-// CHECK-LABEL: @test_vreinterpret_s16_p64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_s16_p64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
int16x4_t test_vreinterpret_s16_p64(poly64x1_t a) {
return vreinterpret_s16_p64(a);
}
-// CHECK-LABEL: @test_vreinterpret_s32_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_s32_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
int32x2_t test_vreinterpret_s32_s8(int8x8_t a) {
return vreinterpret_s32_s8(a);
}
-// CHECK-LABEL: @test_vreinterpret_s32_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_s32_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
int32x2_t test_vreinterpret_s32_s16(int16x4_t a) {
return vreinterpret_s32_s16(a);
}
-// CHECK-LABEL: @test_vreinterpret_s32_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_s32_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
int32x2_t test_vreinterpret_s32_s64(int64x1_t a) {
return vreinterpret_s32_s64(a);
}
-// CHECK-LABEL: @test_vreinterpret_s32_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_s32_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
int32x2_t test_vreinterpret_s32_u8(uint8x8_t a) {
return vreinterpret_s32_u8(a);
}
-// CHECK-LABEL: @test_vreinterpret_s32_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_s32_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
int32x2_t test_vreinterpret_s32_u16(uint16x4_t a) {
return vreinterpret_s32_u16(a);
}
-// CHECK-LABEL: @test_vreinterpret_s32_u32(
-// CHECK: ret <2 x i32> %a
+// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_s32_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <2 x i32> [[A]]
+//
int32x2_t test_vreinterpret_s32_u32(uint32x2_t a) {
return vreinterpret_s32_u32(a);
}
-// CHECK-LABEL: @test_vreinterpret_s32_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_s32_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
int32x2_t test_vreinterpret_s32_u64(uint64x1_t a) {
return vreinterpret_s32_u64(a);
}
-// CHECK-LABEL: @test_vreinterpret_s32_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_s32_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
int32x2_t test_vreinterpret_s32_f16(float16x4_t a) {
return vreinterpret_s32_f16(a);
}
-// CHECK-LABEL: @test_vreinterpret_s32_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_s32_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
int32x2_t test_vreinterpret_s32_f32(float32x2_t a) {
return vreinterpret_s32_f32(a);
}
-// CHECK-LABEL: @test_vreinterpret_s32_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_s32_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
int32x2_t test_vreinterpret_s32_f64(float64x1_t a) {
return vreinterpret_s32_f64(a);
}
-// CHECK-LABEL: @test_vreinterpret_s32_p8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_s32_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
int32x2_t test_vreinterpret_s32_p8(poly8x8_t a) {
return vreinterpret_s32_p8(a);
}
-// CHECK-LABEL: @test_vreinterpret_s32_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_s32_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
int32x2_t test_vreinterpret_s32_p16(poly16x4_t a) {
return vreinterpret_s32_p16(a);
}
-// CHECK-LABEL: @test_vreinterpret_s32_p64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_s32_p64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
int32x2_t test_vreinterpret_s32_p64(poly64x1_t a) {
return vreinterpret_s32_p64(a);
}
-// CHECK-LABEL: @test_vreinterpret_s64_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_s64_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
int64x1_t test_vreinterpret_s64_s8(int8x8_t a) {
return vreinterpret_s64_s8(a);
}
-// CHECK-LABEL: @test_vreinterpret_s64_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_s64_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
int64x1_t test_vreinterpret_s64_s16(int16x4_t a) {
return vreinterpret_s64_s16(a);
}
-// CHECK-LABEL: @test_vreinterpret_s64_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_s64_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
int64x1_t test_vreinterpret_s64_s32(int32x2_t a) {
return vreinterpret_s64_s32(a);
}
-// CHECK-LABEL: @test_vreinterpret_s64_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_s64_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
int64x1_t test_vreinterpret_s64_u8(uint8x8_t a) {
return vreinterpret_s64_u8(a);
}
-// CHECK-LABEL: @test_vreinterpret_s64_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_s64_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
int64x1_t test_vreinterpret_s64_u16(uint16x4_t a) {
return vreinterpret_s64_u16(a);
}
-// CHECK-LABEL: @test_vreinterpret_s64_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_s64_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
int64x1_t test_vreinterpret_s64_u32(uint32x2_t a) {
return vreinterpret_s64_u32(a);
}
-// CHECK-LABEL: @test_vreinterpret_s64_u64(
-// CHECK: ret <1 x i64> %a
+// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_s64_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <1 x i64> [[A]]
+//
int64x1_t test_vreinterpret_s64_u64(uint64x1_t a) {
return vreinterpret_s64_u64(a);
}
-// CHECK-LABEL: @test_vreinterpret_s64_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_s64_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
int64x1_t test_vreinterpret_s64_f16(float16x4_t a) {
return vreinterpret_s64_f16(a);
}
-// CHECK-LABEL: @test_vreinterpret_s64_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_s64_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
int64x1_t test_vreinterpret_s64_f32(float32x2_t a) {
return vreinterpret_s64_f32(a);
}
-// CHECK-LABEL: @test_vreinterpret_s64_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_s64_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
int64x1_t test_vreinterpret_s64_f64(float64x1_t a) {
return vreinterpret_s64_f64(a);
}
-// CHECK-LABEL: @test_vreinterpret_s64_p8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_s64_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
int64x1_t test_vreinterpret_s64_p8(poly8x8_t a) {
return vreinterpret_s64_p8(a);
}
-// CHECK-LABEL: @test_vreinterpret_s64_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_s64_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
int64x1_t test_vreinterpret_s64_p16(poly16x4_t a) {
return vreinterpret_s64_p16(a);
}
-// CHECK-LABEL: @test_vreinterpret_s64_p64(
-// CHECK: ret <1 x i64> %a
+// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_s64_p64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <1 x i64> [[A]]
+//
int64x1_t test_vreinterpret_s64_p64(poly64x1_t a) {
return vreinterpret_s64_p64(a);
}
-// CHECK-LABEL: @test_vreinterpret_u8_s8(
-// CHECK: ret <8 x i8> %a
+// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_u8_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <8 x i8> [[A]]
+//
uint8x8_t test_vreinterpret_u8_s8(int8x8_t a) {
return vreinterpret_u8_s8(a);
}
-// CHECK-LABEL: @test_vreinterpret_u8_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_u8_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
uint8x8_t test_vreinterpret_u8_s16(int16x4_t a) {
return vreinterpret_u8_s16(a);
}
-// CHECK-LABEL: @test_vreinterpret_u8_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_u8_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
uint8x8_t test_vreinterpret_u8_s32(int32x2_t a) {
return vreinterpret_u8_s32(a);
}
-// CHECK-LABEL: @test_vreinterpret_u8_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_u8_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
uint8x8_t test_vreinterpret_u8_s64(int64x1_t a) {
return vreinterpret_u8_s64(a);
}
-// CHECK-LABEL: @test_vreinterpret_u8_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_u8_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
uint8x8_t test_vreinterpret_u8_u16(uint16x4_t a) {
return vreinterpret_u8_u16(a);
}
-// CHECK-LABEL: @test_vreinterpret_u8_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_u8_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
uint8x8_t test_vreinterpret_u8_u32(uint32x2_t a) {
return vreinterpret_u8_u32(a);
}
-// CHECK-LABEL: @test_vreinterpret_u8_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_u8_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
uint8x8_t test_vreinterpret_u8_u64(uint64x1_t a) {
return vreinterpret_u8_u64(a);
}
-// CHECK-LABEL: @test_vreinterpret_u8_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_u8_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
uint8x8_t test_vreinterpret_u8_f16(float16x4_t a) {
return vreinterpret_u8_f16(a);
}
-// CHECK-LABEL: @test_vreinterpret_u8_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_u8_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
uint8x8_t test_vreinterpret_u8_f32(float32x2_t a) {
return vreinterpret_u8_f32(a);
}
-// CHECK-LABEL: @test_vreinterpret_u8_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_u8_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
uint8x8_t test_vreinterpret_u8_f64(float64x1_t a) {
return vreinterpret_u8_f64(a);
}
-// CHECK-LABEL: @test_vreinterpret_u8_p8(
-// CHECK: ret <8 x i8> %a
+// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_u8_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <8 x i8> [[A]]
+//
uint8x8_t test_vreinterpret_u8_p8(poly8x8_t a) {
return vreinterpret_u8_p8(a);
}
-// CHECK-LABEL: @test_vreinterpret_u8_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_u8_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
uint8x8_t test_vreinterpret_u8_p16(poly16x4_t a) {
return vreinterpret_u8_p16(a);
}
-// CHECK-LABEL: @test_vreinterpret_u8_p64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_u8_p64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
uint8x8_t test_vreinterpret_u8_p64(poly64x1_t a) {
return vreinterpret_u8_p64(a);
}
-// CHECK-LABEL: @test_vreinterpret_u16_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_u16_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
uint16x4_t test_vreinterpret_u16_s8(int8x8_t a) {
return vreinterpret_u16_s8(a);
}
-// CHECK-LABEL: @test_vreinterpret_u16_s16(
-// CHECK: ret <4 x i16> %a
+// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_u16_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <4 x i16> [[A]]
+//
uint16x4_t test_vreinterpret_u16_s16(int16x4_t a) {
return vreinterpret_u16_s16(a);
}
-// CHECK-LABEL: @test_vreinterpret_u16_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_u16_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
uint16x4_t test_vreinterpret_u16_s32(int32x2_t a) {
return vreinterpret_u16_s32(a);
}
-// CHECK-LABEL: @test_vreinterpret_u16_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_u16_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
uint16x4_t test_vreinterpret_u16_s64(int64x1_t a) {
return vreinterpret_u16_s64(a);
}
-// CHECK-LABEL: @test_vreinterpret_u16_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_u16_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
uint16x4_t test_vreinterpret_u16_u8(uint8x8_t a) {
return vreinterpret_u16_u8(a);
}
-// CHECK-LABEL: @test_vreinterpret_u16_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_u16_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
uint16x4_t test_vreinterpret_u16_u32(uint32x2_t a) {
return vreinterpret_u16_u32(a);
}
-// CHECK-LABEL: @test_vreinterpret_u16_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_u16_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
uint16x4_t test_vreinterpret_u16_u64(uint64x1_t a) {
return vreinterpret_u16_u64(a);
}
-// CHECK-LABEL: @test_vreinterpret_u16_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_u16_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
uint16x4_t test_vreinterpret_u16_f16(float16x4_t a) {
return vreinterpret_u16_f16(a);
}
-// CHECK-LABEL: @test_vreinterpret_u16_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_u16_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
uint16x4_t test_vreinterpret_u16_f32(float32x2_t a) {
return vreinterpret_u16_f32(a);
}
-// CHECK-LABEL: @test_vreinterpret_u16_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_u16_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
uint16x4_t test_vreinterpret_u16_f64(float64x1_t a) {
return vreinterpret_u16_f64(a);
}
-// CHECK-LABEL: @test_vreinterpret_u16_p8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_u16_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
uint16x4_t test_vreinterpret_u16_p8(poly8x8_t a) {
return vreinterpret_u16_p8(a);
}
-// CHECK-LABEL: @test_vreinterpret_u16_p16(
-// CHECK: ret <4 x i16> %a
+// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_u16_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <4 x i16> [[A]]
+//
uint16x4_t test_vreinterpret_u16_p16(poly16x4_t a) {
return vreinterpret_u16_p16(a);
}
-// CHECK-LABEL: @test_vreinterpret_u16_p64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_u16_p64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
uint16x4_t test_vreinterpret_u16_p64(poly64x1_t a) {
return vreinterpret_u16_p64(a);
}
-// CHECK-LABEL: @test_vreinterpret_u32_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_u32_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
uint32x2_t test_vreinterpret_u32_s8(int8x8_t a) {
return vreinterpret_u32_s8(a);
}
-// CHECK-LABEL: @test_vreinterpret_u32_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_u32_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
uint32x2_t test_vreinterpret_u32_s16(int16x4_t a) {
return vreinterpret_u32_s16(a);
}
-// CHECK-LABEL: @test_vreinterpret_u32_s32(
-// CHECK: ret <2 x i32> %a
+// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_u32_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <2 x i32> [[A]]
+//
uint32x2_t test_vreinterpret_u32_s32(int32x2_t a) {
return vreinterpret_u32_s32(a);
}
-// CHECK-LABEL: @test_vreinterpret_u32_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_u32_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
uint32x2_t test_vreinterpret_u32_s64(int64x1_t a) {
return vreinterpret_u32_s64(a);
}
-// CHECK-LABEL: @test_vreinterpret_u32_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_u32_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
uint32x2_t test_vreinterpret_u32_u8(uint8x8_t a) {
return vreinterpret_u32_u8(a);
}
-// CHECK-LABEL: @test_vreinterpret_u32_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_u32_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
uint32x2_t test_vreinterpret_u32_u16(uint16x4_t a) {
return vreinterpret_u32_u16(a);
}
-// CHECK-LABEL: @test_vreinterpret_u32_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_u32_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
uint32x2_t test_vreinterpret_u32_u64(uint64x1_t a) {
return vreinterpret_u32_u64(a);
}
-// CHECK-LABEL: @test_vreinterpret_u32_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_u32_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
uint32x2_t test_vreinterpret_u32_f16(float16x4_t a) {
return vreinterpret_u32_f16(a);
}
-// CHECK-LABEL: @test_vreinterpret_u32_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_u32_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
uint32x2_t test_vreinterpret_u32_f32(float32x2_t a) {
return vreinterpret_u32_f32(a);
}
-// CHECK-LABEL: @test_vreinterpret_u32_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_u32_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
uint32x2_t test_vreinterpret_u32_f64(float64x1_t a) {
return vreinterpret_u32_f64(a);
}
-// CHECK-LABEL: @test_vreinterpret_u32_p8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_u32_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
uint32x2_t test_vreinterpret_u32_p8(poly8x8_t a) {
return vreinterpret_u32_p8(a);
}
-// CHECK-LABEL: @test_vreinterpret_u32_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_u32_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
uint32x2_t test_vreinterpret_u32_p16(poly16x4_t a) {
return vreinterpret_u32_p16(a);
}
-// CHECK-LABEL: @test_vreinterpret_u32_p64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_u32_p64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
uint32x2_t test_vreinterpret_u32_p64(poly64x1_t a) {
return vreinterpret_u32_p64(a);
}
-// CHECK-LABEL: @test_vreinterpret_u64_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_u64_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
uint64x1_t test_vreinterpret_u64_s8(int8x8_t a) {
return vreinterpret_u64_s8(a);
}
-// CHECK-LABEL: @test_vreinterpret_u64_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_u64_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
uint64x1_t test_vreinterpret_u64_s16(int16x4_t a) {
return vreinterpret_u64_s16(a);
}
-// CHECK-LABEL: @test_vreinterpret_u64_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_u64_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
uint64x1_t test_vreinterpret_u64_s32(int32x2_t a) {
return vreinterpret_u64_s32(a);
}
-// CHECK-LABEL: @test_vreinterpret_u64_s64(
-// CHECK: ret <1 x i64> %a
+// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_u64_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <1 x i64> [[A]]
+//
uint64x1_t test_vreinterpret_u64_s64(int64x1_t a) {
return vreinterpret_u64_s64(a);
}
-// CHECK-LABEL: @test_vreinterpret_u64_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_u64_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
uint64x1_t test_vreinterpret_u64_u8(uint8x8_t a) {
return vreinterpret_u64_u8(a);
}
-// CHECK-LABEL: @test_vreinterpret_u64_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_u64_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
uint64x1_t test_vreinterpret_u64_u16(uint16x4_t a) {
return vreinterpret_u64_u16(a);
}
-// CHECK-LABEL: @test_vreinterpret_u64_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_u64_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
uint64x1_t test_vreinterpret_u64_u32(uint32x2_t a) {
return vreinterpret_u64_u32(a);
}
-// CHECK-LABEL: @test_vreinterpret_u64_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_u64_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
uint64x1_t test_vreinterpret_u64_f16(float16x4_t a) {
return vreinterpret_u64_f16(a);
}
-// CHECK-LABEL: @test_vreinterpret_u64_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_u64_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
uint64x1_t test_vreinterpret_u64_f32(float32x2_t a) {
return vreinterpret_u64_f32(a);
}
-// CHECK-LABEL: @test_vreinterpret_u64_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_u64_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
uint64x1_t test_vreinterpret_u64_f64(float64x1_t a) {
return vreinterpret_u64_f64(a);
}
-// CHECK-LABEL: @test_vreinterpret_u64_p8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_u64_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
uint64x1_t test_vreinterpret_u64_p8(poly8x8_t a) {
return vreinterpret_u64_p8(a);
}
-// CHECK-LABEL: @test_vreinterpret_u64_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_u64_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
uint64x1_t test_vreinterpret_u64_p16(poly16x4_t a) {
return vreinterpret_u64_p16(a);
}
-// CHECK-LABEL: @test_vreinterpret_u64_p64(
-// CHECK: ret <1 x i64> %a
+// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_u64_p64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <1 x i64> [[A]]
+//
uint64x1_t test_vreinterpret_u64_p64(poly64x1_t a) {
return vreinterpret_u64_p64(a);
}
-// CHECK-LABEL: @test_vreinterpret_f16_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x half>
-// CHECK: ret <4 x half> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x half> @test_vreinterpret_f16_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x half>
+// CHECK-NEXT: ret <4 x half> [[TMP0]]
+//
float16x4_t test_vreinterpret_f16_s8(int8x8_t a) {
return vreinterpret_f16_s8(a);
}
-// CHECK-LABEL: @test_vreinterpret_f16_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <4 x half>
-// CHECK: ret <4 x half> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x half> @test_vreinterpret_f16_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <4 x half>
+// CHECK-NEXT: ret <4 x half> [[TMP0]]
+//
float16x4_t test_vreinterpret_f16_s16(int16x4_t a) {
return vreinterpret_f16_s16(a);
}
-// CHECK-LABEL: @test_vreinterpret_f16_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x half>
-// CHECK: ret <4 x half> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x half> @test_vreinterpret_f16_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <4 x half>
+// CHECK-NEXT: ret <4 x half> [[TMP0]]
+//
float16x4_t test_vreinterpret_f16_s32(int32x2_t a) {
return vreinterpret_f16_s32(a);
}
-// CHECK-LABEL: @test_vreinterpret_f16_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x half>
-// CHECK: ret <4 x half> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x half> @test_vreinterpret_f16_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x half>
+// CHECK-NEXT: ret <4 x half> [[TMP0]]
+//
float16x4_t test_vreinterpret_f16_s64(int64x1_t a) {
return vreinterpret_f16_s64(a);
}
-// CHECK-LABEL: @test_vreinterpret_f16_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x half>
-// CHECK: ret <4 x half> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x half> @test_vreinterpret_f16_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x half>
+// CHECK-NEXT: ret <4 x half> [[TMP0]]
+//
float16x4_t test_vreinterpret_f16_u8(uint8x8_t a) {
return vreinterpret_f16_u8(a);
}
-// CHECK-LABEL: @test_vreinterpret_f16_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <4 x half>
-// CHECK: ret <4 x half> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x half> @test_vreinterpret_f16_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <4 x half>
+// CHECK-NEXT: ret <4 x half> [[TMP0]]
+//
float16x4_t test_vreinterpret_f16_u16(uint16x4_t a) {
return vreinterpret_f16_u16(a);
}
-// CHECK-LABEL: @test_vreinterpret_f16_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x half>
-// CHECK: ret <4 x half> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x half> @test_vreinterpret_f16_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <4 x half>
+// CHECK-NEXT: ret <4 x half> [[TMP0]]
+//
float16x4_t test_vreinterpret_f16_u32(uint32x2_t a) {
return vreinterpret_f16_u32(a);
}
-// CHECK-LABEL: @test_vreinterpret_f16_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x half>
-// CHECK: ret <4 x half> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x half> @test_vreinterpret_f16_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x half>
+// CHECK-NEXT: ret <4 x half> [[TMP0]]
+//
float16x4_t test_vreinterpret_f16_u64(uint64x1_t a) {
return vreinterpret_f16_u64(a);
}
-// CHECK-LABEL: @test_vreinterpret_f16_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <4 x half>
-// CHECK: ret <4 x half> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x half> @test_vreinterpret_f16_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <4 x half>
+// CHECK-NEXT: ret <4 x half> [[TMP0]]
+//
float16x4_t test_vreinterpret_f16_f32(float32x2_t a) {
return vreinterpret_f16_f32(a);
}
-// CHECK-LABEL: @test_vreinterpret_f16_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <4 x half>
-// CHECK: ret <4 x half> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x half> @test_vreinterpret_f16_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to <4 x half>
+// CHECK-NEXT: ret <4 x half> [[TMP0]]
+//
float16x4_t test_vreinterpret_f16_f64(float64x1_t a) {
return vreinterpret_f16_f64(a);
}
-// CHECK-LABEL: @test_vreinterpret_f16_p8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x half>
-// CHECK: ret <4 x half> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x half> @test_vreinterpret_f16_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x half>
+// CHECK-NEXT: ret <4 x half> [[TMP0]]
+//
float16x4_t test_vreinterpret_f16_p8(poly8x8_t a) {
return vreinterpret_f16_p8(a);
}
-// CHECK-LABEL: @test_vreinterpret_f16_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <4 x half>
-// CHECK: ret <4 x half> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x half> @test_vreinterpret_f16_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <4 x half>
+// CHECK-NEXT: ret <4 x half> [[TMP0]]
+//
float16x4_t test_vreinterpret_f16_p16(poly16x4_t a) {
return vreinterpret_f16_p16(a);
}
-// CHECK-LABEL: @test_vreinterpret_f16_p64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x half>
-// CHECK: ret <4 x half> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x half> @test_vreinterpret_f16_p64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x half>
+// CHECK-NEXT: ret <4 x half> [[TMP0]]
+//
float16x4_t test_vreinterpret_f16_p64(poly64x1_t a) {
return vreinterpret_f16_p64(a);
}
-// CHECK-LABEL: @test_vreinterpret_f32_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x float>
-// CHECK: ret <2 x float> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vreinterpret_f32_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[TMP0]]
+//
float32x2_t test_vreinterpret_f32_s8(int8x8_t a) {
return vreinterpret_f32_s8(a);
}
-// CHECK-LABEL: @test_vreinterpret_f32_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x float>
-// CHECK: ret <2 x float> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vreinterpret_f32_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[TMP0]]
+//
float32x2_t test_vreinterpret_f32_s16(int16x4_t a) {
return vreinterpret_f32_s16(a);
}
-// CHECK-LABEL: @test_vreinterpret_f32_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <2 x float>
-// CHECK: ret <2 x float> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vreinterpret_f32_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[TMP0]]
+//
float32x2_t test_vreinterpret_f32_s32(int32x2_t a) {
return vreinterpret_f32_s32(a);
}
-// CHECK-LABEL: @test_vreinterpret_f32_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x float>
-// CHECK: ret <2 x float> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vreinterpret_f32_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[TMP0]]
+//
float32x2_t test_vreinterpret_f32_s64(int64x1_t a) {
return vreinterpret_f32_s64(a);
}
-// CHECK-LABEL: @test_vreinterpret_f32_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x float>
-// CHECK: ret <2 x float> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vreinterpret_f32_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[TMP0]]
+//
float32x2_t test_vreinterpret_f32_u8(uint8x8_t a) {
return vreinterpret_f32_u8(a);
}
-// CHECK-LABEL: @test_vreinterpret_f32_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x float>
-// CHECK: ret <2 x float> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vreinterpret_f32_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[TMP0]]
+//
float32x2_t test_vreinterpret_f32_u16(uint16x4_t a) {
return vreinterpret_f32_u16(a);
}
-// CHECK-LABEL: @test_vreinterpret_f32_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <2 x float>
-// CHECK: ret <2 x float> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vreinterpret_f32_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[TMP0]]
+//
float32x2_t test_vreinterpret_f32_u32(uint32x2_t a) {
return vreinterpret_f32_u32(a);
}
-// CHECK-LABEL: @test_vreinterpret_f32_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x float>
-// CHECK: ret <2 x float> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vreinterpret_f32_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[TMP0]]
+//
float32x2_t test_vreinterpret_f32_u64(uint64x1_t a) {
return vreinterpret_f32_u64(a);
}
-// CHECK-LABEL: @test_vreinterpret_f32_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <2 x float>
-// CHECK: ret <2 x float> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vreinterpret_f32_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[TMP0]]
+//
float32x2_t test_vreinterpret_f32_f16(float16x4_t a) {
return vreinterpret_f32_f16(a);
}
-// CHECK-LABEL: @test_vreinterpret_f32_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <2 x float>
-// CHECK: ret <2 x float> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vreinterpret_f32_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[TMP0]]
+//
float32x2_t test_vreinterpret_f32_f64(float64x1_t a) {
return vreinterpret_f32_f64(a);
}
-// CHECK-LABEL: @test_vreinterpret_f32_p8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x float>
-// CHECK: ret <2 x float> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vreinterpret_f32_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[TMP0]]
+//
float32x2_t test_vreinterpret_f32_p8(poly8x8_t a) {
return vreinterpret_f32_p8(a);
}
-// CHECK-LABEL: @test_vreinterpret_f32_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x float>
-// CHECK: ret <2 x float> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vreinterpret_f32_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[TMP0]]
+//
float32x2_t test_vreinterpret_f32_p16(poly16x4_t a) {
return vreinterpret_f32_p16(a);
}
-// CHECK-LABEL: @test_vreinterpret_f32_p64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x float>
-// CHECK: ret <2 x float> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vreinterpret_f32_p64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[TMP0]]
+//
float32x2_t test_vreinterpret_f32_p64(poly64x1_t a) {
return vreinterpret_f32_p64(a);
}
-// CHECK-LABEL: @test_vreinterpret_f64_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x double>
-// CHECK: ret <1 x double> [[TMP0]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vreinterpret_f64_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <1 x double>
+// CHECK-NEXT: ret <1 x double> [[TMP0]]
+//
float64x1_t test_vreinterpret_f64_s8(int8x8_t a) {
return vreinterpret_f64_s8(a);
}
-// CHECK-LABEL: @test_vreinterpret_f64_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x double>
-// CHECK: ret <1 x double> [[TMP0]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vreinterpret_f64_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <1 x double>
+// CHECK-NEXT: ret <1 x double> [[TMP0]]
+//
float64x1_t test_vreinterpret_f64_s16(int16x4_t a) {
return vreinterpret_f64_s16(a);
}
-// CHECK-LABEL: @test_vreinterpret_f64_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x double>
-// CHECK: ret <1 x double> [[TMP0]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vreinterpret_f64_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <1 x double>
+// CHECK-NEXT: ret <1 x double> [[TMP0]]
+//
float64x1_t test_vreinterpret_f64_s32(int32x2_t a) {
return vreinterpret_f64_s32(a);
}
-// CHECK-LABEL: @test_vreinterpret_f64_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <1 x double>
-// CHECK: ret <1 x double> [[TMP0]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vreinterpret_f64_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <1 x double>
+// CHECK-NEXT: ret <1 x double> [[TMP0]]
+//
float64x1_t test_vreinterpret_f64_s64(int64x1_t a) {
return vreinterpret_f64_s64(a);
}
-// CHECK-LABEL: @test_vreinterpret_f64_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x double>
-// CHECK: ret <1 x double> [[TMP0]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vreinterpret_f64_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <1 x double>
+// CHECK-NEXT: ret <1 x double> [[TMP0]]
+//
float64x1_t test_vreinterpret_f64_u8(uint8x8_t a) {
return vreinterpret_f64_u8(a);
}
-// CHECK-LABEL: @test_vreinterpret_f64_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x double>
-// CHECK: ret <1 x double> [[TMP0]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vreinterpret_f64_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <1 x double>
+// CHECK-NEXT: ret <1 x double> [[TMP0]]
+//
float64x1_t test_vreinterpret_f64_u16(uint16x4_t a) {
return vreinterpret_f64_u16(a);
}
-// CHECK-LABEL: @test_vreinterpret_f64_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x double>
-// CHECK: ret <1 x double> [[TMP0]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vreinterpret_f64_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <1 x double>
+// CHECK-NEXT: ret <1 x double> [[TMP0]]
+//
float64x1_t test_vreinterpret_f64_u32(uint32x2_t a) {
return vreinterpret_f64_u32(a);
}
-// CHECK-LABEL: @test_vreinterpret_f64_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <1 x double>
-// CHECK: ret <1 x double> [[TMP0]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vreinterpret_f64_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <1 x double>
+// CHECK-NEXT: ret <1 x double> [[TMP0]]
+//
float64x1_t test_vreinterpret_f64_u64(uint64x1_t a) {
return vreinterpret_f64_u64(a);
}
-// CHECK-LABEL: @test_vreinterpret_f64_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <1 x double>
-// CHECK: ret <1 x double> [[TMP0]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vreinterpret_f64_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <1 x double>
+// CHECK-NEXT: ret <1 x double> [[TMP0]]
+//
float64x1_t test_vreinterpret_f64_f16(float16x4_t a) {
return vreinterpret_f64_f16(a);
}
-// CHECK-LABEL: @test_vreinterpret_f64_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <1 x double>
-// CHECK: ret <1 x double> [[TMP0]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vreinterpret_f64_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <1 x double>
+// CHECK-NEXT: ret <1 x double> [[TMP0]]
+//
float64x1_t test_vreinterpret_f64_f32(float32x2_t a) {
return vreinterpret_f64_f32(a);
}
-// CHECK-LABEL: @test_vreinterpret_f64_p8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x double>
-// CHECK: ret <1 x double> [[TMP0]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vreinterpret_f64_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <1 x double>
+// CHECK-NEXT: ret <1 x double> [[TMP0]]
+//
float64x1_t test_vreinterpret_f64_p8(poly8x8_t a) {
return vreinterpret_f64_p8(a);
}
-// CHECK-LABEL: @test_vreinterpret_f64_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x double>
-// CHECK: ret <1 x double> [[TMP0]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vreinterpret_f64_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <1 x double>
+// CHECK-NEXT: ret <1 x double> [[TMP0]]
+//
float64x1_t test_vreinterpret_f64_p16(poly16x4_t a) {
return vreinterpret_f64_p16(a);
}
-// CHECK-LABEL: @test_vreinterpret_f64_p64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <1 x double>
-// CHECK: ret <1 x double> [[TMP0]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vreinterpret_f64_p64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <1 x double>
+// CHECK-NEXT: ret <1 x double> [[TMP0]]
+//
float64x1_t test_vreinterpret_f64_p64(poly64x1_t a) {
return vreinterpret_f64_p64(a);
}
-// CHECK-LABEL: @test_vreinterpret_p8_s8(
-// CHECK: ret <8 x i8> %a
+// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_p8_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <8 x i8> [[A]]
+//
poly8x8_t test_vreinterpret_p8_s8(int8x8_t a) {
return vreinterpret_p8_s8(a);
}
-// CHECK-LABEL: @test_vreinterpret_p8_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_p8_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
poly8x8_t test_vreinterpret_p8_s16(int16x4_t a) {
return vreinterpret_p8_s16(a);
}
-// CHECK-LABEL: @test_vreinterpret_p8_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_p8_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
poly8x8_t test_vreinterpret_p8_s32(int32x2_t a) {
return vreinterpret_p8_s32(a);
}
-// CHECK-LABEL: @test_vreinterpret_p8_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_p8_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
poly8x8_t test_vreinterpret_p8_s64(int64x1_t a) {
return vreinterpret_p8_s64(a);
}
-// CHECK-LABEL: @test_vreinterpret_p8_u8(
-// CHECK: ret <8 x i8> %a
+// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_p8_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <8 x i8> [[A]]
+//
poly8x8_t test_vreinterpret_p8_u8(uint8x8_t a) {
return vreinterpret_p8_u8(a);
}
-// CHECK-LABEL: @test_vreinterpret_p8_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_p8_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
poly8x8_t test_vreinterpret_p8_u16(uint16x4_t a) {
return vreinterpret_p8_u16(a);
}
-// CHECK-LABEL: @test_vreinterpret_p8_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_p8_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
poly8x8_t test_vreinterpret_p8_u32(uint32x2_t a) {
return vreinterpret_p8_u32(a);
}
-// CHECK-LABEL: @test_vreinterpret_p8_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_p8_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
poly8x8_t test_vreinterpret_p8_u64(uint64x1_t a) {
return vreinterpret_p8_u64(a);
}
-// CHECK-LABEL: @test_vreinterpret_p8_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_p8_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
poly8x8_t test_vreinterpret_p8_f16(float16x4_t a) {
return vreinterpret_p8_f16(a);
}
-// CHECK-LABEL: @test_vreinterpret_p8_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_p8_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
poly8x8_t test_vreinterpret_p8_f32(float32x2_t a) {
return vreinterpret_p8_f32(a);
}
-// CHECK-LABEL: @test_vreinterpret_p8_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_p8_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
poly8x8_t test_vreinterpret_p8_f64(float64x1_t a) {
return vreinterpret_p8_f64(a);
}
-// CHECK-LABEL: @test_vreinterpret_p8_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_p8_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
poly8x8_t test_vreinterpret_p8_p16(poly16x4_t a) {
return vreinterpret_p8_p16(a);
}
-// CHECK-LABEL: @test_vreinterpret_p8_p64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_p8_p64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
poly8x8_t test_vreinterpret_p8_p64(poly64x1_t a) {
return vreinterpret_p8_p64(a);
}
-// CHECK-LABEL: @test_vreinterpret_p16_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_p16_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
poly16x4_t test_vreinterpret_p16_s8(int8x8_t a) {
return vreinterpret_p16_s8(a);
}
-// CHECK-LABEL: @test_vreinterpret_p16_s16(
-// CHECK: ret <4 x i16> %a
+// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_p16_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <4 x i16> [[A]]
+//
poly16x4_t test_vreinterpret_p16_s16(int16x4_t a) {
return vreinterpret_p16_s16(a);
}
-// CHECK-LABEL: @test_vreinterpret_p16_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_p16_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
poly16x4_t test_vreinterpret_p16_s32(int32x2_t a) {
return vreinterpret_p16_s32(a);
}
-// CHECK-LABEL: @test_vreinterpret_p16_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_p16_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
poly16x4_t test_vreinterpret_p16_s64(int64x1_t a) {
return vreinterpret_p16_s64(a);
}
-// CHECK-LABEL: @test_vreinterpret_p16_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_p16_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
poly16x4_t test_vreinterpret_p16_u8(uint8x8_t a) {
return vreinterpret_p16_u8(a);
}
-// CHECK-LABEL: @test_vreinterpret_p16_u16(
-// CHECK: ret <4 x i16> %a
+// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_p16_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <4 x i16> [[A]]
+//
poly16x4_t test_vreinterpret_p16_u16(uint16x4_t a) {
return vreinterpret_p16_u16(a);
}
-// CHECK-LABEL: @test_vreinterpret_p16_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_p16_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
poly16x4_t test_vreinterpret_p16_u32(uint32x2_t a) {
return vreinterpret_p16_u32(a);
}
-// CHECK-LABEL: @test_vreinterpret_p16_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_p16_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
poly16x4_t test_vreinterpret_p16_u64(uint64x1_t a) {
return vreinterpret_p16_u64(a);
}
-// CHECK-LABEL: @test_vreinterpret_p16_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_p16_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
poly16x4_t test_vreinterpret_p16_f16(float16x4_t a) {
return vreinterpret_p16_f16(a);
}
-// CHECK-LABEL: @test_vreinterpret_p16_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_p16_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
poly16x4_t test_vreinterpret_p16_f32(float32x2_t a) {
return vreinterpret_p16_f32(a);
}
-// CHECK-LABEL: @test_vreinterpret_p16_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_p16_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
poly16x4_t test_vreinterpret_p16_f64(float64x1_t a) {
return vreinterpret_p16_f64(a);
}
-// CHECK-LABEL: @test_vreinterpret_p16_p8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_p16_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
poly16x4_t test_vreinterpret_p16_p8(poly8x8_t a) {
return vreinterpret_p16_p8(a);
}
-// CHECK-LABEL: @test_vreinterpret_p16_p64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_p16_p64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
poly16x4_t test_vreinterpret_p16_p64(poly64x1_t a) {
return vreinterpret_p16_p64(a);
}
-// CHECK-LABEL: @test_vreinterpret_p64_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_p64_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
poly64x1_t test_vreinterpret_p64_s8(int8x8_t a) {
return vreinterpret_p64_s8(a);
}
-// CHECK-LABEL: @test_vreinterpret_p64_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_p64_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
poly64x1_t test_vreinterpret_p64_s16(int16x4_t a) {
return vreinterpret_p64_s16(a);
}
-// CHECK-LABEL: @test_vreinterpret_p64_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_p64_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
poly64x1_t test_vreinterpret_p64_s32(int32x2_t a) {
return vreinterpret_p64_s32(a);
}
-// CHECK-LABEL: @test_vreinterpret_p64_s64(
-// CHECK: ret <1 x i64> %a
+// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_p64_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <1 x i64> [[A]]
+//
poly64x1_t test_vreinterpret_p64_s64(int64x1_t a) {
return vreinterpret_p64_s64(a);
}
-// CHECK-LABEL: @test_vreinterpret_p64_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_p64_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
poly64x1_t test_vreinterpret_p64_u8(uint8x8_t a) {
return vreinterpret_p64_u8(a);
}
-// CHECK-LABEL: @test_vreinterpret_p64_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_p64_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
poly64x1_t test_vreinterpret_p64_u16(uint16x4_t a) {
return vreinterpret_p64_u16(a);
}
-// CHECK-LABEL: @test_vreinterpret_p64_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_p64_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
poly64x1_t test_vreinterpret_p64_u32(uint32x2_t a) {
return vreinterpret_p64_u32(a);
}
-// CHECK-LABEL: @test_vreinterpret_p64_u64(
-// CHECK: ret <1 x i64> %a
+// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_p64_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <1 x i64> [[A]]
+//
poly64x1_t test_vreinterpret_p64_u64(uint64x1_t a) {
return vreinterpret_p64_u64(a);
}
-// CHECK-LABEL: @test_vreinterpret_p64_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_p64_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
poly64x1_t test_vreinterpret_p64_f16(float16x4_t a) {
return vreinterpret_p64_f16(a);
}
-// CHECK-LABEL: @test_vreinterpret_p64_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_p64_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
poly64x1_t test_vreinterpret_p64_f32(float32x2_t a) {
return vreinterpret_p64_f32(a);
}
-// CHECK-LABEL: @test_vreinterpret_p64_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_p64_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
poly64x1_t test_vreinterpret_p64_f64(float64x1_t a) {
return vreinterpret_p64_f64(a);
}
-// CHECK-LABEL: @test_vreinterpret_p64_p8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_p64_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
poly64x1_t test_vreinterpret_p64_p8(poly8x8_t a) {
return vreinterpret_p64_p8(a);
}
-// CHECK-LABEL: @test_vreinterpret_p64_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_p64_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
poly64x1_t test_vreinterpret_p64_p16(poly16x4_t a) {
return vreinterpret_p64_p16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s8_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_s8_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
int8x16_t test_vreinterpretq_s8_s16(int16x8_t a) {
return vreinterpretq_s8_s16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s8_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_s8_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
int8x16_t test_vreinterpretq_s8_s32(int32x4_t a) {
return vreinterpretq_s8_s32(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s8_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_s8_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
int8x16_t test_vreinterpretq_s8_s64(int64x2_t a) {
return vreinterpretq_s8_s64(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s8_u8(
-// CHECK: ret <16 x i8> %a
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_s8_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <16 x i8> [[A]]
+//
int8x16_t test_vreinterpretq_s8_u8(uint8x16_t a) {
return vreinterpretq_s8_u8(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s8_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_s8_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
int8x16_t test_vreinterpretq_s8_u16(uint16x8_t a) {
return vreinterpretq_s8_u16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s8_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_s8_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
int8x16_t test_vreinterpretq_s8_u32(uint32x4_t a) {
return vreinterpretq_s8_u32(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s8_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_s8_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
int8x16_t test_vreinterpretq_s8_u64(uint64x2_t a) {
return vreinterpretq_s8_u64(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s8_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_s8_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
int8x16_t test_vreinterpretq_s8_f16(float16x8_t a) {
return vreinterpretq_s8_f16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s8_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_s8_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
int8x16_t test_vreinterpretq_s8_f32(float32x4_t a) {
return vreinterpretq_s8_f32(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s8_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_s8_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
int8x16_t test_vreinterpretq_s8_f64(float64x2_t a) {
return vreinterpretq_s8_f64(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s8_p8(
-// CHECK: ret <16 x i8> %a
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_s8_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <16 x i8> [[A]]
+//
int8x16_t test_vreinterpretq_s8_p8(poly8x16_t a) {
return vreinterpretq_s8_p8(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s8_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_s8_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
int8x16_t test_vreinterpretq_s8_p16(poly16x8_t a) {
return vreinterpretq_s8_p16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s8_p64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_s8_p64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
int8x16_t test_vreinterpretq_s8_p64(poly64x2_t a) {
return vreinterpretq_s8_p64(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s16_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_s16_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
int16x8_t test_vreinterpretq_s16_s8(int8x16_t a) {
return vreinterpretq_s16_s8(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s16_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_s16_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
int16x8_t test_vreinterpretq_s16_s32(int32x4_t a) {
return vreinterpretq_s16_s32(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s16_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_s16_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
int16x8_t test_vreinterpretq_s16_s64(int64x2_t a) {
return vreinterpretq_s16_s64(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s16_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_s16_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
int16x8_t test_vreinterpretq_s16_u8(uint8x16_t a) {
return vreinterpretq_s16_u8(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s16_u16(
-// CHECK: ret <8 x i16> %a
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_s16_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <8 x i16> [[A]]
+//
int16x8_t test_vreinterpretq_s16_u16(uint16x8_t a) {
return vreinterpretq_s16_u16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s16_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_s16_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
int16x8_t test_vreinterpretq_s16_u32(uint32x4_t a) {
return vreinterpretq_s16_u32(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s16_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_s16_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
int16x8_t test_vreinterpretq_s16_u64(uint64x2_t a) {
return vreinterpretq_s16_u64(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s16_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_s16_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
int16x8_t test_vreinterpretq_s16_f16(float16x8_t a) {
return vreinterpretq_s16_f16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s16_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_s16_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
int16x8_t test_vreinterpretq_s16_f32(float32x4_t a) {
return vreinterpretq_s16_f32(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s16_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_s16_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
int16x8_t test_vreinterpretq_s16_f64(float64x2_t a) {
return vreinterpretq_s16_f64(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s16_p8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_s16_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
int16x8_t test_vreinterpretq_s16_p8(poly8x16_t a) {
return vreinterpretq_s16_p8(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s16_p16(
-// CHECK: ret <8 x i16> %a
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_s16_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <8 x i16> [[A]]
+//
int16x8_t test_vreinterpretq_s16_p16(poly16x8_t a) {
return vreinterpretq_s16_p16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s16_p64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_s16_p64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
int16x8_t test_vreinterpretq_s16_p64(poly64x2_t a) {
return vreinterpretq_s16_p64(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s32_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_s32_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
int32x4_t test_vreinterpretq_s32_s8(int8x16_t a) {
return vreinterpretq_s32_s8(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s32_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_s32_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
int32x4_t test_vreinterpretq_s32_s16(int16x8_t a) {
return vreinterpretq_s32_s16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s32_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_s32_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
int32x4_t test_vreinterpretq_s32_s64(int64x2_t a) {
return vreinterpretq_s32_s64(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s32_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_s32_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
int32x4_t test_vreinterpretq_s32_u8(uint8x16_t a) {
return vreinterpretq_s32_u8(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s32_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_s32_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
int32x4_t test_vreinterpretq_s32_u16(uint16x8_t a) {
return vreinterpretq_s32_u16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s32_u32(
-// CHECK: ret <4 x i32> %a
+// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_s32_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <4 x i32> [[A]]
+//
int32x4_t test_vreinterpretq_s32_u32(uint32x4_t a) {
return vreinterpretq_s32_u32(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s32_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_s32_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
int32x4_t test_vreinterpretq_s32_u64(uint64x2_t a) {
return vreinterpretq_s32_u64(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s32_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_s32_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
int32x4_t test_vreinterpretq_s32_f16(float16x8_t a) {
return vreinterpretq_s32_f16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s32_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_s32_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
int32x4_t test_vreinterpretq_s32_f32(float32x4_t a) {
return vreinterpretq_s32_f32(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s32_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_s32_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
int32x4_t test_vreinterpretq_s32_f64(float64x2_t a) {
return vreinterpretq_s32_f64(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s32_p8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_s32_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
int32x4_t test_vreinterpretq_s32_p8(poly8x16_t a) {
return vreinterpretq_s32_p8(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s32_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_s32_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
int32x4_t test_vreinterpretq_s32_p16(poly16x8_t a) {
return vreinterpretq_s32_p16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s32_p64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_s32_p64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
int32x4_t test_vreinterpretq_s32_p64(poly64x2_t a) {
return vreinterpretq_s32_p64(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s64_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_s64_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
int64x2_t test_vreinterpretq_s64_s8(int8x16_t a) {
return vreinterpretq_s64_s8(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s64_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_s64_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
int64x2_t test_vreinterpretq_s64_s16(int16x8_t a) {
return vreinterpretq_s64_s16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s64_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_s64_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
int64x2_t test_vreinterpretq_s64_s32(int32x4_t a) {
return vreinterpretq_s64_s32(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s64_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_s64_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
int64x2_t test_vreinterpretq_s64_u8(uint8x16_t a) {
return vreinterpretq_s64_u8(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s64_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_s64_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
int64x2_t test_vreinterpretq_s64_u16(uint16x8_t a) {
return vreinterpretq_s64_u16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s64_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_s64_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
int64x2_t test_vreinterpretq_s64_u32(uint32x4_t a) {
return vreinterpretq_s64_u32(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s64_u64(
-// CHECK: ret <2 x i64> %a
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_s64_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <2 x i64> [[A]]
+//
int64x2_t test_vreinterpretq_s64_u64(uint64x2_t a) {
return vreinterpretq_s64_u64(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s64_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_s64_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
int64x2_t test_vreinterpretq_s64_f16(float16x8_t a) {
return vreinterpretq_s64_f16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s64_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_s64_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
int64x2_t test_vreinterpretq_s64_f32(float32x4_t a) {
return vreinterpretq_s64_f32(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s64_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_s64_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
int64x2_t test_vreinterpretq_s64_f64(float64x2_t a) {
return vreinterpretq_s64_f64(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s64_p8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_s64_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
int64x2_t test_vreinterpretq_s64_p8(poly8x16_t a) {
return vreinterpretq_s64_p8(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s64_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_s64_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
int64x2_t test_vreinterpretq_s64_p16(poly16x8_t a) {
return vreinterpretq_s64_p16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_s64_p64(
-// CHECK: ret <2 x i64> %a
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_s64_p64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <2 x i64> [[A]]
+//
int64x2_t test_vreinterpretq_s64_p64(poly64x2_t a) {
return vreinterpretq_s64_p64(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u8_s8(
-// CHECK: ret <16 x i8> %a
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_u8_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <16 x i8> [[A]]
+//
uint8x16_t test_vreinterpretq_u8_s8(int8x16_t a) {
return vreinterpretq_u8_s8(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u8_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_u8_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
uint8x16_t test_vreinterpretq_u8_s16(int16x8_t a) {
return vreinterpretq_u8_s16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u8_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_u8_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
uint8x16_t test_vreinterpretq_u8_s32(int32x4_t a) {
return vreinterpretq_u8_s32(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u8_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_u8_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
uint8x16_t test_vreinterpretq_u8_s64(int64x2_t a) {
return vreinterpretq_u8_s64(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u8_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_u8_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
uint8x16_t test_vreinterpretq_u8_u16(uint16x8_t a) {
return vreinterpretq_u8_u16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u8_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_u8_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
uint8x16_t test_vreinterpretq_u8_u32(uint32x4_t a) {
return vreinterpretq_u8_u32(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u8_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_u8_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
uint8x16_t test_vreinterpretq_u8_u64(uint64x2_t a) {
return vreinterpretq_u8_u64(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u8_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_u8_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
uint8x16_t test_vreinterpretq_u8_f16(float16x8_t a) {
return vreinterpretq_u8_f16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u8_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_u8_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
uint8x16_t test_vreinterpretq_u8_f32(float32x4_t a) {
return vreinterpretq_u8_f32(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u8_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_u8_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
uint8x16_t test_vreinterpretq_u8_f64(float64x2_t a) {
return vreinterpretq_u8_f64(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u8_p8(
-// CHECK: ret <16 x i8> %a
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_u8_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <16 x i8> [[A]]
+//
uint8x16_t test_vreinterpretq_u8_p8(poly8x16_t a) {
return vreinterpretq_u8_p8(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u8_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_u8_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
uint8x16_t test_vreinterpretq_u8_p16(poly16x8_t a) {
return vreinterpretq_u8_p16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u8_p64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_u8_p64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
uint8x16_t test_vreinterpretq_u8_p64(poly64x2_t a) {
return vreinterpretq_u8_p64(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u16_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_u16_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
uint16x8_t test_vreinterpretq_u16_s8(int8x16_t a) {
return vreinterpretq_u16_s8(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u16_s16(
-// CHECK: ret <8 x i16> %a
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_u16_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <8 x i16> [[A]]
+//
uint16x8_t test_vreinterpretq_u16_s16(int16x8_t a) {
return vreinterpretq_u16_s16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u16_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_u16_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
uint16x8_t test_vreinterpretq_u16_s32(int32x4_t a) {
return vreinterpretq_u16_s32(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u16_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_u16_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
uint16x8_t test_vreinterpretq_u16_s64(int64x2_t a) {
return vreinterpretq_u16_s64(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u16_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_u16_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
uint16x8_t test_vreinterpretq_u16_u8(uint8x16_t a) {
return vreinterpretq_u16_u8(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u16_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_u16_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
uint16x8_t test_vreinterpretq_u16_u32(uint32x4_t a) {
return vreinterpretq_u16_u32(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u16_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_u16_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
uint16x8_t test_vreinterpretq_u16_u64(uint64x2_t a) {
return vreinterpretq_u16_u64(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u16_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_u16_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
uint16x8_t test_vreinterpretq_u16_f16(float16x8_t a) {
return vreinterpretq_u16_f16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u16_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_u16_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
uint16x8_t test_vreinterpretq_u16_f32(float32x4_t a) {
return vreinterpretq_u16_f32(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u16_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_u16_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
uint16x8_t test_vreinterpretq_u16_f64(float64x2_t a) {
return vreinterpretq_u16_f64(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u16_p8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_u16_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
uint16x8_t test_vreinterpretq_u16_p8(poly8x16_t a) {
return vreinterpretq_u16_p8(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u16_p16(
-// CHECK: ret <8 x i16> %a
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_u16_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <8 x i16> [[A]]
+//
uint16x8_t test_vreinterpretq_u16_p16(poly16x8_t a) {
return vreinterpretq_u16_p16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u16_p64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_u16_p64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
uint16x8_t test_vreinterpretq_u16_p64(poly64x2_t a) {
return vreinterpretq_u16_p64(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u32_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_u32_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
uint32x4_t test_vreinterpretq_u32_s8(int8x16_t a) {
return vreinterpretq_u32_s8(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u32_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_u32_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
uint32x4_t test_vreinterpretq_u32_s16(int16x8_t a) {
return vreinterpretq_u32_s16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u32_s32(
-// CHECK: ret <4 x i32> %a
+// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_u32_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <4 x i32> [[A]]
+//
uint32x4_t test_vreinterpretq_u32_s32(int32x4_t a) {
return vreinterpretq_u32_s32(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u32_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_u32_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
uint32x4_t test_vreinterpretq_u32_s64(int64x2_t a) {
return vreinterpretq_u32_s64(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u32_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_u32_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
uint32x4_t test_vreinterpretq_u32_u8(uint8x16_t a) {
return vreinterpretq_u32_u8(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u32_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_u32_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
uint32x4_t test_vreinterpretq_u32_u16(uint16x8_t a) {
return vreinterpretq_u32_u16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u32_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_u32_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
uint32x4_t test_vreinterpretq_u32_u64(uint64x2_t a) {
return vreinterpretq_u32_u64(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u32_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_u32_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
uint32x4_t test_vreinterpretq_u32_f16(float16x8_t a) {
return vreinterpretq_u32_f16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u32_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_u32_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
uint32x4_t test_vreinterpretq_u32_f32(float32x4_t a) {
return vreinterpretq_u32_f32(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u32_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_u32_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
uint32x4_t test_vreinterpretq_u32_f64(float64x2_t a) {
return vreinterpretq_u32_f64(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u32_p8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_u32_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
uint32x4_t test_vreinterpretq_u32_p8(poly8x16_t a) {
return vreinterpretq_u32_p8(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u32_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_u32_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
uint32x4_t test_vreinterpretq_u32_p16(poly16x8_t a) {
return vreinterpretq_u32_p16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u32_p64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_u32_p64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
uint32x4_t test_vreinterpretq_u32_p64(poly64x2_t a) {
return vreinterpretq_u32_p64(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u64_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_u64_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
uint64x2_t test_vreinterpretq_u64_s8(int8x16_t a) {
return vreinterpretq_u64_s8(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u64_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_u64_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
uint64x2_t test_vreinterpretq_u64_s16(int16x8_t a) {
return vreinterpretq_u64_s16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u64_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_u64_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
uint64x2_t test_vreinterpretq_u64_s32(int32x4_t a) {
return vreinterpretq_u64_s32(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u64_s64(
-// CHECK: ret <2 x i64> %a
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_u64_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <2 x i64> [[A]]
+//
uint64x2_t test_vreinterpretq_u64_s64(int64x2_t a) {
return vreinterpretq_u64_s64(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u64_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_u64_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
uint64x2_t test_vreinterpretq_u64_u8(uint8x16_t a) {
return vreinterpretq_u64_u8(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u64_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_u64_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
uint64x2_t test_vreinterpretq_u64_u16(uint16x8_t a) {
return vreinterpretq_u64_u16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u64_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_u64_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
uint64x2_t test_vreinterpretq_u64_u32(uint32x4_t a) {
return vreinterpretq_u64_u32(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u64_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_u64_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
uint64x2_t test_vreinterpretq_u64_f16(float16x8_t a) {
return vreinterpretq_u64_f16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u64_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_u64_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
uint64x2_t test_vreinterpretq_u64_f32(float32x4_t a) {
return vreinterpretq_u64_f32(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u64_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_u64_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
uint64x2_t test_vreinterpretq_u64_f64(float64x2_t a) {
return vreinterpretq_u64_f64(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u64_p8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_u64_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
uint64x2_t test_vreinterpretq_u64_p8(poly8x16_t a) {
return vreinterpretq_u64_p8(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u64_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_u64_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
uint64x2_t test_vreinterpretq_u64_p16(poly16x8_t a) {
return vreinterpretq_u64_p16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_u64_p64(
-// CHECK: ret <2 x i64> %a
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_u64_p64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <2 x i64> [[A]]
+//
uint64x2_t test_vreinterpretq_u64_p64(poly64x2_t a) {
return vreinterpretq_u64_p64(a);
}
-// CHECK-LABEL: @test_vreinterpretq_f16_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x half>
-// CHECK: ret <8 x half> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vreinterpretq_f16_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x half>
+// CHECK-NEXT: ret <8 x half> [[TMP0]]
+//
float16x8_t test_vreinterpretq_f16_s8(int8x16_t a) {
return vreinterpretq_f16_s8(a);
}
-// CHECK-LABEL: @test_vreinterpretq_f16_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <8 x half>
-// CHECK: ret <8 x half> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vreinterpretq_f16_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <8 x half>
+// CHECK-NEXT: ret <8 x half> [[TMP0]]
+//
float16x8_t test_vreinterpretq_f16_s16(int16x8_t a) {
return vreinterpretq_f16_s16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_f16_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x half>
-// CHECK: ret <8 x half> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vreinterpretq_f16_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x half>
+// CHECK-NEXT: ret <8 x half> [[TMP0]]
+//
float16x8_t test_vreinterpretq_f16_s32(int32x4_t a) {
return vreinterpretq_f16_s32(a);
}
-// CHECK-LABEL: @test_vreinterpretq_f16_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x half>
-// CHECK: ret <8 x half> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vreinterpretq_f16_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <8 x half>
+// CHECK-NEXT: ret <8 x half> [[TMP0]]
+//
float16x8_t test_vreinterpretq_f16_s64(int64x2_t a) {
return vreinterpretq_f16_s64(a);
}
-// CHECK-LABEL: @test_vreinterpretq_f16_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x half>
-// CHECK: ret <8 x half> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vreinterpretq_f16_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x half>
+// CHECK-NEXT: ret <8 x half> [[TMP0]]
+//
float16x8_t test_vreinterpretq_f16_u8(uint8x16_t a) {
return vreinterpretq_f16_u8(a);
}
-// CHECK-LABEL: @test_vreinterpretq_f16_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <8 x half>
-// CHECK: ret <8 x half> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vreinterpretq_f16_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <8 x half>
+// CHECK-NEXT: ret <8 x half> [[TMP0]]
+//
float16x8_t test_vreinterpretq_f16_u16(uint16x8_t a) {
return vreinterpretq_f16_u16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_f16_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x half>
-// CHECK: ret <8 x half> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vreinterpretq_f16_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x half>
+// CHECK-NEXT: ret <8 x half> [[TMP0]]
+//
float16x8_t test_vreinterpretq_f16_u32(uint32x4_t a) {
return vreinterpretq_f16_u32(a);
}
-// CHECK-LABEL: @test_vreinterpretq_f16_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x half>
-// CHECK: ret <8 x half> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vreinterpretq_f16_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <8 x half>
+// CHECK-NEXT: ret <8 x half> [[TMP0]]
+//
float16x8_t test_vreinterpretq_f16_u64(uint64x2_t a) {
return vreinterpretq_f16_u64(a);
}
-// CHECK-LABEL: @test_vreinterpretq_f16_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <8 x half>
-// CHECK: ret <8 x half> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vreinterpretq_f16_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <8 x half>
+// CHECK-NEXT: ret <8 x half> [[TMP0]]
+//
float16x8_t test_vreinterpretq_f16_f32(float32x4_t a) {
return vreinterpretq_f16_f32(a);
}
-// CHECK-LABEL: @test_vreinterpretq_f16_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <8 x half>
-// CHECK: ret <8 x half> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vreinterpretq_f16_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <8 x half>
+// CHECK-NEXT: ret <8 x half> [[TMP0]]
+//
float16x8_t test_vreinterpretq_f16_f64(float64x2_t a) {
return vreinterpretq_f16_f64(a);
}
-// CHECK-LABEL: @test_vreinterpretq_f16_p8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x half>
-// CHECK: ret <8 x half> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vreinterpretq_f16_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x half>
+// CHECK-NEXT: ret <8 x half> [[TMP0]]
+//
float16x8_t test_vreinterpretq_f16_p8(poly8x16_t a) {
return vreinterpretq_f16_p8(a);
}
-// CHECK-LABEL: @test_vreinterpretq_f16_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <8 x half>
-// CHECK: ret <8 x half> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vreinterpretq_f16_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <8 x half>
+// CHECK-NEXT: ret <8 x half> [[TMP0]]
+//
float16x8_t test_vreinterpretq_f16_p16(poly16x8_t a) {
return vreinterpretq_f16_p16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_f16_p64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x half>
-// CHECK: ret <8 x half> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vreinterpretq_f16_p64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <8 x half>
+// CHECK-NEXT: ret <8 x half> [[TMP0]]
+//
float16x8_t test_vreinterpretq_f16_p64(poly64x2_t a) {
return vreinterpretq_f16_p64(a);
}
-// CHECK-LABEL: @test_vreinterpretq_f32_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x float>
-// CHECK: ret <4 x float> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vreinterpretq_f32_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[TMP0]]
+//
float32x4_t test_vreinterpretq_f32_s8(int8x16_t a) {
return vreinterpretq_f32_s8(a);
}
-// CHECK-LABEL: @test_vreinterpretq_f32_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x float>
-// CHECK: ret <4 x float> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vreinterpretq_f32_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[TMP0]]
+//
float32x4_t test_vreinterpretq_f32_s16(int16x8_t a) {
return vreinterpretq_f32_s16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_f32_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <4 x float>
-// CHECK: ret <4 x float> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vreinterpretq_f32_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[TMP0]]
+//
float32x4_t test_vreinterpretq_f32_s32(int32x4_t a) {
return vreinterpretq_f32_s32(a);
}
-// CHECK-LABEL: @test_vreinterpretq_f32_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x float>
-// CHECK: ret <4 x float> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vreinterpretq_f32_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[TMP0]]
+//
float32x4_t test_vreinterpretq_f32_s64(int64x2_t a) {
return vreinterpretq_f32_s64(a);
}
-// CHECK-LABEL: @test_vreinterpretq_f32_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x float>
-// CHECK: ret <4 x float> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vreinterpretq_f32_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[TMP0]]
+//
float32x4_t test_vreinterpretq_f32_u8(uint8x16_t a) {
return vreinterpretq_f32_u8(a);
}
-// CHECK-LABEL: @test_vreinterpretq_f32_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x float>
-// CHECK: ret <4 x float> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vreinterpretq_f32_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[TMP0]]
+//
float32x4_t test_vreinterpretq_f32_u16(uint16x8_t a) {
return vreinterpretq_f32_u16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_f32_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <4 x float>
-// CHECK: ret <4 x float> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vreinterpretq_f32_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[TMP0]]
+//
float32x4_t test_vreinterpretq_f32_u32(uint32x4_t a) {
return vreinterpretq_f32_u32(a);
}
-// CHECK-LABEL: @test_vreinterpretq_f32_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x float>
-// CHECK: ret <4 x float> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vreinterpretq_f32_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[TMP0]]
+//
float32x4_t test_vreinterpretq_f32_u64(uint64x2_t a) {
return vreinterpretq_f32_u64(a);
}
-// CHECK-LABEL: @test_vreinterpretq_f32_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <4 x float>
-// CHECK: ret <4 x float> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vreinterpretq_f32_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[TMP0]]
+//
float32x4_t test_vreinterpretq_f32_f16(float16x8_t a) {
return vreinterpretq_f32_f16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_f32_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <4 x float>
-// CHECK: ret <4 x float> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vreinterpretq_f32_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[TMP0]]
+//
float32x4_t test_vreinterpretq_f32_f64(float64x2_t a) {
return vreinterpretq_f32_f64(a);
}
-// CHECK-LABEL: @test_vreinterpretq_f32_p8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x float>
-// CHECK: ret <4 x float> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vreinterpretq_f32_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[TMP0]]
+//
float32x4_t test_vreinterpretq_f32_p8(poly8x16_t a) {
return vreinterpretq_f32_p8(a);
}
-// CHECK-LABEL: @test_vreinterpretq_f32_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x float>
-// CHECK: ret <4 x float> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vreinterpretq_f32_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[TMP0]]
+//
float32x4_t test_vreinterpretq_f32_p16(poly16x8_t a) {
return vreinterpretq_f32_p16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_f32_p64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x float>
-// CHECK: ret <4 x float> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vreinterpretq_f32_p64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[TMP0]]
+//
float32x4_t test_vreinterpretq_f32_p64(poly64x2_t a) {
return vreinterpretq_f32_p64(a);
}
-// CHECK-LABEL: @test_vreinterpretq_f64_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x double>
-// CHECK: ret <2 x double> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vreinterpretq_f64_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <2 x double>
+// CHECK-NEXT: ret <2 x double> [[TMP0]]
+//
float64x2_t test_vreinterpretq_f64_s8(int8x16_t a) {
return vreinterpretq_f64_s8(a);
}
-// CHECK-LABEL: @test_vreinterpretq_f64_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x double>
-// CHECK: ret <2 x double> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vreinterpretq_f64_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <2 x double>
+// CHECK-NEXT: ret <2 x double> [[TMP0]]
+//
float64x2_t test_vreinterpretq_f64_s16(int16x8_t a) {
return vreinterpretq_f64_s16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_f64_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x double>
-// CHECK: ret <2 x double> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vreinterpretq_f64_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <2 x double>
+// CHECK-NEXT: ret <2 x double> [[TMP0]]
+//
float64x2_t test_vreinterpretq_f64_s32(int32x4_t a) {
return vreinterpretq_f64_s32(a);
}
-// CHECK-LABEL: @test_vreinterpretq_f64_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <2 x double>
-// CHECK: ret <2 x double> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vreinterpretq_f64_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <2 x double>
+// CHECK-NEXT: ret <2 x double> [[TMP0]]
+//
float64x2_t test_vreinterpretq_f64_s64(int64x2_t a) {
return vreinterpretq_f64_s64(a);
}
-// CHECK-LABEL: @test_vreinterpretq_f64_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x double>
-// CHECK: ret <2 x double> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vreinterpretq_f64_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <2 x double>
+// CHECK-NEXT: ret <2 x double> [[TMP0]]
+//
float64x2_t test_vreinterpretq_f64_u8(uint8x16_t a) {
return vreinterpretq_f64_u8(a);
}
-// CHECK-LABEL: @test_vreinterpretq_f64_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x double>
-// CHECK: ret <2 x double> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vreinterpretq_f64_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <2 x double>
+// CHECK-NEXT: ret <2 x double> [[TMP0]]
+//
float64x2_t test_vreinterpretq_f64_u16(uint16x8_t a) {
return vreinterpretq_f64_u16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_f64_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x double>
-// CHECK: ret <2 x double> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vreinterpretq_f64_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <2 x double>
+// CHECK-NEXT: ret <2 x double> [[TMP0]]
+//
float64x2_t test_vreinterpretq_f64_u32(uint32x4_t a) {
return vreinterpretq_f64_u32(a);
}
-// CHECK-LABEL: @test_vreinterpretq_f64_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <2 x double>
-// CHECK: ret <2 x double> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vreinterpretq_f64_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <2 x double>
+// CHECK-NEXT: ret <2 x double> [[TMP0]]
+//
float64x2_t test_vreinterpretq_f64_u64(uint64x2_t a) {
return vreinterpretq_f64_u64(a);
}
-// CHECK-LABEL: @test_vreinterpretq_f64_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <2 x double>
-// CHECK: ret <2 x double> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vreinterpretq_f64_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <2 x double>
+// CHECK-NEXT: ret <2 x double> [[TMP0]]
+//
float64x2_t test_vreinterpretq_f64_f16(float16x8_t a) {
return vreinterpretq_f64_f16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_f64_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <2 x double>
-// CHECK: ret <2 x double> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vreinterpretq_f64_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <2 x double>
+// CHECK-NEXT: ret <2 x double> [[TMP0]]
+//
float64x2_t test_vreinterpretq_f64_f32(float32x4_t a) {
return vreinterpretq_f64_f32(a);
}
-// CHECK-LABEL: @test_vreinterpretq_f64_p8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x double>
-// CHECK: ret <2 x double> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vreinterpretq_f64_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <2 x double>
+// CHECK-NEXT: ret <2 x double> [[TMP0]]
+//
float64x2_t test_vreinterpretq_f64_p8(poly8x16_t a) {
return vreinterpretq_f64_p8(a);
}
-// CHECK-LABEL: @test_vreinterpretq_f64_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x double>
-// CHECK: ret <2 x double> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vreinterpretq_f64_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <2 x double>
+// CHECK-NEXT: ret <2 x double> [[TMP0]]
+//
float64x2_t test_vreinterpretq_f64_p16(poly16x8_t a) {
return vreinterpretq_f64_p16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_f64_p64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <2 x double>
-// CHECK: ret <2 x double> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vreinterpretq_f64_p64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <2 x double>
+// CHECK-NEXT: ret <2 x double> [[TMP0]]
+//
float64x2_t test_vreinterpretq_f64_p64(poly64x2_t a) {
return vreinterpretq_f64_p64(a);
}
-// CHECK-LABEL: @test_vreinterpretq_p8_s8(
-// CHECK: ret <16 x i8> %a
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_p8_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <16 x i8> [[A]]
+//
poly8x16_t test_vreinterpretq_p8_s8(int8x16_t a) {
return vreinterpretq_p8_s8(a);
}
-// CHECK-LABEL: @test_vreinterpretq_p8_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_p8_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
poly8x16_t test_vreinterpretq_p8_s16(int16x8_t a) {
return vreinterpretq_p8_s16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_p8_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_p8_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
poly8x16_t test_vreinterpretq_p8_s32(int32x4_t a) {
return vreinterpretq_p8_s32(a);
}
-// CHECK-LABEL: @test_vreinterpretq_p8_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_p8_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
poly8x16_t test_vreinterpretq_p8_s64(int64x2_t a) {
return vreinterpretq_p8_s64(a);
}
-// CHECK-LABEL: @test_vreinterpretq_p8_u8(
-// CHECK: ret <16 x i8> %a
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_p8_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <16 x i8> [[A]]
+//
poly8x16_t test_vreinterpretq_p8_u8(uint8x16_t a) {
return vreinterpretq_p8_u8(a);
}
-// CHECK-LABEL: @test_vreinterpretq_p8_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_p8_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
poly8x16_t test_vreinterpretq_p8_u16(uint16x8_t a) {
return vreinterpretq_p8_u16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_p8_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_p8_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
poly8x16_t test_vreinterpretq_p8_u32(uint32x4_t a) {
return vreinterpretq_p8_u32(a);
}
-// CHECK-LABEL: @test_vreinterpretq_p8_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_p8_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
poly8x16_t test_vreinterpretq_p8_u64(uint64x2_t a) {
return vreinterpretq_p8_u64(a);
}
-// CHECK-LABEL: @test_vreinterpretq_p8_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_p8_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
poly8x16_t test_vreinterpretq_p8_f16(float16x8_t a) {
return vreinterpretq_p8_f16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_p8_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_p8_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
poly8x16_t test_vreinterpretq_p8_f32(float32x4_t a) {
return vreinterpretq_p8_f32(a);
}
-// CHECK-LABEL: @test_vreinterpretq_p8_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_p8_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
poly8x16_t test_vreinterpretq_p8_f64(float64x2_t a) {
return vreinterpretq_p8_f64(a);
}
-// CHECK-LABEL: @test_vreinterpretq_p8_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_p8_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
poly8x16_t test_vreinterpretq_p8_p16(poly16x8_t a) {
return vreinterpretq_p8_p16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_p8_p64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_p8_p64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
poly8x16_t test_vreinterpretq_p8_p64(poly64x2_t a) {
return vreinterpretq_p8_p64(a);
}
-// CHECK-LABEL: @test_vreinterpretq_p16_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_p16_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
poly16x8_t test_vreinterpretq_p16_s8(int8x16_t a) {
return vreinterpretq_p16_s8(a);
}
-// CHECK-LABEL: @test_vreinterpretq_p16_s16(
-// CHECK: ret <8 x i16> %a
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_p16_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <8 x i16> [[A]]
+//
poly16x8_t test_vreinterpretq_p16_s16(int16x8_t a) {
return vreinterpretq_p16_s16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_p16_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_p16_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
poly16x8_t test_vreinterpretq_p16_s32(int32x4_t a) {
return vreinterpretq_p16_s32(a);
}
-// CHECK-LABEL: @test_vreinterpretq_p16_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_p16_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
poly16x8_t test_vreinterpretq_p16_s64(int64x2_t a) {
return vreinterpretq_p16_s64(a);
}
-// CHECK-LABEL: @test_vreinterpretq_p16_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_p16_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
poly16x8_t test_vreinterpretq_p16_u8(uint8x16_t a) {
return vreinterpretq_p16_u8(a);
}
-// CHECK-LABEL: @test_vreinterpretq_p16_u16(
-// CHECK: ret <8 x i16> %a
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_p16_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <8 x i16> [[A]]
+//
poly16x8_t test_vreinterpretq_p16_u16(uint16x8_t a) {
return vreinterpretq_p16_u16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_p16_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_p16_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
poly16x8_t test_vreinterpretq_p16_u32(uint32x4_t a) {
return vreinterpretq_p16_u32(a);
}
-// CHECK-LABEL: @test_vreinterpretq_p16_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_p16_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
poly16x8_t test_vreinterpretq_p16_u64(uint64x2_t a) {
return vreinterpretq_p16_u64(a);
}
-// CHECK-LABEL: @test_vreinterpretq_p16_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_p16_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
poly16x8_t test_vreinterpretq_p16_f16(float16x8_t a) {
return vreinterpretq_p16_f16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_p16_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_p16_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
poly16x8_t test_vreinterpretq_p16_f32(float32x4_t a) {
return vreinterpretq_p16_f32(a);
}
-// CHECK-LABEL: @test_vreinterpretq_p16_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_p16_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
poly16x8_t test_vreinterpretq_p16_f64(float64x2_t a) {
return vreinterpretq_p16_f64(a);
}
-// CHECK-LABEL: @test_vreinterpretq_p16_p8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_p16_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
poly16x8_t test_vreinterpretq_p16_p8(poly8x16_t a) {
return vreinterpretq_p16_p8(a);
}
-// CHECK-LABEL: @test_vreinterpretq_p16_p64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_p16_p64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
poly16x8_t test_vreinterpretq_p16_p64(poly64x2_t a) {
return vreinterpretq_p16_p64(a);
}
-// CHECK-LABEL: @test_vreinterpretq_p64_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_p64_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
poly64x2_t test_vreinterpretq_p64_s8(int8x16_t a) {
return vreinterpretq_p64_s8(a);
}
-// CHECK-LABEL: @test_vreinterpretq_p64_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_p64_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
poly64x2_t test_vreinterpretq_p64_s16(int16x8_t a) {
return vreinterpretq_p64_s16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_p64_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_p64_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
poly64x2_t test_vreinterpretq_p64_s32(int32x4_t a) {
return vreinterpretq_p64_s32(a);
}
-// CHECK-LABEL: @test_vreinterpretq_p64_s64(
-// CHECK: ret <2 x i64> %a
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_p64_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <2 x i64> [[A]]
+//
poly64x2_t test_vreinterpretq_p64_s64(int64x2_t a) {
return vreinterpretq_p64_s64(a);
}
-// CHECK-LABEL: @test_vreinterpretq_p64_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_p64_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
poly64x2_t test_vreinterpretq_p64_u8(uint8x16_t a) {
return vreinterpretq_p64_u8(a);
}
-// CHECK-LABEL: @test_vreinterpretq_p64_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_p64_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
poly64x2_t test_vreinterpretq_p64_u16(uint16x8_t a) {
return vreinterpretq_p64_u16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_p64_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_p64_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
poly64x2_t test_vreinterpretq_p64_u32(uint32x4_t a) {
return vreinterpretq_p64_u32(a);
}
-// CHECK-LABEL: @test_vreinterpretq_p64_u64(
-// CHECK: ret <2 x i64> %a
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_p64_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <2 x i64> [[A]]
+//
poly64x2_t test_vreinterpretq_p64_u64(uint64x2_t a) {
return vreinterpretq_p64_u64(a);
}
-// CHECK-LABEL: @test_vreinterpretq_p64_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_p64_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
poly64x2_t test_vreinterpretq_p64_f16(float16x8_t a) {
return vreinterpretq_p64_f16(a);
}
-// CHECK-LABEL: @test_vreinterpretq_p64_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_p64_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
poly64x2_t test_vreinterpretq_p64_f32(float32x4_t a) {
return vreinterpretq_p64_f32(a);
}
-// CHECK-LABEL: @test_vreinterpretq_p64_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_p64_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
poly64x2_t test_vreinterpretq_p64_f64(float64x2_t a) {
return vreinterpretq_p64_f64(a);
}
-// CHECK-LABEL: @test_vreinterpretq_p64_p8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_p64_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
poly64x2_t test_vreinterpretq_p64_p8(poly8x16_t a) {
return vreinterpretq_p64_p8(a);
}
-// CHECK-LABEL: @test_vreinterpretq_p64_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_p64_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
poly64x2_t test_vreinterpretq_p64_p16(poly16x8_t a) {
return vreinterpretq_p64_p16(a);
}
-// CHECK-LABEL: @test_vabds_f32(
-// CHECK: [[VABDS_F32_I:%.*]] = call float @llvm.aarch64.sisd.fabd.f32(float %a, float %b)
-// CHECK: ret float [[VABDS_F32_I]]
+// CHECK-LABEL: define dso_local float @test_vabds_f32(
+// CHECK-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABDS_F32_I:%.*]] = call float @llvm.aarch64.sisd.fabd.f32(float [[A]], float [[B]])
+// CHECK-NEXT: ret float [[VABDS_F32_I]]
+//
float32_t test_vabds_f32(float32_t a, float32_t b) {
return vabds_f32(a, b);
}
-// CHECK-LABEL: @test_vabdd_f64(
-// CHECK: [[VABDD_F64_I:%.*]] = call double @llvm.aarch64.sisd.fabd.f64(double %a, double %b)
-// CHECK: ret double [[VABDD_F64_I]]
+// CHECK-LABEL: define dso_local double @test_vabdd_f64(
+// CHECK-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABDD_F64_I:%.*]] = call double @llvm.aarch64.sisd.fabd.f64(double [[A]], double [[B]])
+// CHECK-NEXT: ret double [[VABDD_F64_I]]
+//
float64_t test_vabdd_f64(float64_t a, float64_t b) {
return vabdd_f64(a, b);
}
-// CHECK-LABEL: @test_vuqaddq_s8(
-// CHECK: entry:
-// CHECK-NEXT: [[V:%.*]] = call <16 x i8> @llvm.aarch64.neon.suqadd.v16i8(<16 x i8> %a, <16 x i8> %b)
-// CHECK-NEXT: ret <16 x i8> [[V]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vuqaddq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VUQADD_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.suqadd.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <16 x i8> [[VUQADD_I]]
+//
int8x16_t test_vuqaddq_s8(int8x16_t a, uint8x16_t b) {
return vuqaddq_s8(a, b);
}
-// CHECK-LABEL: @test_vuqaddq_s32(
-// CHECK: [[V:%.*]] = call <4 x i32> @llvm.aarch64.neon.suqadd.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK-NEXT: ret <4 x i32> [[V]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vuqaddq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VUQADD2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.suqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: ret <4 x i32> [[VUQADD2_I]]
+//
int32x4_t test_vuqaddq_s32(int32x4_t a, uint32x4_t b) {
return vuqaddq_s32(a, b);
}
-// CHECK-LABEL: @test_vuqaddq_s64(
-// CHECK: [[V:%.*]] = call <2 x i64> @llvm.aarch64.neon.suqadd.v2i64(<2 x i64> %a, <2 x i64> %b)
-// CHECK-NEXT: ret <2 x i64> [[V]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vuqaddq_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VUQADD2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.suqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[B]])
+// CHECK-NEXT: ret <2 x i64> [[VUQADD2_I]]
+//
int64x2_t test_vuqaddq_s64(int64x2_t a, uint64x2_t b) {
return vuqaddq_s64(a, b);
}
-// CHECK-LABEL: @test_vuqaddq_s16(
-// CHECK: [[V:%.*]] = call <8 x i16> @llvm.aarch64.neon.suqadd.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK-NEXT: ret <8 x i16> [[V]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vuqaddq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VUQADD2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.suqadd.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: ret <8 x i16> [[VUQADD2_I]]
+//
int16x8_t test_vuqaddq_s16(int16x8_t a, uint16x8_t b) {
return vuqaddq_s16(a, b);
}
-// CHECK-LABEL: @test_vuqadd_s8(
-// CHECK: entry:
-// CHECK-NEXT: [[V:%.*]] = call <8 x i8> @llvm.aarch64.neon.suqadd.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK-NEXT: ret <8 x i8> [[V]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vuqadd_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VUQADD_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.suqadd.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VUQADD_I]]
+//
int8x8_t test_vuqadd_s8(int8x8_t a, uint8x8_t b) {
return vuqadd_s8(a, b);
}
-// CHECK-LABEL: @test_vuqadd_s32(
-// CHECK: [[V:%.*]] = call <2 x i32> @llvm.aarch64.neon.suqadd.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK-NEXT: ret <2 x i32> [[V]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vuqadd_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VUQADD2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.suqadd.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: ret <2 x i32> [[VUQADD2_I]]
+//
int32x2_t test_vuqadd_s32(int32x2_t a, uint32x2_t b) {
return vuqadd_s32(a, b);
}
-// CHECK-LABEL: @test_vuqadd_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VUQADD2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.suqadd.v1i64(<1 x i64> %a, <1 x i64> %b)
-// CHECK: ret <1 x i64> [[VUQADD2_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vuqadd_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VUQADD2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.suqadd.v1i64(<1 x i64> [[A]], <1 x i64> [[B]])
+// CHECK-NEXT: ret <1 x i64> [[VUQADD2_I]]
+//
int64x1_t test_vuqadd_s64(int64x1_t a, uint64x1_t b) {
return vuqadd_s64(a, b);
}
-// CHECK-LABEL: @test_vuqadd_s16(
-// CHECK: [[V:%.*]] = call <4 x i16> @llvm.aarch64.neon.suqadd.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK-NEXT: ret <4 x i16> [[V]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vuqadd_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VUQADD2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.suqadd.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: ret <4 x i16> [[VUQADD2_I]]
+//
int16x4_t test_vuqadd_s16(int16x4_t a, uint16x4_t b) {
return vuqadd_s16(a, b);
}
-// CHECK-LABEL: @test_vsqadd_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VSQADD2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.usqadd.v1i64(<1 x i64> %a, <1 x i64> %b)
-// CHECK: ret <1 x i64> [[VSQADD2_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vsqadd_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSQADD2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.usqadd.v1i64(<1 x i64> [[A]], <1 x i64> [[B]])
+// CHECK-NEXT: ret <1 x i64> [[VSQADD2_I]]
+//
uint64x1_t test_vsqadd_u64(uint64x1_t a, int64x1_t b) {
return vsqadd_u64(a, b);
}
-// CHECK-LABEL: @test_vsqadd_u8(
-// CHECK: [[VSQADD_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.usqadd.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VSQADD_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vsqadd_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSQADD_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.usqadd.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VSQADD_I]]
+//
uint8x8_t test_vsqadd_u8(uint8x8_t a, int8x8_t b) {
return vsqadd_u8(a, b);
}
-// CHECK-LABEL: @test_vsqaddq_u8(
-// CHECK: [[VSQADD_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.usqadd.v16i8(<16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <16 x i8> [[VSQADD_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vsqaddq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSQADD_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.usqadd.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <16 x i8> [[VSQADD_I]]
+//
uint8x16_t test_vsqaddq_u8(uint8x16_t a, int8x16_t b) {
return vsqaddq_u8(a, b);
}
-// CHECK-LABEL: @test_vsqadd_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VSQADD2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.usqadd.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: ret <4 x i16> [[VSQADD2_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vsqadd_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSQADD2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.usqadd.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: ret <4 x i16> [[VSQADD2_I]]
+//
uint16x4_t test_vsqadd_u16(uint16x4_t a, int16x4_t b) {
return vsqadd_u16(a, b);
}
-// CHECK-LABEL: @test_vsqaddq_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VSQADD2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.usqadd.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: ret <8 x i16> [[VSQADD2_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vsqaddq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSQADD2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.usqadd.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: ret <8 x i16> [[VSQADD2_I]]
+//
uint16x8_t test_vsqaddq_u16(uint16x8_t a, int16x8_t b) {
return vsqaddq_u16(a, b);
}
-// CHECK-LABEL: @test_vsqadd_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VSQADD2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.usqadd.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: ret <2 x i32> [[VSQADD2_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vsqadd_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSQADD2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.usqadd.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: ret <2 x i32> [[VSQADD2_I]]
+//
uint32x2_t test_vsqadd_u32(uint32x2_t a, int32x2_t b) {
return vsqadd_u32(a, b);
}
-// CHECK-LABEL: @test_vsqaddq_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VSQADD2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.usqadd.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: ret <4 x i32> [[VSQADD2_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vsqaddq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSQADD2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.usqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: ret <4 x i32> [[VSQADD2_I]]
+//
uint32x4_t test_vsqaddq_u32(uint32x4_t a, int32x4_t b) {
return vsqaddq_u32(a, b);
}
-// CHECK-LABEL: @test_vsqaddq_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VSQADD2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.usqadd.v2i64(<2 x i64> %a, <2 x i64> %b)
-// CHECK: ret <2 x i64> [[VSQADD2_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vsqaddq_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSQADD2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.usqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[B]])
+// CHECK-NEXT: ret <2 x i64> [[VSQADD2_I]]
+//
uint64x2_t test_vsqaddq_u64(uint64x2_t a, int64x2_t b) {
return vsqaddq_u64(a, b);
}
-// CHECK-LABEL: @test_vabs_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[VABS1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.abs.v1i64(<1 x i64> %a)
-// CHECK: ret <1 x i64> [[VABS1_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vabs_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABS1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.abs.v1i64(<1 x i64> [[A]])
+// CHECK-NEXT: ret <1 x i64> [[VABS1_I]]
+//
int64x1_t test_vabs_s64(int64x1_t a) {
return vabs_s64(a);
}
-// CHECK-LABEL: @test_vqabs_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[VQABS_V1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqabs.v1i64(<1 x i64> %a)
-// CHECK: [[VQABS_V2_I:%.*]] = bitcast <1 x i64> [[VQABS_V1_I]] to <8 x i8>
-// CHECK: ret <1 x i64> [[VQABS_V1_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vqabs_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQABS_V1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqabs.v1i64(<1 x i64> [[A]])
+// CHECK-NEXT: ret <1 x i64> [[VQABS_V1_I]]
+//
int64x1_t test_vqabs_s64(int64x1_t a) {
return vqabs_s64(a);
}
-// CHECK-LABEL: @test_vqneg_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[VQNEG_V1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqneg.v1i64(<1 x i64> %a)
-// CHECK: [[VQNEG_V2_I:%.*]] = bitcast <1 x i64> [[VQNEG_V1_I]] to <8 x i8>
-// CHECK: ret <1 x i64> [[VQNEG_V1_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vqneg_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQNEG_V1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqneg.v1i64(<1 x i64> [[A]])
+// CHECK-NEXT: ret <1 x i64> [[VQNEG_V1_I]]
+//
int64x1_t test_vqneg_s64(int64x1_t a) {
return vqneg_s64(a);
}
-// CHECK-LABEL: @test_vneg_s64(
-// CHECK: [[SUB_I:%.*]] = sub <1 x i64> zeroinitializer, %a
-// CHECK: ret <1 x i64> [[SUB_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vneg_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <1 x i64> zeroinitializer, [[A]]
+// CHECK-NEXT: ret <1 x i64> [[SUB_I]]
+//
int64x1_t test_vneg_s64(int64x1_t a) {
return vneg_s64(a);
}
-// CHECK-LABEL: @test_vaddv_f32(
-// CHECK: [[VADDV_F32_I:%.*]] = call float @llvm.aarch64.neon.faddv.f32.v2f32(<2 x float> %a)
-// CHECK: ret float [[VADDV_F32_I]]
+// CHECK-LABEL: define dso_local float @test_vaddv_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VADDV_F32_I:%.*]] = call float @llvm.aarch64.neon.faddv.f32.v2f32(<2 x float> [[A]])
+// CHECK-NEXT: ret float [[VADDV_F32_I]]
+//
float32_t test_vaddv_f32(float32x2_t a) {
return vaddv_f32(a);
}
-// CHECK-LABEL: @test_vaddvq_f32(
-// CHECK: [[VADDVQ_F32_I:%.*]] = call float @llvm.aarch64.neon.faddv.f32.v4f32(<4 x float> %a)
-// CHECK: ret float [[VADDVQ_F32_I]]
+// CHECK-LABEL: define dso_local float @test_vaddvq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VADDVQ_F32_I:%.*]] = call float @llvm.aarch64.neon.faddv.f32.v4f32(<4 x float> [[A]])
+// CHECK-NEXT: ret float [[VADDVQ_F32_I]]
+//
float32_t test_vaddvq_f32(float32x4_t a) {
return vaddvq_f32(a);
}
-// CHECK-LABEL: @test_vaddvq_f64(
-// CHECK: [[VADDVQ_F64_I:%.*]] = call double @llvm.aarch64.neon.faddv.f64.v2f64(<2 x double> %a)
-// CHECK: ret double [[VADDVQ_F64_I]]
+// CHECK-LABEL: define dso_local double @test_vaddvq_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VADDVQ_F64_I:%.*]] = call double @llvm.aarch64.neon.faddv.f64.v2f64(<2 x double> [[A]])
+// CHECK-NEXT: ret double [[VADDVQ_F64_I]]
+//
float64_t test_vaddvq_f64(float64x2_t a) {
return vaddvq_f64(a);
}
-// CHECK-LABEL: @test_vmaxv_f32(
-// CHECK: [[VMAXV_F32_I:%.*]] = call float @llvm.aarch64.neon.fmaxv.f32.v2f32(<2 x float> %a)
-// CHECK: ret float [[VMAXV_F32_I]]
+// CHECK-LABEL: define dso_local float @test_vmaxv_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMAXV_F32_I:%.*]] = call float @llvm.aarch64.neon.fmaxv.f32.v2f32(<2 x float> [[A]])
+// CHECK-NEXT: ret float [[VMAXV_F32_I]]
+//
float32_t test_vmaxv_f32(float32x2_t a) {
return vmaxv_f32(a);
}
-// CHECK-LABEL: @test_vmaxvq_f64(
-// CHECK: [[VMAXVQ_F64_I:%.*]] = call double @llvm.aarch64.neon.fmaxv.f64.v2f64(<2 x double> %a)
-// CHECK: ret double [[VMAXVQ_F64_I]]
+// CHECK-LABEL: define dso_local double @test_vmaxvq_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMAXVQ_F64_I:%.*]] = call double @llvm.aarch64.neon.fmaxv.f64.v2f64(<2 x double> [[A]])
+// CHECK-NEXT: ret double [[VMAXVQ_F64_I]]
+//
float64_t test_vmaxvq_f64(float64x2_t a) {
return vmaxvq_f64(a);
}
-// CHECK-LABEL: @test_vminv_f32(
-// CHECK: [[VMINV_F32_I:%.*]] = call float @llvm.aarch64.neon.fminv.f32.v2f32(<2 x float> %a)
-// CHECK: ret float [[VMINV_F32_I]]
+// CHECK-LABEL: define dso_local float @test_vminv_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMINV_F32_I:%.*]] = call float @llvm.aarch64.neon.fminv.f32.v2f32(<2 x float> [[A]])
+// CHECK-NEXT: ret float [[VMINV_F32_I]]
+//
float32_t test_vminv_f32(float32x2_t a) {
return vminv_f32(a);
}
-// CHECK-LABEL: @test_vminvq_f64(
-// CHECK: [[VMINVQ_F64_I:%.*]] = call double @llvm.aarch64.neon.fminv.f64.v2f64(<2 x double> %a)
-// CHECK: ret double [[VMINVQ_F64_I]]
+// CHECK-LABEL: define dso_local double @test_vminvq_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMINVQ_F64_I:%.*]] = call double @llvm.aarch64.neon.fminv.f64.v2f64(<2 x double> [[A]])
+// CHECK-NEXT: ret double [[VMINVQ_F64_I]]
+//
float64_t test_vminvq_f64(float64x2_t a) {
return vminvq_f64(a);
}
-// CHECK-LABEL: @test_vmaxnmvq_f64(
-// CHECK: [[VMAXNMVQ_F64_I:%.*]] = call double @llvm.aarch64.neon.fmaxnmv.f64.v2f64(<2 x double> %a)
-// CHECK: ret double [[VMAXNMVQ_F64_I]]
+// CHECK-LABEL: define dso_local double @test_vmaxnmvq_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMAXNMVQ_F64_I:%.*]] = call double @llvm.aarch64.neon.fmaxnmv.f64.v2f64(<2 x double> [[A]])
+// CHECK-NEXT: ret double [[VMAXNMVQ_F64_I]]
+//
float64_t test_vmaxnmvq_f64(float64x2_t a) {
return vmaxnmvq_f64(a);
}
-// CHECK-LABEL: @test_vmaxnmv_f32(
-// CHECK: [[VMAXNMV_F32_I:%.*]] = call float @llvm.aarch64.neon.fmaxnmv.f32.v2f32(<2 x float> %a)
-// CHECK: ret float [[VMAXNMV_F32_I]]
+// CHECK-LABEL: define dso_local float @test_vmaxnmv_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMAXNMV_F32_I:%.*]] = call float @llvm.aarch64.neon.fmaxnmv.f32.v2f32(<2 x float> [[A]])
+// CHECK-NEXT: ret float [[VMAXNMV_F32_I]]
+//
float32_t test_vmaxnmv_f32(float32x2_t a) {
return vmaxnmv_f32(a);
}
-// CHECK-LABEL: @test_vminnmvq_f64(
-// CHECK: [[VMINNMVQ_F64_I:%.*]] = call double @llvm.aarch64.neon.fminnmv.f64.v2f64(<2 x double> %a)
-// CHECK: ret double [[VMINNMVQ_F64_I]]
+// CHECK-LABEL: define dso_local double @test_vminnmvq_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMINNMVQ_F64_I:%.*]] = call double @llvm.aarch64.neon.fminnmv.f64.v2f64(<2 x double> [[A]])
+// CHECK-NEXT: ret double [[VMINNMVQ_F64_I]]
+//
float64_t test_vminnmvq_f64(float64x2_t a) {
return vminnmvq_f64(a);
}
-// CHECK-LABEL: @test_vminnmv_f32(
-// CHECK: [[VMINNMV_F32_I:%.*]] = call float @llvm.aarch64.neon.fminnmv.f32.v2f32(<2 x float> %a)
-// CHECK: ret float [[VMINNMV_F32_I]]
+// CHECK-LABEL: define dso_local float @test_vminnmv_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMINNMV_F32_I:%.*]] = call float @llvm.aarch64.neon.fminnmv.f32.v2f32(<2 x float> [[A]])
+// CHECK-NEXT: ret float [[VMINNMV_F32_I]]
+//
float32_t test_vminnmv_f32(float32x2_t a) {
return vminnmv_f32(a);
}
-// CHECK-LABEL: @test_vpaddq_s64(
-// CHECK: [[VPADDQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> %a, <2 x i64> %b)
-// CHECK: [[VPADDQ_V3_I:%.*]] = bitcast <2 x i64> [[VPADDQ_V2_I]] to <16 x i8>
-// CHECK: ret <2 x i64> [[VPADDQ_V2_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vpaddq_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPADDQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> [[A]], <2 x i64> [[B]])
+// CHECK-NEXT: ret <2 x i64> [[VPADDQ_V2_I]]
+//
int64x2_t test_vpaddq_s64(int64x2_t a, int64x2_t b) {
return vpaddq_s64(a, b);
}
-// CHECK-LABEL: @test_vpaddq_u64(
-// CHECK: [[VPADDQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> %a, <2 x i64> %b)
-// CHECK: [[VPADDQ_V3_I:%.*]] = bitcast <2 x i64> [[VPADDQ_V2_I]] to <16 x i8>
-// CHECK: ret <2 x i64> [[VPADDQ_V2_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vpaddq_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPADDQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> [[A]], <2 x i64> [[B]])
+// CHECK-NEXT: ret <2 x i64> [[VPADDQ_V2_I]]
+//
uint64x2_t test_vpaddq_u64(uint64x2_t a, uint64x2_t b) {
return vpaddq_u64(a, b);
}
-// CHECK-LABEL: @test_vpaddd_u64(
-// CHECK: [[VPADDD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.uaddv.i64.v2i64(<2 x i64> %a)
-// CHECK: ret i64 [[VPADDD_U64_I]]
+// CHECK-LABEL: define dso_local i64 @test_vpaddd_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPADDD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.uaddv.i64.v2i64(<2 x i64> [[A]])
+// CHECK-NEXT: ret i64 [[VPADDD_U64_I]]
+//
uint64_t test_vpaddd_u64(uint64x2_t a) {
return vpaddd_u64(a);
}
-// CHECK-LABEL: @test_vaddvq_s64(
-// CHECK: [[VADDVQ_S64_I:%.*]] = call i64 @llvm.aarch64.neon.saddv.i64.v2i64(<2 x i64> %a)
-// CHECK: ret i64 [[VADDVQ_S64_I]]
+// CHECK-LABEL: define dso_local i64 @test_vaddvq_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VADDVQ_S64_I:%.*]] = call i64 @llvm.aarch64.neon.saddv.i64.v2i64(<2 x i64> [[A]])
+// CHECK-NEXT: ret i64 [[VADDVQ_S64_I]]
+//
int64_t test_vaddvq_s64(int64x2_t a) {
return vaddvq_s64(a);
}
-// CHECK-LABEL: @test_vaddvq_u64(
-// CHECK: [[VADDVQ_U64_I:%.*]] = call i64 @llvm.aarch64.neon.uaddv.i64.v2i64(<2 x i64> %a)
-// CHECK: ret i64 [[VADDVQ_U64_I]]
+// CHECK-LABEL: define dso_local i64 @test_vaddvq_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VADDVQ_U64_I:%.*]] = call i64 @llvm.aarch64.neon.uaddv.i64.v2i64(<2 x i64> [[A]])
+// CHECK-NEXT: ret i64 [[VADDVQ_U64_I]]
+//
uint64_t test_vaddvq_u64(uint64x2_t a) {
return vaddvq_u64(a);
}
-// CHECK-LABEL: @test_vadd_f64(
-// CHECK: [[ADD_I:%.*]] = fadd <1 x double> %a, %b
-// CHECK: ret <1 x double> [[ADD_I]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vadd_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = fadd <1 x double> [[A]], [[B]]
+// CHECK-NEXT: ret <1 x double> [[ADD_I]]
+//
float64x1_t test_vadd_f64(float64x1_t a, float64x1_t b) {
return vadd_f64(a, b);
}
-// CHECK-LABEL: @test_vmul_f64(
-// CHECK: [[MUL_I:%.*]] = fmul <1 x double> %a, %b
-// CHECK: ret <1 x double> [[MUL_I]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vmul_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = fmul <1 x double> [[A]], [[B]]
+// CHECK-NEXT: ret <1 x double> [[MUL_I]]
+//
float64x1_t test_vmul_f64(float64x1_t a, float64x1_t b) {
return vmul_f64(a, b);
}
-// CHECK-LABEL: @test_vdiv_f64(
-// CHECK: [[DIV_I:%.*]] = fdiv <1 x double> %a, %b
-// CHECK: ret <1 x double> [[DIV_I]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vdiv_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[DIV_I:%.*]] = fdiv <1 x double> [[A]], [[B]]
+// CHECK-NEXT: ret <1 x double> [[DIV_I]]
+//
float64x1_t test_vdiv_f64(float64x1_t a, float64x1_t b) {
return vdiv_f64(a, b);
}
-// CHECK-LABEL: @test_vmla_f64(
-// CHECK: [[MUL_I:%.*]] = fmul <1 x double> %b, %c
-// CHECK: [[ADD_I:%.*]] = fadd <1 x double> %a, [[MUL_I]]
-// CHECK: ret <1 x double> [[ADD_I]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vmla_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <1 x double> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = fmul <1 x double> [[B]], [[C]]
+// CHECK-NEXT: [[ADD_I:%.*]] = fadd <1 x double> [[A]], [[MUL_I]]
+// CHECK-NEXT: ret <1 x double> [[ADD_I]]
+//
float64x1_t test_vmla_f64(float64x1_t a, float64x1_t b, float64x1_t c) {
return vmla_f64(a, b, c);
}
-// CHECK-LABEL: @test_vmls_f64(
-// CHECK: [[MUL_I:%.*]] = fmul <1 x double> %b, %c
-// CHECK: [[SUB_I:%.*]] = fsub <1 x double> %a, [[MUL_I]]
-// CHECK: ret <1 x double> [[SUB_I]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vmls_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <1 x double> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = fmul <1 x double> [[B]], [[C]]
+// CHECK-NEXT: [[SUB_I:%.*]] = fsub <1 x double> [[A]], [[MUL_I]]
+// CHECK-NEXT: ret <1 x double> [[SUB_I]]
+//
float64x1_t test_vmls_f64(float64x1_t a, float64x1_t b, float64x1_t c) {
return vmls_f64(a, b, c);
}
-// CHECK-LABEL: @test_vfma_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <1 x double> %c to <8 x i8>
-// CHECK: [[TMP3:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> %b, <1 x double> %c, <1 x double> %a)
-// CHECK: ret <1 x double> [[TMP3]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vfma_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <1 x double> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[B]], <1 x double> [[C]], <1 x double> [[A]])
+// CHECK-NEXT: ret <1 x double> [[TMP0]]
+//
float64x1_t test_vfma_f64(float64x1_t a, float64x1_t b, float64x1_t c) {
return vfma_f64(a, b, c);
}
-// CHECK-LABEL: @test_vfms_f64(
-// CHECK: [[SUB_I:%.*]] = fneg <1 x double> %b
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x double> [[SUB_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <1 x double> %c to <8 x i8>
-// CHECK: [[TMP3:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[SUB_I]], <1 x double> %c, <1 x double> %a)
-// CHECK: ret <1 x double> [[TMP3]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vfms_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <1 x double> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[FNEG_I:%.*]] = fneg <1 x double> [[B]]
+// CHECK-NEXT: [[TMP0:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[FNEG_I]], <1 x double> [[C]], <1 x double> [[A]])
+// CHECK-NEXT: ret <1 x double> [[TMP0]]
+//
float64x1_t test_vfms_f64(float64x1_t a, float64x1_t b, float64x1_t c) {
return vfms_f64(a, b, c);
}
-// CHECK-LABEL: @test_vsub_f64(
-// CHECK: [[SUB_I:%.*]] = fsub <1 x double> %a, %b
-// CHECK: ret <1 x double> [[SUB_I]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vsub_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = fsub <1 x double> [[A]], [[B]]
+// CHECK-NEXT: ret <1 x double> [[SUB_I]]
+//
float64x1_t test_vsub_f64(float64x1_t a, float64x1_t b) {
return vsub_f64(a, b);
}
-// CHECK-LABEL: @test_vabd_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
-// CHECK: [[VABD2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.fabd.v1f64(<1 x double> %a, <1 x double> %b)
-// CHECK: ret <1 x double> [[VABD2_I]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vabd_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.fabd.v1f64(<1 x double> [[A]], <1 x double> [[B]])
+// CHECK-NEXT: ret <1 x double> [[VABD2_I]]
+//
float64x1_t test_vabd_f64(float64x1_t a, float64x1_t b) {
return vabd_f64(a, b);
}
-// CHECK-LABEL: @test_vmax_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
-// CHECK: [[VMAX2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.fmax.v1f64(<1 x double> %a, <1 x double> %b)
-// CHECK: ret <1 x double> [[VMAX2_I]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vmax_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMAX2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.fmax.v1f64(<1 x double> [[A]], <1 x double> [[B]])
+// CHECK-NEXT: ret <1 x double> [[VMAX2_I]]
+//
float64x1_t test_vmax_f64(float64x1_t a, float64x1_t b) {
return vmax_f64(a, b);
}
-// CHECK-LABEL: @test_vmin_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
-// CHECK: [[VMIN2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.fmin.v1f64(<1 x double> %a, <1 x double> %b)
-// CHECK: ret <1 x double> [[VMIN2_I]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vmin_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMIN2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.fmin.v1f64(<1 x double> [[A]], <1 x double> [[B]])
+// CHECK-NEXT: ret <1 x double> [[VMIN2_I]]
+//
float64x1_t test_vmin_f64(float64x1_t a, float64x1_t b) {
return vmin_f64(a, b);
}
-// CHECK-LABEL: @test_vmaxnm_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
-// CHECK: [[VMAXNM2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.fmaxnm.v1f64(<1 x double> %a, <1 x double> %b)
-// CHECK: ret <1 x double> [[VMAXNM2_I]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vmaxnm_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMAXNM2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.fmaxnm.v1f64(<1 x double> [[A]], <1 x double> [[B]])
+// CHECK-NEXT: ret <1 x double> [[VMAXNM2_I]]
+//
float64x1_t test_vmaxnm_f64(float64x1_t a, float64x1_t b) {
return vmaxnm_f64(a, b);
}
-// CHECK-LABEL: @test_vminnm_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
-// CHECK: [[VMINNM2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.fminnm.v1f64(<1 x double> %a, <1 x double> %b)
-// CHECK: ret <1 x double> [[VMINNM2_I]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vminnm_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMINNM2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.fminnm.v1f64(<1 x double> [[A]], <1 x double> [[B]])
+// CHECK-NEXT: ret <1 x double> [[VMINNM2_I]]
+//
float64x1_t test_vminnm_f64(float64x1_t a, float64x1_t b) {
return vminnm_f64(a, b);
}
-// CHECK-LABEL: @test_vabs_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[VABS1_I:%.*]] = call <1 x double> @llvm.fabs.v1f64(<1 x double> %a)
-// CHECK: ret <1 x double> [[VABS1_I]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vabs_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABS1_I:%.*]] = call <1 x double> @llvm.fabs.v1f64(<1 x double> [[A]])
+// CHECK-NEXT: ret <1 x double> [[VABS1_I]]
+//
float64x1_t test_vabs_f64(float64x1_t a) {
return vabs_f64(a);
}
-// CHECK-LABEL: @test_vneg_f64(
-// CHECK: [[SUB_I:%.*]] = fneg <1 x double> %a
-// CHECK: ret <1 x double> [[SUB_I]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vneg_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[FNEG_I:%.*]] = fneg <1 x double> [[A]]
+// CHECK-NEXT: ret <1 x double> [[FNEG_I]]
+//
float64x1_t test_vneg_f64(float64x1_t a) {
return vneg_f64(a);
}
-// CHECK-LABEL: @test_vcvt_s64_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtzs.v1i64.v1f64(<1 x double> %a)
-// CHECK: ret <1 x i64> [[TMP1]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vcvt_s64_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTZ1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtzs.v1i64.v1f64(<1 x double> [[A]])
+// CHECK-NEXT: ret <1 x i64> [[VCVTZ1_I]]
+//
int64x1_t test_vcvt_s64_f64(float64x1_t a) {
return vcvt_s64_f64(a);
}
-// CHECK-LABEL: @test_vcvt_u64_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtzu.v1i64.v1f64(<1 x double> %a)
-// CHECK: ret <1 x i64> [[TMP1]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vcvt_u64_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTZ1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtzu.v1i64.v1f64(<1 x double> [[A]])
+// CHECK-NEXT: ret <1 x i64> [[VCVTZ1_I]]
+//
uint64x1_t test_vcvt_u64_f64(float64x1_t a) {
return vcvt_u64_f64(a);
}
-// CHECK-LABEL: @test_vcvtn_s64_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[VCVTN1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtns.v1i64.v1f64(<1 x double> %a)
-// CHECK: ret <1 x i64> [[VCVTN1_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vcvtn_s64_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTN1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtns.v1i64.v1f64(<1 x double> [[A]])
+// CHECK-NEXT: ret <1 x i64> [[VCVTN1_I]]
+//
int64x1_t test_vcvtn_s64_f64(float64x1_t a) {
return vcvtn_s64_f64(a);
}
-// CHECK-LABEL: @test_vcvtn_u64_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[VCVTN1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtnu.v1i64.v1f64(<1 x double> %a)
-// CHECK: ret <1 x i64> [[VCVTN1_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vcvtn_u64_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTN1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtnu.v1i64.v1f64(<1 x double> [[A]])
+// CHECK-NEXT: ret <1 x i64> [[VCVTN1_I]]
+//
uint64x1_t test_vcvtn_u64_f64(float64x1_t a) {
return vcvtn_u64_f64(a);
}
-// CHECK-LABEL: @test_vcvtp_s64_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[VCVTP1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtps.v1i64.v1f64(<1 x double> %a)
-// CHECK: ret <1 x i64> [[VCVTP1_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vcvtp_s64_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTP1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtps.v1i64.v1f64(<1 x double> [[A]])
+// CHECK-NEXT: ret <1 x i64> [[VCVTP1_I]]
+//
int64x1_t test_vcvtp_s64_f64(float64x1_t a) {
return vcvtp_s64_f64(a);
}
-// CHECK-LABEL: @test_vcvtp_u64_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[VCVTP1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtpu.v1i64.v1f64(<1 x double> %a)
-// CHECK: ret <1 x i64> [[VCVTP1_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vcvtp_u64_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTP1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtpu.v1i64.v1f64(<1 x double> [[A]])
+// CHECK-NEXT: ret <1 x i64> [[VCVTP1_I]]
+//
uint64x1_t test_vcvtp_u64_f64(float64x1_t a) {
return vcvtp_u64_f64(a);
}
-// CHECK-LABEL: @test_vcvtm_s64_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[VCVTM1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtms.v1i64.v1f64(<1 x double> %a)
-// CHECK: ret <1 x i64> [[VCVTM1_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vcvtm_s64_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTM1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtms.v1i64.v1f64(<1 x double> [[A]])
+// CHECK-NEXT: ret <1 x i64> [[VCVTM1_I]]
+//
int64x1_t test_vcvtm_s64_f64(float64x1_t a) {
return vcvtm_s64_f64(a);
}
-// CHECK-LABEL: @test_vcvtm_u64_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[VCVTM1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtmu.v1i64.v1f64(<1 x double> %a)
-// CHECK: ret <1 x i64> [[VCVTM1_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vcvtm_u64_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTM1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtmu.v1i64.v1f64(<1 x double> [[A]])
+// CHECK-NEXT: ret <1 x i64> [[VCVTM1_I]]
+//
uint64x1_t test_vcvtm_u64_f64(float64x1_t a) {
return vcvtm_u64_f64(a);
}
-// CHECK-LABEL: @test_vcvta_s64_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[VCVTA1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtas.v1i64.v1f64(<1 x double> %a)
-// CHECK: ret <1 x i64> [[VCVTA1_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vcvta_s64_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTA1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtas.v1i64.v1f64(<1 x double> [[A]])
+// CHECK-NEXT: ret <1 x i64> [[VCVTA1_I]]
+//
int64x1_t test_vcvta_s64_f64(float64x1_t a) {
return vcvta_s64_f64(a);
}
-// CHECK-LABEL: @test_vcvta_u64_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[VCVTA1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtau.v1i64.v1f64(<1 x double> %a)
-// CHECK: ret <1 x i64> [[VCVTA1_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vcvta_u64_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTA1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtau.v1i64.v1f64(<1 x double> [[A]])
+// CHECK-NEXT: ret <1 x i64> [[VCVTA1_I]]
+//
uint64x1_t test_vcvta_u64_f64(float64x1_t a) {
return vcvta_u64_f64(a);
}
-// CHECK-LABEL: @test_vcvt_f64_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[VCVT_I:%.*]] = sitofp <1 x i64> %a to <1 x double>
-// CHECK: ret <1 x double> [[VCVT_I]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vcvt_f64_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVT_I:%.*]] = sitofp <1 x i64> [[A]] to <1 x double>
+// CHECK-NEXT: ret <1 x double> [[VCVT_I]]
+//
float64x1_t test_vcvt_f64_s64(int64x1_t a) {
return vcvt_f64_s64(a);
}
-// CHECK-LABEL: @test_vcvt_f64_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[VCVT_I:%.*]] = uitofp <1 x i64> %a to <1 x double>
-// CHECK: ret <1 x double> [[VCVT_I]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vcvt_f64_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVT_I:%.*]] = uitofp <1 x i64> [[A]] to <1 x double>
+// CHECK-NEXT: ret <1 x double> [[VCVT_I]]
+//
float64x1_t test_vcvt_f64_u64(uint64x1_t a) {
return vcvt_f64_u64(a);
}
-// CHECK-LABEL: @test_vcvt_n_s64_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
-// CHECK: [[VCVT_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.vcvtfp2fxs.v1i64.v1f64(<1 x double> [[VCVT_N]], i32 64)
-// CHECK: ret <1 x i64> [[VCVT_N1]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vcvt_n_s64_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVT_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.vcvtfp2fxs.v1i64.v1f64(<1 x double> [[A]], i32 64)
+// CHECK-NEXT: ret <1 x i64> [[VCVT_N1]]
+//
int64x1_t test_vcvt_n_s64_f64(float64x1_t a) {
return vcvt_n_s64_f64(a, 64);
}
-// CHECK-LABEL: @test_vcvt_n_u64_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
-// CHECK: [[VCVT_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.vcvtfp2fxu.v1i64.v1f64(<1 x double> [[VCVT_N]], i32 64)
-// CHECK: ret <1 x i64> [[VCVT_N1]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vcvt_n_u64_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVT_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.vcvtfp2fxu.v1i64.v1f64(<1 x double> [[A]], i32 64)
+// CHECK-NEXT: ret <1 x i64> [[VCVT_N1]]
+//
uint64x1_t test_vcvt_n_u64_f64(float64x1_t a) {
return vcvt_n_u64_f64(a, 64);
}
-// CHECK-LABEL: @test_vcvt_n_f64_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VCVT_N1:%.*]] = call <1 x double> @llvm.aarch64.neon.vcvtfxs2fp.v1f64.v1i64(<1 x i64> [[VCVT_N]], i32 64)
-// CHECK: ret <1 x double> [[VCVT_N1]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vcvt_n_f64_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVT_N1:%.*]] = call <1 x double> @llvm.aarch64.neon.vcvtfxs2fp.v1f64.v1i64(<1 x i64> [[A]], i32 64)
+// CHECK-NEXT: ret <1 x double> [[VCVT_N1]]
+//
float64x1_t test_vcvt_n_f64_s64(int64x1_t a) {
return vcvt_n_f64_s64(a, 64);
}
-// CHECK-LABEL: @test_vcvt_n_f64_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VCVT_N1:%.*]] = call <1 x double> @llvm.aarch64.neon.vcvtfxu2fp.v1f64.v1i64(<1 x i64> [[VCVT_N]], i32 64)
-// CHECK: ret <1 x double> [[VCVT_N1]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vcvt_n_f64_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVT_N1:%.*]] = call <1 x double> @llvm.aarch64.neon.vcvtfxu2fp.v1f64.v1i64(<1 x i64> [[A]], i32 64)
+// CHECK-NEXT: ret <1 x double> [[VCVT_N1]]
+//
float64x1_t test_vcvt_n_f64_u64(uint64x1_t a) {
return vcvt_n_f64_u64(a, 64);
}
-// CHECK-LABEL: @test_vrndn_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[VRNDN1_I:%.*]] = call <1 x double> @llvm.roundeven.v1f64(<1 x double> %a)
-// CHECK: ret <1 x double> [[VRNDN1_I]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vrndn_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRNDN1_I:%.*]] = call <1 x double> @llvm.roundeven.v1f64(<1 x double> [[A]])
+// CHECK-NEXT: ret <1 x double> [[VRNDN1_I]]
+//
float64x1_t test_vrndn_f64(float64x1_t a) {
return vrndn_f64(a);
}
-// CHECK-LABEL: @test_vrnda_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[VRNDA1_I:%.*]] = call <1 x double> @llvm.round.v1f64(<1 x double> %a)
-// CHECK: ret <1 x double> [[VRNDA1_I]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vrnda_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRNDA1_I:%.*]] = call <1 x double> @llvm.round.v1f64(<1 x double> [[A]])
+// CHECK-NEXT: ret <1 x double> [[VRNDA1_I]]
+//
float64x1_t test_vrnda_f64(float64x1_t a) {
return vrnda_f64(a);
}
-// CHECK-LABEL: @test_vrndp_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[VRNDP1_I:%.*]] = call <1 x double> @llvm.ceil.v1f64(<1 x double> %a)
-// CHECK: ret <1 x double> [[VRNDP1_I]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vrndp_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRNDP1_I:%.*]] = call <1 x double> @llvm.ceil.v1f64(<1 x double> [[A]])
+// CHECK-NEXT: ret <1 x double> [[VRNDP1_I]]
+//
float64x1_t test_vrndp_f64(float64x1_t a) {
return vrndp_f64(a);
}
-// CHECK-LABEL: @test_vrndm_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[VRNDM1_I:%.*]] = call <1 x double> @llvm.floor.v1f64(<1 x double> %a)
-// CHECK: ret <1 x double> [[VRNDM1_I]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vrndm_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRNDM1_I:%.*]] = call <1 x double> @llvm.floor.v1f64(<1 x double> [[A]])
+// CHECK-NEXT: ret <1 x double> [[VRNDM1_I]]
+//
float64x1_t test_vrndm_f64(float64x1_t a) {
return vrndm_f64(a);
}
-// CHECK-LABEL: @test_vrndx_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[VRNDX1_I:%.*]] = call <1 x double> @llvm.rint.v1f64(<1 x double> %a)
-// CHECK: ret <1 x double> [[VRNDX1_I]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vrndx_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRNDX1_I:%.*]] = call <1 x double> @llvm.rint.v1f64(<1 x double> [[A]])
+// CHECK-NEXT: ret <1 x double> [[VRNDX1_I]]
+//
float64x1_t test_vrndx_f64(float64x1_t a) {
return vrndx_f64(a);
}
-// CHECK-LABEL: @test_vrnd_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[VRNDZ1_I:%.*]] = call <1 x double> @llvm.trunc.v1f64(<1 x double> %a)
-// CHECK: ret <1 x double> [[VRNDZ1_I]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vrnd_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRNDZ1_I:%.*]] = call <1 x double> @llvm.trunc.v1f64(<1 x double> [[A]])
+// CHECK-NEXT: ret <1 x double> [[VRNDZ1_I]]
+//
float64x1_t test_vrnd_f64(float64x1_t a) {
return vrnd_f64(a);
}
-// CHECK-LABEL: @test_vrndi_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[VRNDI1_I:%.*]] = call <1 x double> @llvm.nearbyint.v1f64(<1 x double> %a)
-// CHECK: ret <1 x double> [[VRNDI1_I]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vrndi_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRNDI_V1_I:%.*]] = call <1 x double> @llvm.nearbyint.v1f64(<1 x double> [[A]])
+// CHECK-NEXT: ret <1 x double> [[VRNDI_V1_I]]
+//
float64x1_t test_vrndi_f64(float64x1_t a) {
return vrndi_f64(a);
}
-// CHECK-LABEL: @test_vrsqrte_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[VRSQRTE_V1_I:%.*]] = call <1 x double> @llvm.aarch64.neon.frsqrte.v1f64(<1 x double> %a)
-// CHECK: ret <1 x double> [[VRSQRTE_V1_I]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vrsqrte_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSQRTE_V1_I:%.*]] = call <1 x double> @llvm.aarch64.neon.frsqrte.v1f64(<1 x double> [[A]])
+// CHECK-NEXT: ret <1 x double> [[VRSQRTE_V1_I]]
+//
float64x1_t test_vrsqrte_f64(float64x1_t a) {
return vrsqrte_f64(a);
}
-// CHECK-LABEL: @test_vrecpe_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[VRECPE_V1_I:%.*]] = call <1 x double> @llvm.aarch64.neon.frecpe.v1f64(<1 x double> %a)
-// CHECK: ret <1 x double> [[VRECPE_V1_I]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vrecpe_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRECPE_V1_I:%.*]] = call <1 x double> @llvm.aarch64.neon.frecpe.v1f64(<1 x double> [[A]])
+// CHECK-NEXT: ret <1 x double> [[VRECPE_V1_I]]
+//
float64x1_t test_vrecpe_f64(float64x1_t a) {
return vrecpe_f64(a);
}
-// CHECK-LABEL: @test_vsqrt_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[VSQRT_I:%.*]] = call <1 x double> @llvm.sqrt.v1f64(<1 x double> %a)
-// CHECK: ret <1 x double> [[VSQRT_I]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vsqrt_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSQRT_I:%.*]] = call <1 x double> @llvm.sqrt.v1f64(<1 x double> [[A]])
+// CHECK-NEXT: ret <1 x double> [[VSQRT_I]]
+//
float64x1_t test_vsqrt_f64(float64x1_t a) {
return vsqrt_f64(a);
}
-// CHECK-LABEL: @test_vrecps_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
-// CHECK: [[VRECPS_V2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.frecps.v1f64(<1 x double> %a, <1 x double> %b)
-// CHECK: ret <1 x double> [[VRECPS_V2_I]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vrecps_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRECPS_V2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.frecps.v1f64(<1 x double> [[A]], <1 x double> [[B]])
+// CHECK-NEXT: ret <1 x double> [[VRECPS_V2_I]]
+//
float64x1_t test_vrecps_f64(float64x1_t a, float64x1_t b) {
return vrecps_f64(a, b);
}
-// CHECK-LABEL: @test_vrsqrts_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
-// CHECK: [[VRSQRTS_V2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.frsqrts.v1f64(<1 x double> %a, <1 x double> %b)
-// CHECK: [[VRSQRTS_V3_I:%.*]] = bitcast <1 x double> [[VRSQRTS_V2_I]] to <8 x i8>
-// CHECK: ret <1 x double> [[VRSQRTS_V2_I]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vrsqrts_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSQRTS_V2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.frsqrts.v1f64(<1 x double> [[A]], <1 x double> [[B]])
+// CHECK-NEXT: ret <1 x double> [[VRSQRTS_V2_I]]
+//
float64x1_t test_vrsqrts_f64(float64x1_t a, float64x1_t b) {
return vrsqrts_f64(a, b);
}
-// CHECK-LABEL: @test_vminv_s32(
-// CHECK: [[VMINV_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v2i32(<2 x i32> %a)
-// CHECK: ret i32 [[VMINV_S32_I]]
+// CHECK-LABEL: define dso_local i32 @test_vminv_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMINV_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v2i32(<2 x i32> [[A]])
+// CHECK-NEXT: ret i32 [[VMINV_S32_I]]
+//
int32_t test_vminv_s32(int32x2_t a) {
return vminv_s32(a);
}
-// CHECK-LABEL: @test_vminv_u32(
-// CHECK: [[VMINV_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v2i32(<2 x i32> %a)
-// CHECK: ret i32 [[VMINV_U32_I]]
+// CHECK-LABEL: define dso_local i32 @test_vminv_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMINV_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v2i32(<2 x i32> [[A]])
+// CHECK-NEXT: ret i32 [[VMINV_U32_I]]
+//
uint32_t test_vminv_u32(uint32x2_t a) {
return vminv_u32(a);
}
-// CHECK-LABEL: @test_vmaxv_s32(
-// CHECK: [[VMAXV_S32_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v2i32(<2 x i32> %a)
-// CHECK: ret i32 [[VMAXV_S32_I]]
+// CHECK-LABEL: define dso_local i32 @test_vmaxv_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMAXV_S32_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v2i32(<2 x i32> [[A]])
+// CHECK-NEXT: ret i32 [[VMAXV_S32_I]]
+//
int32_t test_vmaxv_s32(int32x2_t a) {
return vmaxv_s32(a);
}
-// CHECK-LABEL: @test_vmaxv_u32(
-// CHECK: [[VMAXV_U32_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v2i32(<2 x i32> %a)
-// CHECK: ret i32 [[VMAXV_U32_I]]
+// CHECK-LABEL: define dso_local i32 @test_vmaxv_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMAXV_U32_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v2i32(<2 x i32> [[A]])
+// CHECK-NEXT: ret i32 [[VMAXV_U32_I]]
+//
uint32_t test_vmaxv_u32(uint32x2_t a) {
return vmaxv_u32(a);
}
-// CHECK-LABEL: @test_vaddv_s32(
-// CHECK: [[VADDV_S32_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v2i32(<2 x i32> %a)
-// CHECK: ret i32 [[VADDV_S32_I]]
+// CHECK-LABEL: define dso_local i32 @test_vaddv_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VADDV_S32_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v2i32(<2 x i32> [[A]])
+// CHECK-NEXT: ret i32 [[VADDV_S32_I]]
+//
int32_t test_vaddv_s32(int32x2_t a) {
return vaddv_s32(a);
}
-// CHECK-LABEL: @test_vaddv_u32(
-// CHECK: [[VADDV_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v2i32(<2 x i32> %a)
-// CHECK: ret i32 [[VADDV_U32_I]]
+// CHECK-LABEL: define dso_local i32 @test_vaddv_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VADDV_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v2i32(<2 x i32> [[A]])
+// CHECK-NEXT: ret i32 [[VADDV_U32_I]]
+//
uint32_t test_vaddv_u32(uint32x2_t a) {
return vaddv_u32(a);
}
-// CHECK-LABEL: @test_vaddlv_s32(
-// CHECK: [[VADDLV_S32_I:%.*]] = call i64 @llvm.aarch64.neon.saddlv.i64.v2i32(<2 x i32> %a)
-// CHECK: ret i64 [[VADDLV_S32_I]]
+// CHECK-LABEL: define dso_local i64 @test_vaddlv_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VADDLV_S32_I:%.*]] = call i64 @llvm.aarch64.neon.saddlv.i64.v2i32(<2 x i32> [[A]])
+// CHECK-NEXT: ret i64 [[VADDLV_S32_I]]
+//
int64_t test_vaddlv_s32(int32x2_t a) {
return vaddlv_s32(a);
}
-// CHECK-LABEL: @test_vaddlv_u32(
-// CHECK: [[VADDLV_U32_I:%.*]] = call i64 @llvm.aarch64.neon.uaddlv.i64.v2i32(<2 x i32> %a)
-// CHECK: ret i64 [[VADDLV_U32_I]]
+// CHECK-LABEL: define dso_local i64 @test_vaddlv_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VADDLV_U32_I:%.*]] = call i64 @llvm.aarch64.neon.uaddlv.i64.v2i32(<2 x i32> [[A]])
+// CHECK-NEXT: ret i64 [[VADDLV_U32_I]]
+//
uint64_t test_vaddlv_u32(uint32x2_t a) {
return vaddlv_u32(a);
}
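A note on the vcvt_n_* checks above, since the immediate there is easy to
misread: it is a fixed-point scale, not a lane index. vcvt_n_f64_s64(a, n)
computes a * 2^-n and vcvt_n_s64_f64(a, n) computes a * 2^n truncated toward
zero, with n constrained to 1..64, which is why these tests use the boundary
value 64. Below is a minimal standalone sketch, not part of the patch, that
can be compiled with an AArch64 clang (e.g. clang --target=aarch64-linux-gnu
-O1) to see the same llvm.aarch64.neon.vcvtfxs2fp / vcvtfp2fxs calls in the
emitted IR:

#include <arm_neon.h>
#include <stdio.h>

int main(void) {
  int64x1_t i = vdup_n_s64(3);            // raw fixed-point value 3
  float64x1_t f = vcvt_n_f64_s64(i, 1);   // 3 * 2^-1 == 1.5
  int64x1_t r = vcvt_n_s64_f64(f, 1);     // 1.5 * 2^1 == 3, truncated
  printf("%f %lld\n", vget_lane_f64(f, 0), (long long)vget_lane_s64(r, 0));
  return 0;
}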
diff --git a/clang/test/CodeGen/AArch64/neon-ldst-one-rcpc3.c b/clang/test/CodeGen/AArch64/neon-ldst-one-rcpc3.c
index 40c5a0a598d68c..67619ae34b0b72 100644
--- a/clang/test/CodeGen/AArch64/neon-ldst-one-rcpc3.c
+++ b/clang/test/CodeGen/AArch64/neon-ldst-one-rcpc3.c
@@ -1,7 +1,7 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
// RUN: %clang_cc1 -triple aarch64 -target-feature +neon \
// RUN: -target-feature +rcpc3 -disable-O0-optnone -emit-llvm -o - %s \
-// RUN: | opt -S -passes=mem2reg | FileCheck %s
+// RUN: | opt -S -passes=mem2reg,instcombine | FileCheck %s
// REQUIRES: aarch64-registered-target
@@ -10,10 +10,8 @@
// CHECK-LABEL: @test_vldap1q_lane_u64(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK-NEXT: [[TMP2:%.*]] = load atomic i64, ptr [[A:%.*]] acquire, align 8
-// CHECK-NEXT: [[VLDAP1_LANE:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[TMP2]], i32 1
+// CHECK-NEXT: [[TMP0:%.*]] = load atomic i64, ptr [[A:%.*]] acquire, align 8
+// CHECK-NEXT: [[VLDAP1_LANE:%.*]] = insertelement <2 x i64> [[B:%.*]], i64 [[TMP0]], i64 1
// CHECK-NEXT: ret <2 x i64> [[VLDAP1_LANE]]
//
uint64x2_t test_vldap1q_lane_u64(uint64_t *a, uint64x2_t b) {
@@ -22,10 +20,8 @@ uint64x2_t test_vldap1q_lane_u64(uint64_t *a, uint64x2_t b) {
// CHECK-LABEL: @test_vldap1q_lane_s64(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK-NEXT: [[TMP2:%.*]] = load atomic i64, ptr [[A:%.*]] acquire, align 8
-// CHECK-NEXT: [[VLDAP1_LANE:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[TMP2]], i32 1
+// CHECK-NEXT: [[TMP0:%.*]] = load atomic i64, ptr [[A:%.*]] acquire, align 8
+// CHECK-NEXT: [[VLDAP1_LANE:%.*]] = insertelement <2 x i64> [[B:%.*]], i64 [[TMP0]], i64 1
// CHECK-NEXT: ret <2 x i64> [[VLDAP1_LANE]]
//
int64x2_t test_vldap1q_lane_s64(int64_t *a, int64x2_t b) {
@@ -34,10 +30,8 @@ int64x2_t test_vldap1q_lane_s64(int64_t *a, int64x2_t b) {
// CHECK-LABEL: @test_vldap1q_lane_f64(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[B:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
-// CHECK-NEXT: [[TMP2:%.*]] = load atomic double, ptr [[A:%.*]] acquire, align 8
-// CHECK-NEXT: [[VLDAP1_LANE:%.*]] = insertelement <2 x double> [[TMP1]], double [[TMP2]], i32 1
+// CHECK-NEXT: [[TMP0:%.*]] = load atomic double, ptr [[A:%.*]] acquire, align 8
+// CHECK-NEXT: [[VLDAP1_LANE:%.*]] = insertelement <2 x double> [[B:%.*]], double [[TMP0]], i64 1
// CHECK-NEXT: ret <2 x double> [[VLDAP1_LANE]]
//
float64x2_t test_vldap1q_lane_f64(float64_t *a, float64x2_t b) {
@@ -46,10 +40,8 @@ float64x2_t test_vldap1q_lane_f64(float64_t *a, float64x2_t b) {
// CHECK-LABEL: @test_vldap1q_lane_p64(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK-NEXT: [[TMP2:%.*]] = load atomic i64, ptr [[A:%.*]] acquire, align 8
-// CHECK-NEXT: [[VLDAP1_LANE:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[TMP2]], i32 1
+// CHECK-NEXT: [[TMP0:%.*]] = load atomic i64, ptr [[A:%.*]] acquire, align 8
+// CHECK-NEXT: [[VLDAP1_LANE:%.*]] = insertelement <2 x i64> [[B:%.*]], i64 [[TMP0]], i64 1
// CHECK-NEXT: ret <2 x i64> [[VLDAP1_LANE]]
//
poly64x2_t test_vldap1q_lane_p64(poly64_t *a, poly64x2_t b) {
@@ -58,10 +50,8 @@ poly64x2_t test_vldap1q_lane_p64(poly64_t *a, poly64x2_t b) {
// CHECK-LABEL: @test_vldap1_lane_u64(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK-NEXT: [[TMP2:%.*]] = load atomic i64, ptr [[A:%.*]] acquire, align 8
-// CHECK-NEXT: [[VLDAP1_LANE:%.*]] = insertelement <1 x i64> [[TMP1]], i64 [[TMP2]], i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load atomic i64, ptr [[A:%.*]] acquire, align 8
+// CHECK-NEXT: [[VLDAP1_LANE:%.*]] = insertelement <1 x i64> poison, i64 [[TMP0]], i64 0
// CHECK-NEXT: ret <1 x i64> [[VLDAP1_LANE]]
//
uint64x1_t test_vldap1_lane_u64(uint64_t *a, uint64x1_t b) {
@@ -70,10 +60,8 @@ uint64x1_t test_vldap1_lane_u64(uint64_t *a, uint64x1_t b) {
// CHECK-LABEL: @test_vldap1_lane_s64(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK-NEXT: [[TMP2:%.*]] = load atomic i64, ptr [[A:%.*]] acquire, align 8
-// CHECK-NEXT: [[VLDAP1_LANE:%.*]] = insertelement <1 x i64> [[TMP1]], i64 [[TMP2]], i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load atomic i64, ptr [[A:%.*]] acquire, align 8
+// CHECK-NEXT: [[VLDAP1_LANE:%.*]] = insertelement <1 x i64> poison, i64 [[TMP0]], i64 0
// CHECK-NEXT: ret <1 x i64> [[VLDAP1_LANE]]
//
int64x1_t test_vldap1_lane_s64(int64_t *a, int64x1_t b) {
@@ -82,10 +70,8 @@ int64x1_t test_vldap1_lane_s64(int64_t *a, int64x1_t b) {
// CHECK-LABEL: @test_vldap1_lane_f64(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
-// CHECK-NEXT: [[TMP2:%.*]] = load atomic double, ptr [[A:%.*]] acquire, align 8
-// CHECK-NEXT: [[VLDAP1_LANE:%.*]] = insertelement <1 x double> [[TMP1]], double [[TMP2]], i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load atomic double, ptr [[A:%.*]] acquire, align 8
+// CHECK-NEXT: [[VLDAP1_LANE:%.*]] = insertelement <1 x double> poison, double [[TMP0]], i64 0
// CHECK-NEXT: ret <1 x double> [[VLDAP1_LANE]]
//
float64x1_t test_vldap1_lane_f64(float64_t *a, float64x1_t b) {
@@ -94,10 +80,8 @@ float64x1_t test_vldap1_lane_f64(float64_t *a, float64x1_t b) {
// CHECK-LABEL: @test_vldap1_lane_p64(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK-NEXT: [[TMP2:%.*]] = load atomic i64, ptr [[A:%.*]] acquire, align 8
-// CHECK-NEXT: [[VLDAP1_LANE:%.*]] = insertelement <1 x i64> [[TMP1]], i64 [[TMP2]], i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load atomic i64, ptr [[A:%.*]] acquire, align 8
+// CHECK-NEXT: [[VLDAP1_LANE:%.*]] = insertelement <1 x i64> poison, i64 [[TMP0]], i64 0
// CHECK-NEXT: ret <1 x i64> [[VLDAP1_LANE]]
//
poly64x1_t test_vldap1_lane_p64(poly64_t *a, poly64x1_t b) {
@@ -106,10 +90,8 @@ poly64x1_t test_vldap1_lane_p64(poly64_t *a, poly64x1_t b) {
// CHECK-LABEL: @test_vstl1q_lane_u64(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
-// CHECK-NEXT: store atomic i64 [[TMP2]], ptr [[A:%.*]] release, align 8
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i64> [[B:%.*]], i64 1
+// CHECK-NEXT: store atomic i64 [[TMP0]], ptr [[A:%.*]] release, align 8
// CHECK-NEXT: ret void
//
void test_vstl1q_lane_u64(uint64_t *a, uint64x2_t b) {
@@ -118,10 +100,8 @@ void test_vstl1q_lane_u64(uint64_t *a, uint64x2_t b) {
// CHECK-LABEL: @test_vstl1q_lane_s64(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
-// CHECK-NEXT: store atomic i64 [[TMP2]], ptr [[A:%.*]] release, align 8
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i64> [[B:%.*]], i64 1
+// CHECK-NEXT: store atomic i64 [[TMP0]], ptr [[A:%.*]] release, align 8
// CHECK-NEXT: ret void
//
void test_vstl1q_lane_s64(int64_t *a, int64x2_t b) {
@@ -130,10 +110,8 @@ void test_vstl1q_lane_s64(int64_t *a, int64x2_t b) {
// CHECK-LABEL: @test_vstl1q_lane_f64(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[B:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
-// CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
-// CHECK-NEXT: store atomic double [[TMP2]], ptr [[A:%.*]] release, align 8
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x double> [[B:%.*]], i64 1
+// CHECK-NEXT: store atomic double [[TMP0]], ptr [[A:%.*]] release, align 8
// CHECK-NEXT: ret void
//
void test_vstl1q_lane_f64(float64_t *a, float64x2_t b) {
@@ -142,10 +120,8 @@ void test_vstl1q_lane_f64(float64_t *a, float64x2_t b) {
// CHECK-LABEL: @test_vstl1q_lane_p64(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
-// CHECK-NEXT: store atomic i64 [[TMP2]], ptr [[A:%.*]] release, align 8
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i64> [[B:%.*]], i64 1
+// CHECK-NEXT: store atomic i64 [[TMP0]], ptr [[A:%.*]] release, align 8
// CHECK-NEXT: ret void
//
void test_vstl1q_lane_p64(poly64_t *a, poly64x2_t b) {
@@ -154,10 +130,8 @@ void test_vstl1q_lane_p64(poly64_t *a, poly64x2_t b) {
// CHECK-LABEL: @test_vstl1_lane_u64(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK-NEXT: [[TMP2:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0
-// CHECK-NEXT: store atomic i64 [[TMP2]], ptr [[A:%.*]] release, align 8
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i64> [[B:%.*]], i64 0
+// CHECK-NEXT: store atomic i64 [[TMP0]], ptr [[A:%.*]] release, align 8
// CHECK-NEXT: ret void
//
void test_vstl1_lane_u64(uint64_t *a, uint64x1_t b) {
@@ -166,10 +140,8 @@ void test_vstl1_lane_u64(uint64_t *a, uint64x1_t b) {
// CHECK-LABEL: @test_vstl1_lane_s64(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK-NEXT: [[TMP2:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0
-// CHECK-NEXT: store atomic i64 [[TMP2]], ptr [[A:%.*]] release, align 8
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i64> [[B:%.*]], i64 0
+// CHECK-NEXT: store atomic i64 [[TMP0]], ptr [[A:%.*]] release, align 8
// CHECK-NEXT: ret void
//
void test_vstl1_lane_s64(int64_t *a, int64x1_t b) {
@@ -178,10 +150,8 @@ void test_vstl1_lane_s64(int64_t *a, int64x1_t b) {
// CHECK-LABEL: @test_vstl1_lane_f64(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
-// CHECK-NEXT: [[TMP2:%.*]] = extractelement <1 x double> [[TMP1]], i32 0
-// CHECK-NEXT: store atomic double [[TMP2]], ptr [[A:%.*]] release, align 8
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x double> [[B:%.*]], i64 0
+// CHECK-NEXT: store atomic double [[TMP0]], ptr [[A:%.*]] release, align 8
// CHECK-NEXT: ret void
//
void test_vstl1_lane_f64(float64_t *a, float64x1_t b) {
@@ -190,10 +160,8 @@ void test_vstl1_lane_f64(float64_t *a, float64x1_t b) {
// CHECK-LABEL: @test_vstl1_lane_p64(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK-NEXT: [[TMP2:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0
-// CHECK-NEXT: store atomic i64 [[TMP2]], ptr [[A:%.*]] release, align 8
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i64> [[B:%.*]], i64 0
+// CHECK-NEXT: store atomic i64 [[TMP0]], ptr [[A:%.*]] release, align 8
// CHECK-NEXT: ret void
//
void test_vstl1_lane_p64(poly64_t *a, poly64x1_t b) {
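The rcpc3 hunks above show why instcombine was added to the opt pipeline in
these tests: it deletes the no-op bitcast round-trips the frontend emits,
canonicalizes lane indices to i64, and, for the 64x1 variants, replaces the
source vector with poison because its only lane is overwritten. What survives
in the regenerated checks is the essential contract of the intrinsics: an
acquire load inserted into one lane, and a release store of one extracted
lane. A scalar C11 sketch of that contract, not part of the patch, with a
plain struct standing in for uint64x2_t, illustrative helper names, and the
lane fixed at 1 as in the tests:

#include <stdatomic.h>
#include <stdint.h>

typedef struct { uint64_t lane[2]; } u64x2;   // stand-in for uint64x2_t

// Like vldap1q_lane_u64(a, b, 1): load-acquire one lane, keep the rest of b.
static u64x2 ldap1q_lane_u64(_Atomic uint64_t *a, u64x2 b) {
  b.lane[1] = atomic_load_explicit(a, memory_order_acquire);
  return b;
}

// Like vstl1q_lane_u64(a, b, 1): store-release one extracted lane.
static void stl1q_lane_u64(_Atomic uint64_t *a, u64x2 b) {
  atomic_store_explicit(a, b.lane[1], memory_order_release);
}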
diff --git a/clang/test/CodeGen/AArch64/neon-ldst-one.c b/clang/test/CodeGen/AArch64/neon-ldst-one.c
index b57df40d8e5c9e..b273428ccba966 100644
--- a/clang/test/CodeGen/AArch64/neon-ldst-one.c
+++ b/clang/test/CodeGen/AArch64/neon-ldst-one.c
@@ -1,5757 +1,7550 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
// RUN: -disable-O0-optnone -emit-llvm -o - %s \
-// RUN: | opt -S -passes=mem2reg | FileCheck %s
+// RUN: | opt -S -passes=mem2reg,instcombine | FileCheck %s
// REQUIRES: aarch64-registered-target || arm-registered-target
#include <arm_neon.h>
-// CHECK-LABEL: define{{.*}} <16 x i8> @test_vld1q_dup_u8(ptr noundef %a) #0 {
-// CHECK: [[TMP0:%.*]] = load i8, ptr %a
-// CHECK: [[TMP1:%.*]] = insertelement <16 x i8> poison, i8 [[TMP0]], i32 0
-// CHECK: [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer
-// CHECK: ret <16 x i8> [[LANE]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vld1q_dup_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x i8> poison, i8 [[TMP0]], i64 0
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> poison, <16 x i32> zeroinitializer
+// CHECK-NEXT: ret <16 x i8> [[LANE]]
+//
uint8x16_t test_vld1q_dup_u8(uint8_t *a) {
return vld1q_dup_u8(a);
}
-// CHECK-LABEL: define{{.*}} <8 x i16> @test_vld1q_dup_u16(ptr noundef %a) #0 {
-// CHECK: [[TMP2:%.*]] = load i16, ptr %a
-// CHECK: [[TMP3:%.*]] = insertelement <8 x i16> poison, i16 [[TMP2]], i32 0
-// CHECK: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer
-// CHECK: ret <8 x i16> [[LANE]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vld1q_dup_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A]], align 2
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i16> poison, i16 [[TMP0]], i64 0
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> zeroinitializer
+// CHECK-NEXT: ret <8 x i16> [[LANE]]
+//
uint16x8_t test_vld1q_dup_u16(uint16_t *a) {
return vld1q_dup_u16(a);
}
-// CHECK-LABEL: define{{.*}} <4 x i32> @test_vld1q_dup_u32(ptr noundef %a) #0 {
-// CHECK: [[TMP2:%.*]] = load i32, ptr %a
-// CHECK: [[TMP3:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i32 0
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP3]], <4 x i32> zeroinitializer
-// CHECK: ret <4 x i32> [[LANE]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vld1q_dup_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i64 0
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: ret <4 x i32> [[LANE]]
+//
uint32x4_t test_vld1q_dup_u32(uint32_t *a) {
return vld1q_dup_u32(a);
}
-// CHECK-LABEL: define{{.*}} <2 x i64> @test_vld1q_dup_u64(ptr noundef %a) #0 {
-// CHECK: [[TMP2:%.*]] = load i64, ptr %a
-// CHECK: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[TMP2]], i32 0
-// CHECK: [[LANE:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP3]], <2 x i32> zeroinitializer
-// CHECK: ret <2 x i64> [[LANE]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vld1q_dup_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> poison, i64 [[TMP0]], i64 0
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: ret <2 x i64> [[LANE]]
+//
uint64x2_t test_vld1q_dup_u64(uint64_t *a) {
return vld1q_dup_u64(a);
}
-// CHECK-LABEL: define{{.*}} <16 x i8> @test_vld1q_dup_s8(ptr noundef %a) #0 {
-// CHECK: [[TMP0:%.*]] = load i8, ptr %a
-// CHECK: [[TMP1:%.*]] = insertelement <16 x i8> poison, i8 [[TMP0]], i32 0
-// CHECK: [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer
-// CHECK: ret <16 x i8> [[LANE]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vld1q_dup_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x i8> poison, i8 [[TMP0]], i64 0
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> poison, <16 x i32> zeroinitializer
+// CHECK-NEXT: ret <16 x i8> [[LANE]]
+//
int8x16_t test_vld1q_dup_s8(int8_t *a) {
return vld1q_dup_s8(a);
}
-// CHECK-LABEL: define{{.*}} <8 x i16> @test_vld1q_dup_s16(ptr noundef %a) #0 {
-// CHECK: [[TMP2:%.*]] = load i16, ptr %a
-// CHECK: [[TMP3:%.*]] = insertelement <8 x i16> poison, i16 [[TMP2]], i32 0
-// CHECK: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer
-// CHECK: ret <8 x i16> [[LANE]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vld1q_dup_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A]], align 2
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i16> poison, i16 [[TMP0]], i64 0
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> zeroinitializer
+// CHECK-NEXT: ret <8 x i16> [[LANE]]
+//
int16x8_t test_vld1q_dup_s16(int16_t *a) {
return vld1q_dup_s16(a);
}
-// CHECK-LABEL: define{{.*}} <4 x i32> @test_vld1q_dup_s32(ptr noundef %a) #0 {
-// CHECK: [[TMP2:%.*]] = load i32, ptr %a
-// CHECK: [[TMP3:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i32 0
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP3]], <4 x i32> zeroinitializer
-// CHECK: ret <4 x i32> [[LANE]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vld1q_dup_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i64 0
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: ret <4 x i32> [[LANE]]
+//
int32x4_t test_vld1q_dup_s32(int32_t *a) {
return vld1q_dup_s32(a);
}
-// CHECK-LABEL: define{{.*}} <2 x i64> @test_vld1q_dup_s64(ptr noundef %a) #0 {
-// CHECK: [[TMP2:%.*]] = load i64, ptr %a
-// CHECK: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[TMP2]], i32 0
-// CHECK: [[LANE:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP3]], <2 x i32> zeroinitializer
-// CHECK: ret <2 x i64> [[LANE]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vld1q_dup_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> poison, i64 [[TMP0]], i64 0
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: ret <2 x i64> [[LANE]]
+//
int64x2_t test_vld1q_dup_s64(int64_t *a) {
return vld1q_dup_s64(a);
}
-// CHECK-LABEL: define{{.*}} <8 x half> @test_vld1q_dup_f16(ptr noundef %a) #0 {
-// CHECK: [[TMP2:%.*]] = load half, ptr %a
-// CHECK: [[TMP3:%.*]] = insertelement <8 x half> poison, half [[TMP2]], i32 0
-// CHECK: [[LANE:%.*]] = shufflevector <8 x half> [[TMP3]], <8 x half> [[TMP3]], <8 x i32> zeroinitializer
-// CHECK: ret <8 x half> [[LANE]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vld1q_dup_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load half, ptr [[A]], align 2
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x half> poison, half [[TMP0]], i64 0
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> poison, <8 x i32> zeroinitializer
+// CHECK-NEXT: ret <8 x half> [[LANE]]
+//
float16x8_t test_vld1q_dup_f16(float16_t *a) {
return vld1q_dup_f16(a);
}
-// CHECK-LABEL: define{{.*}} <4 x float> @test_vld1q_dup_f32(ptr noundef %a) #0 {
-// CHECK: [[TMP2:%.*]] = load float, ptr %a
-// CHECK: [[TMP3:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0
-// CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP3]], <4 x i32> zeroinitializer
-// CHECK: ret <4 x float> [[LANE]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vld1q_dup_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[A]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i64 0
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: ret <4 x float> [[LANE]]
+//
float32x4_t test_vld1q_dup_f32(float32_t *a) {
return vld1q_dup_f32(a);
}
-// CHECK-LABEL: define{{.*}} <2 x double> @test_vld1q_dup_f64(ptr noundef %a) #0 {
-// CHECK: [[TMP2:%.*]] = load double, ptr %a
-// CHECK: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[TMP2]], i32 0
-// CHECK: [[LANE:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP3]], <2 x i32> zeroinitializer
-// CHECK: ret <2 x double> [[LANE]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vld1q_dup_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load double, ptr [[A]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[TMP0]], i64 0
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: ret <2 x double> [[LANE]]
+//
float64x2_t test_vld1q_dup_f64(float64_t *a) {
return vld1q_dup_f64(a);
}
-// CHECK-LABEL: define{{.*}} <16 x i8> @test_vld1q_dup_p8(ptr noundef %a) #0 {
-// CHECK: [[TMP0:%.*]] = load i8, ptr %a
-// CHECK: [[TMP1:%.*]] = insertelement <16 x i8> poison, i8 [[TMP0]], i32 0
-// CHECK: [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer
-// CHECK: ret <16 x i8> [[LANE]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vld1q_dup_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x i8> poison, i8 [[TMP0]], i64 0
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> poison, <16 x i32> zeroinitializer
+// CHECK-NEXT: ret <16 x i8> [[LANE]]
+//
poly8x16_t test_vld1q_dup_p8(poly8_t *a) {
return vld1q_dup_p8(a);
}
-// CHECK-LABEL: define{{.*}} <8 x i16> @test_vld1q_dup_p16(ptr noundef %a) #0 {
-// CHECK: [[TMP2:%.*]] = load i16, ptr %a
-// CHECK: [[TMP3:%.*]] = insertelement <8 x i16> poison, i16 [[TMP2]], i32 0
-// CHECK: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer
-// CHECK: ret <8 x i16> [[LANE]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vld1q_dup_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A]], align 2
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i16> poison, i16 [[TMP0]], i64 0
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> zeroinitializer
+// CHECK-NEXT: ret <8 x i16> [[LANE]]
+//
poly16x8_t test_vld1q_dup_p16(poly16_t *a) {
return vld1q_dup_p16(a);
}
-// CHECK-LABEL: define{{.*}} <2 x i64> @test_vld1q_dup_p64(ptr noundef %a) #0 {
-// CHECK: [[TMP2:%.*]] = load i64, ptr %a
-// CHECK: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[TMP2]], i32 0
-// CHECK: [[LANE:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP3]], <2 x i32> zeroinitializer
-// CHECK: ret <2 x i64> [[LANE]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vld1q_dup_p64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> poison, i64 [[TMP0]], i64 0
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: ret <2 x i64> [[LANE]]
+//
poly64x2_t test_vld1q_dup_p64(poly64_t *a) {
return vld1q_dup_p64(a);
}
-// CHECK-LABEL: define{{.*}} <8 x i8> @test_vld1_dup_u8(ptr noundef %a) #0 {
-// CHECK: [[TMP0:%.*]] = load i8, ptr %a
-// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[TMP0]], i32 0
-// CHECK: [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
-// CHECK: ret <8 x i8> [[LANE]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vld1_dup_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[TMP0]], i64 0
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> poison, <8 x i32> zeroinitializer
+// CHECK-NEXT: ret <8 x i8> [[LANE]]
+//
uint8x8_t test_vld1_dup_u8(uint8_t *a) {
return vld1_dup_u8(a);
}
-// CHECK-LABEL: define{{.*}} <4 x i16> @test_vld1_dup_u16(ptr noundef %a) #0 {
-// CHECK: [[TMP2:%.*]] = load i16, ptr %a
-// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i32 0
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer
-// CHECK: ret <4 x i16> [[LANE]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vld1_dup_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A]], align 2
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[TMP0]], i64 0
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: ret <4 x i16> [[LANE]]
+//
uint16x4_t test_vld1_dup_u16(uint16_t *a) {
return vld1_dup_u16(a);
}
-// CHECK-LABEL: define{{.*}} <2 x i32> @test_vld1_dup_u32(ptr noundef %a) #0 {
-// CHECK: [[TMP2:%.*]] = load i32, ptr %a
-// CHECK: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[TMP2]], i32 0
-// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP3]], <2 x i32> zeroinitializer
-// CHECK: ret <2 x i32> [[LANE]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vld1_dup_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i64 0
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: ret <2 x i32> [[LANE]]
+//
uint32x2_t test_vld1_dup_u32(uint32_t *a) {
return vld1_dup_u32(a);
}
-// CHECK-LABEL: define{{.*}} <1 x i64> @test_vld1_dup_u64(ptr noundef %a) #0 {
-// CHECK: [[TMP2:%.*]] = load i64, ptr %a
-// CHECK: [[TMP3:%.*]] = insertelement <1 x i64> poison, i64 [[TMP2]], i32 0
-// CHECK: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP3]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
-// CHECK: ret <1 x i64> [[LANE]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vld1_dup_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <1 x i64> poison, i64 [[TMP0]], i64 0
+// CHECK-NEXT: ret <1 x i64> [[TMP1]]
+//
uint64x1_t test_vld1_dup_u64(uint64_t *a) {
return vld1_dup_u64(a);
}
-// CHECK-LABEL: define{{.*}} <8 x i8> @test_vld1_dup_s8(ptr noundef %a) #0 {
-// CHECK: [[TMP0:%.*]] = load i8, ptr %a
-// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[TMP0]], i32 0
-// CHECK: [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
-// CHECK: ret <8 x i8> [[LANE]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vld1_dup_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[TMP0]], i64 0
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> poison, <8 x i32> zeroinitializer
+// CHECK-NEXT: ret <8 x i8> [[LANE]]
+//
int8x8_t test_vld1_dup_s8(int8_t *a) {
return vld1_dup_s8(a);
}
-// CHECK-LABEL: define{{.*}} <4 x i16> @test_vld1_dup_s16(ptr noundef %a) #0 {
-// CHECK: [[TMP2:%.*]] = load i16, ptr %a
-// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i32 0
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer
-// CHECK: ret <4 x i16> [[LANE]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vld1_dup_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A]], align 2
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[TMP0]], i64 0
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: ret <4 x i16> [[LANE]]
+//
int16x4_t test_vld1_dup_s16(int16_t *a) {
return vld1_dup_s16(a);
}
-// CHECK-LABEL: define{{.*}} <2 x i32> @test_vld1_dup_s32(ptr noundef %a) #0 {
-// CHECK: [[TMP2:%.*]] = load i32, ptr %a
-// CHECK: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[TMP2]], i32 0
-// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP3]], <2 x i32> zeroinitializer
-// CHECK: ret <2 x i32> [[LANE]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vld1_dup_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i64 0
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: ret <2 x i32> [[LANE]]
+//
int32x2_t test_vld1_dup_s32(int32_t *a) {
return vld1_dup_s32(a);
}
-// CHECK-LABEL: define{{.*}} <1 x i64> @test_vld1_dup_s64(ptr noundef %a) #0 {
-// CHECK: [[TMP2:%.*]] = load i64, ptr %a
-// CHECK: [[TMP3:%.*]] = insertelement <1 x i64> poison, i64 [[TMP2]], i32 0
-// CHECK: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP3]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
-// CHECK: ret <1 x i64> [[LANE]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vld1_dup_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <1 x i64> poison, i64 [[TMP0]], i64 0
+// CHECK-NEXT: ret <1 x i64> [[TMP1]]
+//
int64x1_t test_vld1_dup_s64(int64_t *a) {
return vld1_dup_s64(a);
}
-// CHECK-LABEL: define{{.*}} <4 x half> @test_vld1_dup_f16(ptr noundef %a) #0 {
-// CHECK: [[TMP2:%.*]] = load half, ptr %a
-// CHECK: [[TMP3:%.*]] = insertelement <4 x half> poison, half [[TMP2]], i32 0
-// CHECK: [[LANE:%.*]] = shufflevector <4 x half> [[TMP3]], <4 x half> [[TMP3]], <4 x i32> zeroinitializer
-// CHECK: ret <4 x half> [[LANE]]
+// CHECK-LABEL: define dso_local <4 x half> @test_vld1_dup_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load half, ptr [[A]], align 2
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x half> poison, half [[TMP0]], i64 0
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: ret <4 x half> [[LANE]]
+//
float16x4_t test_vld1_dup_f16(float16_t *a) {
return vld1_dup_f16(a);
}
-// CHECK-LABEL: define{{.*}} <2 x float> @test_vld1_dup_f32(ptr noundef %a) #0 {
-// CHECK: [[TMP2:%.*]] = load float, ptr %a
-// CHECK: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[TMP2]], i32 0
-// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> zeroinitializer
-// CHECK: ret <2 x float> [[LANE]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vld1_dup_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[A]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i64 0
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: ret <2 x float> [[LANE]]
+//
float32x2_t test_vld1_dup_f32(float32_t *a) {
return vld1_dup_f32(a);
}
-// CHECK-LABEL: define{{.*}} <1 x double> @test_vld1_dup_f64(ptr noundef %a) #0 {
-// CHECK: [[TMP2:%.*]] = load double, ptr %a
-// CHECK: [[TMP3:%.*]] = insertelement <1 x double> poison, double [[TMP2]], i32 0
-// CHECK: [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <1 x i32> zeroinitializer
-// CHECK: ret <1 x double> [[LANE]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vld1_dup_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load double, ptr [[A]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <1 x double> poison, double [[TMP0]], i64 0
+// CHECK-NEXT: ret <1 x double> [[TMP1]]
+//
float64x1_t test_vld1_dup_f64(float64_t *a) {
return vld1_dup_f64(a);
}
-// CHECK-LABEL: define{{.*}} <8 x i8> @test_vld1_dup_p8(ptr noundef %a) #0 {
-// CHECK: [[TMP0:%.*]] = load i8, ptr %a
-// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[TMP0]], i32 0
-// CHECK: [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
-// CHECK: ret <8 x i8> [[LANE]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vld1_dup_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[TMP0]], i64 0
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> poison, <8 x i32> zeroinitializer
+// CHECK-NEXT: ret <8 x i8> [[LANE]]
+//
poly8x8_t test_vld1_dup_p8(poly8_t *a) {
return vld1_dup_p8(a);
}
-// CHECK-LABEL: define{{.*}} <4 x i16> @test_vld1_dup_p16(ptr noundef %a) #0 {
-// CHECK: [[TMP2:%.*]] = load i16, ptr %a
-// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i32 0
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer
-// CHECK: ret <4 x i16> [[LANE]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vld1_dup_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A]], align 2
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[TMP0]], i64 0
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: ret <4 x i16> [[LANE]]
+//
poly16x4_t test_vld1_dup_p16(poly16_t *a) {
return vld1_dup_p16(a);
}
-// CHECK-LABEL: define{{.*}} <1 x i64> @test_vld1_dup_p64(ptr noundef %a) #0 {
-// CHECK: [[TMP2:%.*]] = load i64, ptr %a
-// CHECK: [[TMP3:%.*]] = insertelement <1 x i64> poison, i64 [[TMP2]], i32 0
-// CHECK: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP3]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
-// CHECK: ret <1 x i64> [[LANE]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vld1_dup_p64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <1 x i64> poison, i64 [[TMP0]], i64 0
+// CHECK-NEXT: ret <1 x i64> [[TMP1]]
+//
poly64x1_t test_vld1_dup_p64(poly64_t *a) {
return vld1_dup_p64(a);
}
-// CHECK-LABEL: define{{.*}} %struct.uint64x2x2_t @test_vld2q_dup_u64(ptr noundef %a) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x2x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.uint64x2x2_t, align 16
-// CHECK: [[VLD2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2r.v2i64.p0(ptr %a)
-// CHECK: store { <2 x i64>, <2 x i64> } [[VLD2]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.uint64x2x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.uint64x2x2_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.uint64x2x2_t @test_vld2q_dup_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT64X2X2_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT64X2X2_T]], align 16
+// CHECK-NEXT: [[VLD2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2r.v2i64.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD2_ELT:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[VLD2]], 0
+// CHECK-NEXT: store <2 x i64> [[VLD2_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD2_ELT2:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[VLD2]], 1
+// CHECK-NEXT: store <2 x i64> [[VLD2_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <2 x i64>] poison, <2 x i64> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT3:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK4:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT3]], align 16
+// CHECK-NEXT: [[DOTUNPACK5:%.*]] = insertvalue [2 x <2 x i64>] [[TMP0]], <2 x i64> [[DOTUNPACK_UNPACK4]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_UINT64X2X2_T]] poison, [2 x <2 x i64>] [[DOTUNPACK5]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT64X2X2_T]] [[TMP1]]
+//
uint64x2x2_t test_vld2q_dup_u64(uint64_t *a) {
return vld2q_dup_u64(a);
}
-// CHECK-LABEL: define{{.*}} %struct.int64x2x2_t @test_vld2q_dup_s64(ptr noundef %a) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x2x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int64x2x2_t, align 16
-// CHECK: [[VLD2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2r.v2i64.p0(ptr %a)
-// CHECK: store { <2 x i64>, <2 x i64> } [[VLD2]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.int64x2x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.int64x2x2_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.int64x2x2_t @test_vld2q_dup_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT64X2X2_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT64X2X2_T]], align 16
+// CHECK-NEXT: [[VLD2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2r.v2i64.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD2_ELT:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[VLD2]], 0
+// CHECK-NEXT: store <2 x i64> [[VLD2_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD2_ELT2:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[VLD2]], 1
+// CHECK-NEXT: store <2 x i64> [[VLD2_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <2 x i64>] poison, <2 x i64> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT3:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK4:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT3]], align 16
+// CHECK-NEXT: [[DOTUNPACK5:%.*]] = insertvalue [2 x <2 x i64>] [[TMP0]], <2 x i64> [[DOTUNPACK_UNPACK4]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_INT64X2X2_T]] poison, [2 x <2 x i64>] [[DOTUNPACK5]], 0
+// CHECK-NEXT: ret [[STRUCT_INT64X2X2_T]] [[TMP1]]
+//
int64x2x2_t test_vld2q_dup_s64(int64_t *a) {
return vld2q_dup_s64(a);
}
-// CHECK-LABEL: define{{.*}} %struct.float64x2x2_t @test_vld2q_dup_f64(ptr noundef %a) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x2x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.float64x2x2_t, align 16
-// CHECK: [[VLD2:%.*]] = call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2r.v2f64.p0(ptr %a)
-// CHECK: store { <2 x double>, <2 x double> } [[VLD2]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.float64x2x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.float64x2x2_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.float64x2x2_t @test_vld2q_dup_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT64X2X2_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT64X2X2_T]], align 16
+// CHECK-NEXT: [[VLD2:%.*]] = call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2r.v2f64.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD2_ELT:%.*]] = extractvalue { <2 x double>, <2 x double> } [[VLD2]], 0
+// CHECK-NEXT: store <2 x double> [[VLD2_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD2_ELT2:%.*]] = extractvalue { <2 x double>, <2 x double> } [[VLD2]], 1
+// CHECK-NEXT: store <2 x double> [[VLD2_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x double>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <2 x double>] poison, <2 x double> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT3:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK4:%.*]] = load <2 x double>, ptr [[DOTUNPACK_ELT3]], align 16
+// CHECK-NEXT: [[DOTUNPACK5:%.*]] = insertvalue [2 x <2 x double>] [[TMP0]], <2 x double> [[DOTUNPACK_UNPACK4]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_FLOAT64X2X2_T]] poison, [2 x <2 x double>] [[DOTUNPACK5]], 0
+// CHECK-NEXT: ret [[STRUCT_FLOAT64X2X2_T]] [[TMP1]]
+//
float64x2x2_t test_vld2q_dup_f64(float64_t *a) {
return vld2q_dup_f64(a);
}
-// CHECK-LABEL: define{{.*}} %struct.poly64x2x2_t @test_vld2q_dup_p64(ptr noundef %a) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x2x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.poly64x2x2_t, align 16
-// CHECK: [[VLD2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2r.v2i64.p0(ptr %a)
-// CHECK: store { <2 x i64>, <2 x i64> } [[VLD2]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.poly64x2x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.poly64x2x2_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.poly64x2x2_t @test_vld2q_dup_p64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY64X2X2_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY64X2X2_T]], align 16
+// CHECK-NEXT: [[VLD2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2r.v2i64.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD2_ELT:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[VLD2]], 0
+// CHECK-NEXT: store <2 x i64> [[VLD2_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD2_ELT2:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[VLD2]], 1
+// CHECK-NEXT: store <2 x i64> [[VLD2_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <2 x i64>] poison, <2 x i64> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT3:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK4:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT3]], align 16
+// CHECK-NEXT: [[DOTUNPACK5:%.*]] = insertvalue [2 x <2 x i64>] [[TMP0]], <2 x i64> [[DOTUNPACK_UNPACK4]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_POLY64X2X2_T]] poison, [2 x <2 x i64>] [[DOTUNPACK5]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY64X2X2_T]] [[TMP1]]
+//
poly64x2x2_t test_vld2q_dup_p64(poly64_t *a) {
return vld2q_dup_p64(a);
}
-// CHECK-LABEL: define{{.*}} %struct.float64x1x2_t @test_vld2_dup_f64(ptr noundef %a) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x1x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.float64x1x2_t, align 8
-// CHECK: [[VLD2:%.*]] = call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld2r.v1f64.p0(ptr %a)
-// CHECK: store { <1 x double>, <1 x double> } [[VLD2]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.float64x1x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.float64x1x2_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.float64x1x2_t @test_vld2_dup_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT64X1X2_T:%.*]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT64X1X2_T]], align 8
+// CHECK-NEXT: [[VLD2:%.*]] = call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld2r.v1f64.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD2_ELT:%.*]] = extractvalue { <1 x double>, <1 x double> } [[VLD2]], 0
+// CHECK-NEXT: store <1 x double> [[VLD2_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD2_ELT2:%.*]] = extractvalue { <1 x double>, <1 x double> } [[VLD2]], 1
+// CHECK-NEXT: store <1 x double> [[VLD2_ELT2]], ptr [[__RET_REPACK1]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(16) [[__RET]], i64 16, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <1 x double>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <1 x double>] poison, <1 x double> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT3:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK4:%.*]] = load <1 x double>, ptr [[DOTUNPACK_ELT3]], align 8
+// CHECK-NEXT: [[DOTUNPACK5:%.*]] = insertvalue [2 x <1 x double>] [[TMP0]], <1 x double> [[DOTUNPACK_UNPACK4]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_FLOAT64X1X2_T]] poison, [2 x <1 x double>] [[DOTUNPACK5]], 0
+// CHECK-NEXT: ret [[STRUCT_FLOAT64X1X2_T]] [[TMP1]]
+//
float64x1x2_t test_vld2_dup_f64(float64_t *a) {
return vld2_dup_f64(a);
}
-// CHECK-LABEL: define{{.*}} %struct.poly64x1x2_t @test_vld2_dup_p64(ptr noundef %a) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x1x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.poly64x1x2_t, align 8
-// CHECK: [[VLD2:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2r.v1i64.p0(ptr %a)
-// CHECK: store { <1 x i64>, <1 x i64> } [[VLD2]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.poly64x1x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.poly64x1x2_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.poly64x1x2_t @test_vld2_dup_p64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY64X1X2_T:%.*]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY64X1X2_T]], align 8
+// CHECK-NEXT: [[VLD2:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2r.v1i64.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD2_ELT:%.*]] = extractvalue { <1 x i64>, <1 x i64> } [[VLD2]], 0
+// CHECK-NEXT: store <1 x i64> [[VLD2_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD2_ELT2:%.*]] = extractvalue { <1 x i64>, <1 x i64> } [[VLD2]], 1
+// CHECK-NEXT: store <1 x i64> [[VLD2_ELT2]], ptr [[__RET_REPACK1]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(16) [[__RET]], i64 16, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <1 x i64>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <1 x i64>] poison, <1 x i64> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT3:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK4:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT3]], align 8
+// CHECK-NEXT: [[DOTUNPACK5:%.*]] = insertvalue [2 x <1 x i64>] [[TMP0]], <1 x i64> [[DOTUNPACK_UNPACK4]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_POLY64X1X2_T]] poison, [2 x <1 x i64>] [[DOTUNPACK5]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY64X1X2_T]] [[TMP1]]
+//
poly64x1x2_t test_vld2_dup_p64(poly64_t *a) {
return vld2_dup_p64(a);
}
-// CHECK-LABEL: define{{.*}} %struct.uint64x2x3_t @test_vld3q_dup_u64(ptr noundef %a) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x2x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.uint64x2x3_t, align 16
-// CHECK: [[VLD3:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3r.v2i64.p0(ptr %a)
-// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.uint64x2x3_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.uint64x2x3_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.uint64x2x3_t @test_vld3q_dup_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT64X2X3_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT64X2X3_T]], align 16
+// CHECK-NEXT: [[VLD3:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3r.v2i64.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD3_ELT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], 0
+// CHECK-NEXT: store <2 x i64> [[VLD3_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_ELT2:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], 1
+// CHECK-NEXT: store <2 x i64> [[VLD3_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD3_ELT4:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], 2
+// CHECK-NEXT: store <2 x i64> [[VLD3_ELT4]], ptr [[__RET_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(48) [[__RET]], i64 48, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [3 x <2 x i64>] poison, <2 x i64> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT5]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [3 x <2 x i64>] [[TMP0]], <2 x i64> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT7]], align 16
+// CHECK-NEXT: [[DOTUNPACK9:%.*]] = insertvalue [3 x <2 x i64>] [[TMP1]], <2 x i64> [[DOTUNPACK_UNPACK8]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[STRUCT_UINT64X2X3_T]] poison, [3 x <2 x i64>] [[DOTUNPACK9]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT64X2X3_T]] [[TMP2]]
+//
uint64x2x3_t test_vld3q_dup_u64(uint64_t *a) {
return vld3q_dup_u64(a);
}
-// CHECK-LABEL: define{{.*}} %struct.int64x2x3_t @test_vld3q_dup_s64(ptr noundef %a) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x2x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int64x2x3_t, align 16
-// CHECK: [[VLD3:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3r.v2i64.p0(ptr %a)
-// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.int64x2x3_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.int64x2x3_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.int64x2x3_t @test_vld3q_dup_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT64X2X3_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT64X2X3_T]], align 16
+// CHECK-NEXT: [[VLD3:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3r.v2i64.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD3_ELT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], 0
+// CHECK-NEXT: store <2 x i64> [[VLD3_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_ELT2:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], 1
+// CHECK-NEXT: store <2 x i64> [[VLD3_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD3_ELT4:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], 2
+// CHECK-NEXT: store <2 x i64> [[VLD3_ELT4]], ptr [[__RET_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(48) [[__RET]], i64 48, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [3 x <2 x i64>] poison, <2 x i64> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT5]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [3 x <2 x i64>] [[TMP0]], <2 x i64> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT7]], align 16
+// CHECK-NEXT: [[DOTUNPACK9:%.*]] = insertvalue [3 x <2 x i64>] [[TMP1]], <2 x i64> [[DOTUNPACK_UNPACK8]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[STRUCT_INT64X2X3_T]] poison, [3 x <2 x i64>] [[DOTUNPACK9]], 0
+// CHECK-NEXT: ret [[STRUCT_INT64X2X3_T]] [[TMP2]]
+//
int64x2x3_t test_vld3q_dup_s64(int64_t *a) {
return vld3q_dup_s64(a);
}
-// CHECK-LABEL: define{{.*}} %struct.float64x2x3_t @test_vld3q_dup_f64(ptr noundef %a) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x2x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.float64x2x3_t, align 16
-// CHECK: [[VLD3:%.*]] = call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld3r.v2f64.p0(ptr %a)
-// CHECK: store { <2 x double>, <2 x double>, <2 x double> } [[VLD3]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.float64x2x3_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.float64x2x3_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.float64x2x3_t @test_vld3q_dup_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT64X2X3_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT64X2X3_T]], align 16
+// CHECK-NEXT: [[VLD3:%.*]] = call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld3r.v2f64.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD3_ELT:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double> } [[VLD3]], 0
+// CHECK-NEXT: store <2 x double> [[VLD3_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_ELT2:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double> } [[VLD3]], 1
+// CHECK-NEXT: store <2 x double> [[VLD3_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD3_ELT4:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double> } [[VLD3]], 2
+// CHECK-NEXT: store <2 x double> [[VLD3_ELT4]], ptr [[__RET_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(48) [[__RET]], i64 48, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x double>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [3 x <2 x double>] poison, <2 x double> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <2 x double>, ptr [[DOTUNPACK_ELT5]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [3 x <2 x double>] [[TMP0]], <2 x double> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <2 x double>, ptr [[DOTUNPACK_ELT7]], align 16
+// CHECK-NEXT: [[DOTUNPACK9:%.*]] = insertvalue [3 x <2 x double>] [[TMP1]], <2 x double> [[DOTUNPACK_UNPACK8]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[STRUCT_FLOAT64X2X3_T]] poison, [3 x <2 x double>] [[DOTUNPACK9]], 0
+// CHECK-NEXT: ret [[STRUCT_FLOAT64X2X3_T]] [[TMP2]]
+//
float64x2x3_t test_vld3q_dup_f64(float64_t *a) {
return vld3q_dup_f64(a);
}
-// CHECK-LABEL: define{{.*}} %struct.poly64x2x3_t @test_vld3q_dup_p64(ptr noundef %a) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x2x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.poly64x2x3_t, align 16
-// CHECK: [[VLD3:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3r.v2i64.p0(ptr %a)
-// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.poly64x2x3_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.poly64x2x3_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.poly64x2x3_t @test_vld3q_dup_p64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY64X2X3_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY64X2X3_T]], align 16
+// CHECK-NEXT: [[VLD3:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3r.v2i64.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD3_ELT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], 0
+// CHECK-NEXT: store <2 x i64> [[VLD3_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_ELT2:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], 1
+// CHECK-NEXT: store <2 x i64> [[VLD3_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD3_ELT4:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], 2
+// CHECK-NEXT: store <2 x i64> [[VLD3_ELT4]], ptr [[__RET_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(48) [[__RET]], i64 48, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [3 x <2 x i64>] poison, <2 x i64> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT5]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [3 x <2 x i64>] [[TMP0]], <2 x i64> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT7]], align 16
+// CHECK-NEXT: [[DOTUNPACK9:%.*]] = insertvalue [3 x <2 x i64>] [[TMP1]], <2 x i64> [[DOTUNPACK_UNPACK8]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[STRUCT_POLY64X2X3_T]] poison, [3 x <2 x i64>] [[DOTUNPACK9]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY64X2X3_T]] [[TMP2]]
+//
poly64x2x3_t test_vld3q_dup_p64(poly64_t *a) {
return vld3q_dup_p64(a);
}
-// CHECK-LABEL: define{{.*}} %struct.float64x1x3_t @test_vld3_dup_f64(ptr noundef %a) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x1x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.float64x1x3_t, align 8
-// CHECK: [[VLD3:%.*]] = call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld3r.v1f64.p0(ptr %a)
-// CHECK: store { <1 x double>, <1 x double>, <1 x double> } [[VLD3]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.float64x1x3_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.float64x1x3_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.float64x1x3_t @test_vld3_dup_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT64X1X3_T:%.*]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT64X1X3_T]], align 8
+// CHECK-NEXT: [[VLD3:%.*]] = call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld3r.v1f64.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD3_ELT:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double> } [[VLD3]], 0
+// CHECK-NEXT: store <1 x double> [[VLD3_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD3_ELT2:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double> } [[VLD3]], 1
+// CHECK-NEXT: store <1 x double> [[VLD3_ELT2]], ptr [[__RET_REPACK1]], align 8
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_ELT4:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double> } [[VLD3]], 2
+// CHECK-NEXT: store <1 x double> [[VLD3_ELT4]], ptr [[__RET_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(24) [[__RET]], i64 24, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <1 x double>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [3 x <1 x double>] poison, <1 x double> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <1 x double>, ptr [[DOTUNPACK_ELT5]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [3 x <1 x double>] [[TMP0]], <1 x double> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <1 x double>, ptr [[DOTUNPACK_ELT7]], align 8
+// CHECK-NEXT: [[DOTUNPACK9:%.*]] = insertvalue [3 x <1 x double>] [[TMP1]], <1 x double> [[DOTUNPACK_UNPACK8]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[STRUCT_FLOAT64X1X3_T]] poison, [3 x <1 x double>] [[DOTUNPACK9]], 0
+// CHECK-NEXT: ret [[STRUCT_FLOAT64X1X3_T]] [[TMP2]]
+//
float64x1x3_t test_vld3_dup_f64(float64_t *a) {
return vld3_dup_f64(a);
}
-// CHECK-LABEL: define{{.*}} %struct.poly64x1x3_t @test_vld3_dup_p64(ptr noundef %a) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x1x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.poly64x1x3_t, align 8
-// CHECK: [[VLD3:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3r.v1i64.p0(ptr %a)
-// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.poly64x1x3_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.poly64x1x3_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.poly64x1x3_t @test_vld3_dup_p64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY64X1X3_T:%.*]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY64X1X3_T]], align 8
+// CHECK-NEXT: [[VLD3:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3r.v1i64.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD3_ELT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], 0
+// CHECK-NEXT: store <1 x i64> [[VLD3_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD3_ELT2:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], 1
+// CHECK-NEXT: store <1 x i64> [[VLD3_ELT2]], ptr [[__RET_REPACK1]], align 8
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_ELT4:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], 2
+// CHECK-NEXT: store <1 x i64> [[VLD3_ELT4]], ptr [[__RET_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(24) [[__RET]], i64 24, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <1 x i64>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [3 x <1 x i64>] poison, <1 x i64> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT5]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [3 x <1 x i64>] [[TMP0]], <1 x i64> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT7]], align 8
+// CHECK-NEXT: [[DOTUNPACK9:%.*]] = insertvalue [3 x <1 x i64>] [[TMP1]], <1 x i64> [[DOTUNPACK_UNPACK8]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[STRUCT_POLY64X1X3_T]] poison, [3 x <1 x i64>] [[DOTUNPACK9]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY64X1X3_T]] [[TMP2]]
+//
poly64x1x3_t test_vld3_dup_p64(poly64_t *a) {
return vld3_dup_p64(a);
}
-// CHECK-LABEL: define{{.*}} %struct.uint64x2x4_t @test_vld4q_dup_u64(ptr noundef %a) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x2x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.uint64x2x4_t, align 16
-// CHECK: [[VLD4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4r.v2i64.p0(ptr %a)
-// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.uint64x2x4_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.uint64x2x4_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.uint64x2x4_t @test_vld4q_dup_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT64X2X4_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT64X2X4_T]], align 16
+// CHECK-NEXT: [[VLD4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4r.v2i64.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD4_ELT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], 0
+// CHECK-NEXT: store <2 x i64> [[VLD4_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_ELT2:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], 1
+// CHECK-NEXT: store <2 x i64> [[VLD4_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD4_ELT4:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], 2
+// CHECK-NEXT: store <2 x i64> [[VLD4_ELT4]], ptr [[__RET_REPACK3]], align 16
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 48
+// CHECK-NEXT: [[VLD4_ELT6:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], 3
+// CHECK-NEXT: store <2 x i64> [[VLD4_ELT6]], ptr [[__RET_REPACK5]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(64) [[__RET]], i64 64, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [4 x <2 x i64>] poison, <2 x i64> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT7]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [4 x <2 x i64>] [[TMP0]], <2 x i64> [[DOTUNPACK_UNPACK8]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT9]], align 16
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [4 x <2 x i64>] [[TMP1]], <2 x i64> [[DOTUNPACK_UNPACK10]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 48
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT11]], align 16
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [4 x <2 x i64>] [[TMP2]], <2 x i64> [[DOTUNPACK_UNPACK12]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_UINT64X2X4_T]] poison, [4 x <2 x i64>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT64X2X4_T]] [[TMP3]]
+//
uint64x2x4_t test_vld4q_dup_u64(uint64_t *a) {
return vld4q_dup_u64(a);
}
-// CHECK-LABEL: define{{.*}} %struct.int64x2x4_t @test_vld4q_dup_s64(ptr noundef %a) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x2x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int64x2x4_t, align 16
-// CHECK: [[VLD4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4r.v2i64.p0(ptr %a)
-// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.int64x2x4_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.int64x2x4_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.int64x2x4_t @test_vld4q_dup_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT64X2X4_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT64X2X4_T]], align 16
+// CHECK-NEXT: [[VLD4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4r.v2i64.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD4_ELT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], 0
+// CHECK-NEXT: store <2 x i64> [[VLD4_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_ELT2:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], 1
+// CHECK-NEXT: store <2 x i64> [[VLD4_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD4_ELT4:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], 2
+// CHECK-NEXT: store <2 x i64> [[VLD4_ELT4]], ptr [[__RET_REPACK3]], align 16
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 48
+// CHECK-NEXT: [[VLD4_ELT6:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], 3
+// CHECK-NEXT: store <2 x i64> [[VLD4_ELT6]], ptr [[__RET_REPACK5]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(64) [[__RET]], i64 64, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [4 x <2 x i64>] poison, <2 x i64> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT7]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [4 x <2 x i64>] [[TMP0]], <2 x i64> [[DOTUNPACK_UNPACK8]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT9]], align 16
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [4 x <2 x i64>] [[TMP1]], <2 x i64> [[DOTUNPACK_UNPACK10]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 48
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT11]], align 16
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [4 x <2 x i64>] [[TMP2]], <2 x i64> [[DOTUNPACK_UNPACK12]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_INT64X2X4_T]] poison, [4 x <2 x i64>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_INT64X2X4_T]] [[TMP3]]
+//
int64x2x4_t test_vld4q_dup_s64(int64_t *a) {
return vld4q_dup_s64(a);
}
-// CHECK-LABEL: define{{.*}} %struct.float64x2x4_t @test_vld4q_dup_f64(ptr noundef %a) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x2x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.float64x2x4_t, align 16
-// CHECK: [[VLD4:%.*]] = call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld4r.v2f64.p0(ptr %a)
-// CHECK: store { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD4]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.float64x2x4_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.float64x2x4_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.float64x2x4_t @test_vld4q_dup_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT64X2X4_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT64X2X4_T]], align 16
+// CHECK-NEXT: [[VLD4:%.*]] = call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld4r.v2f64.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD4_ELT:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD4]], 0
+// CHECK-NEXT: store <2 x double> [[VLD4_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_ELT2:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD4]], 1
+// CHECK-NEXT: store <2 x double> [[VLD4_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD4_ELT4:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD4]], 2
+// CHECK-NEXT: store <2 x double> [[VLD4_ELT4]], ptr [[__RET_REPACK3]], align 16
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 48
+// CHECK-NEXT: [[VLD4_ELT6:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD4]], 3
+// CHECK-NEXT: store <2 x double> [[VLD4_ELT6]], ptr [[__RET_REPACK5]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(64) [[__RET]], i64 64, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x double>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [4 x <2 x double>] poison, <2 x double> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <2 x double>, ptr [[DOTUNPACK_ELT7]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [4 x <2 x double>] [[TMP0]], <2 x double> [[DOTUNPACK_UNPACK8]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <2 x double>, ptr [[DOTUNPACK_ELT9]], align 16
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [4 x <2 x double>] [[TMP1]], <2 x double> [[DOTUNPACK_UNPACK10]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 48
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <2 x double>, ptr [[DOTUNPACK_ELT11]], align 16
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [4 x <2 x double>] [[TMP2]], <2 x double> [[DOTUNPACK_UNPACK12]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_FLOAT64X2X4_T]] poison, [4 x <2 x double>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_FLOAT64X2X4_T]] [[TMP3]]
+//
float64x2x4_t test_vld4q_dup_f64(float64_t *a) {
return vld4q_dup_f64(a);
}
-// CHECK-LABEL: define{{.*}} %struct.poly64x2x4_t @test_vld4q_dup_p64(ptr noundef %a) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x2x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.poly64x2x4_t, align 16
-// CHECK: [[VLD4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4r.v2i64.p0(ptr %a)
-// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.poly64x2x4_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.poly64x2x4_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.poly64x2x4_t @test_vld4q_dup_p64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY64X2X4_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY64X2X4_T]], align 16
+// CHECK-NEXT: [[VLD4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4r.v2i64.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD4_ELT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], 0
+// CHECK-NEXT: store <2 x i64> [[VLD4_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_ELT2:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], 1
+// CHECK-NEXT: store <2 x i64> [[VLD4_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD4_ELT4:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], 2
+// CHECK-NEXT: store <2 x i64> [[VLD4_ELT4]], ptr [[__RET_REPACK3]], align 16
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 48
+// CHECK-NEXT: [[VLD4_ELT6:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], 3
+// CHECK-NEXT: store <2 x i64> [[VLD4_ELT6]], ptr [[__RET_REPACK5]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(64) [[__RET]], i64 64, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [4 x <2 x i64>] poison, <2 x i64> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT7]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [4 x <2 x i64>] [[TMP0]], <2 x i64> [[DOTUNPACK_UNPACK8]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT9]], align 16
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [4 x <2 x i64>] [[TMP1]], <2 x i64> [[DOTUNPACK_UNPACK10]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 48
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT11]], align 16
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [4 x <2 x i64>] [[TMP2]], <2 x i64> [[DOTUNPACK_UNPACK12]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_POLY64X2X4_T]] poison, [4 x <2 x i64>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY64X2X4_T]] [[TMP3]]
+//
poly64x2x4_t test_vld4q_dup_p64(poly64_t *a) {
return vld4q_dup_p64(a);
}
-// CHECK-LABEL: define{{.*}} %struct.float64x1x4_t @test_vld4_dup_f64(ptr noundef %a) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x1x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.float64x1x4_t, align 8
-// CHECK: [[VLD4:%.*]] = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld4r.v1f64.p0(ptr %a)
-// CHECK: store { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD4]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.float64x1x4_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.float64x1x4_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.float64x1x4_t @test_vld4_dup_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT64X1X4_T:%.*]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT64X1X4_T]], align 8
+// CHECK-NEXT: [[VLD4:%.*]] = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld4r.v1f64.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD4_ELT:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD4]], 0
+// CHECK-NEXT: store <1 x double> [[VLD4_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD4_ELT2:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD4]], 1
+// CHECK-NEXT: store <1 x double> [[VLD4_ELT2]], ptr [[__RET_REPACK1]], align 8
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_ELT4:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD4]], 2
+// CHECK-NEXT: store <1 x double> [[VLD4_ELT4]], ptr [[__RET_REPACK3]], align 8
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 24
+// CHECK-NEXT: [[VLD4_ELT6:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD4]], 3
+// CHECK-NEXT: store <1 x double> [[VLD4_ELT6]], ptr [[__RET_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <1 x double>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [4 x <1 x double>] poison, <1 x double> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <1 x double>, ptr [[DOTUNPACK_ELT7]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [4 x <1 x double>] [[TMP0]], <1 x double> [[DOTUNPACK_UNPACK8]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <1 x double>, ptr [[DOTUNPACK_ELT9]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [4 x <1 x double>] [[TMP1]], <1 x double> [[DOTUNPACK_UNPACK10]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 24
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <1 x double>, ptr [[DOTUNPACK_ELT11]], align 8
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [4 x <1 x double>] [[TMP2]], <1 x double> [[DOTUNPACK_UNPACK12]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_FLOAT64X1X4_T]] poison, [4 x <1 x double>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_FLOAT64X1X4_T]] [[TMP3]]
+//
float64x1x4_t test_vld4_dup_f64(float64_t *a) {
return vld4_dup_f64(a);
}
-// CHECK-LABEL: define{{.*}} %struct.poly64x1x4_t @test_vld4_dup_p64(ptr noundef %a) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x1x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.poly64x1x4_t, align 8
-// CHECK: [[VLD4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4r.v1i64.p0(ptr %a)
-// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.poly64x1x4_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.poly64x1x4_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.poly64x1x4_t @test_vld4_dup_p64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY64X1X4_T:%.*]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY64X1X4_T]], align 8
+// CHECK-NEXT: [[VLD4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4r.v1i64.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD4_ELT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], 0
+// CHECK-NEXT: store <1 x i64> [[VLD4_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD4_ELT2:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], 1
+// CHECK-NEXT: store <1 x i64> [[VLD4_ELT2]], ptr [[__RET_REPACK1]], align 8
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_ELT4:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], 2
+// CHECK-NEXT: store <1 x i64> [[VLD4_ELT4]], ptr [[__RET_REPACK3]], align 8
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 24
+// CHECK-NEXT: [[VLD4_ELT6:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], 3
+// CHECK-NEXT: store <1 x i64> [[VLD4_ELT6]], ptr [[__RET_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <1 x i64>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [4 x <1 x i64>] poison, <1 x i64> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT7]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [4 x <1 x i64>] [[TMP0]], <1 x i64> [[DOTUNPACK_UNPACK8]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT9]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [4 x <1 x i64>] [[TMP1]], <1 x i64> [[DOTUNPACK_UNPACK10]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 24
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT11]], align 8
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [4 x <1 x i64>] [[TMP2]], <1 x i64> [[DOTUNPACK_UNPACK12]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_POLY64X1X4_T]] poison, [4 x <1 x i64>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY64X1X4_T]] [[TMP3]]
+//
poly64x1x4_t test_vld4_dup_p64(poly64_t *a) {
return vld4_dup_p64(a);
}
-// CHECK-LABEL: define{{.*}} <16 x i8> @test_vld1q_lane_u8(ptr noundef %a, <16 x i8> noundef %b) #0 {
-// CHECK: [[TMP0:%.*]] = load i8, ptr %a
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <16 x i8> %b, i8 [[TMP0]], i32 15
-// CHECK: ret <16 x i8> [[VLD1_LANE]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vld1q_lane_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <16 x i8> [[B]], i8 [[TMP0]], i64 15
+// CHECK-NEXT: ret <16 x i8> [[VLD1_LANE]]
+//
uint8x16_t test_vld1q_lane_u8(uint8_t *a, uint8x16_t b) {
return vld1q_lane_u8(a, b, 15);
}
-// CHECK-LABEL: define{{.*}} <8 x i16> @test_vld1q_lane_u16(ptr noundef %a, <8 x i16> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[TMP4:%.*]] = load i16, ptr %a
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7
-// CHECK: ret <8 x i16> [[VLD1_LANE]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vld1q_lane_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A]], align 2
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[B]], i16 [[TMP0]], i64 7
+// CHECK-NEXT: ret <8 x i16> [[VLD1_LANE]]
+//
uint16x8_t test_vld1q_lane_u16(uint16_t *a, uint16x8_t b) {
return vld1q_lane_u16(a, b, 7);
}
-// CHECK-LABEL: define{{.*}} <4 x i32> @test_vld1q_lane_u32(ptr noundef %a, <4 x i32> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[TMP4:%.*]] = load i32, ptr %a
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[TMP4]], i32 3
-// CHECK: ret <4 x i32> [[VLD1_LANE]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vld1q_lane_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <4 x i32> [[B]], i32 [[TMP0]], i64 3
+// CHECK-NEXT: ret <4 x i32> [[VLD1_LANE]]
+//
uint32x4_t test_vld1q_lane_u32(uint32_t *a, uint32x4_t b) {
return vld1q_lane_u32(a, b, 3);
}
-// CHECK-LABEL: define{{.*}} <2 x i64> @test_vld1q_lane_u64(ptr noundef %a, <2 x i64> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[TMP4:%.*]] = load i64, ptr %a
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[TMP4]], i32 1
-// CHECK: ret <2 x i64> [[VLD1_LANE]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vld1q_lane_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A]], align 8
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <2 x i64> [[B]], i64 [[TMP0]], i64 1
+// CHECK-NEXT: ret <2 x i64> [[VLD1_LANE]]
+//
uint64x2_t test_vld1q_lane_u64(uint64_t *a, uint64x2_t b) {
return vld1q_lane_u64(a, b, 1);
}
-// CHECK-LABEL: define{{.*}} <16 x i8> @test_vld1q_lane_s8(ptr noundef %a, <16 x i8> noundef %b) #0 {
-// CHECK: [[TMP0:%.*]] = load i8, ptr %a
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <16 x i8> %b, i8 [[TMP0]], i32 15
-// CHECK: ret <16 x i8> [[VLD1_LANE]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vld1q_lane_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <16 x i8> [[B]], i8 [[TMP0]], i64 15
+// CHECK-NEXT: ret <16 x i8> [[VLD1_LANE]]
+//
int8x16_t test_vld1q_lane_s8(int8_t *a, int8x16_t b) {
return vld1q_lane_s8(a, b, 15);
}
-// CHECK-LABEL: define{{.*}} <8 x i16> @test_vld1q_lane_s16(ptr noundef %a, <8 x i16> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[TMP4:%.*]] = load i16, ptr %a
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7
-// CHECK: ret <8 x i16> [[VLD1_LANE]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vld1q_lane_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A]], align 2
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[B]], i16 [[TMP0]], i64 7
+// CHECK-NEXT: ret <8 x i16> [[VLD1_LANE]]
+//
int16x8_t test_vld1q_lane_s16(int16_t *a, int16x8_t b) {
return vld1q_lane_s16(a, b, 7);
}
-// CHECK-LABEL: define{{.*}} <4 x i32> @test_vld1q_lane_s32(ptr noundef %a, <4 x i32> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[TMP4:%.*]] = load i32, ptr %a
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[TMP4]], i32 3
-// CHECK: ret <4 x i32> [[VLD1_LANE]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vld1q_lane_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <4 x i32> [[B]], i32 [[TMP0]], i64 3
+// CHECK-NEXT: ret <4 x i32> [[VLD1_LANE]]
+//
int32x4_t test_vld1q_lane_s32(int32_t *a, int32x4_t b) {
return vld1q_lane_s32(a, b, 3);
}
-// CHECK-LABEL: define{{.*}} <2 x i64> @test_vld1q_lane_s64(ptr noundef %a, <2 x i64> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[TMP4:%.*]] = load i64, ptr %a
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[TMP4]], i32 1
-// CHECK: ret <2 x i64> [[VLD1_LANE]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vld1q_lane_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A]], align 8
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <2 x i64> [[B]], i64 [[TMP0]], i64 1
+// CHECK-NEXT: ret <2 x i64> [[VLD1_LANE]]
+//
int64x2_t test_vld1q_lane_s64(int64_t *a, int64x2_t b) {
return vld1q_lane_s64(a, b, 1);
}
-// CHECK-LABEL: define{{.*}} <8 x half> @test_vld1q_lane_f16(ptr noundef %a, <8 x half> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
-// CHECK: [[TMP4:%.*]] = load half, ptr %a
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x half> [[TMP2]], half [[TMP4]], i32 7
-// CHECK: ret <8 x half> [[VLD1_LANE]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vld1q_lane_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load half, ptr [[A]], align 2
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <8 x half> [[B]], half [[TMP0]], i64 7
+// CHECK-NEXT: ret <8 x half> [[VLD1_LANE]]
+//
float16x8_t test_vld1q_lane_f16(float16_t *a, float16x8_t b) {
return vld1q_lane_f16(a, b, 7);
}
-// CHECK-LABEL: define{{.*}} <4 x float> @test_vld1q_lane_f32(ptr noundef %a, <4 x float> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
-// CHECK: [[TMP4:%.*]] = load float, ptr %a
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x float> [[TMP2]], float [[TMP4]], i32 3
-// CHECK: ret <4 x float> [[VLD1_LANE]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vld1q_lane_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[A]], align 4
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <4 x float> [[B]], float [[TMP0]], i64 3
+// CHECK-NEXT: ret <4 x float> [[VLD1_LANE]]
+//
float32x4_t test_vld1q_lane_f32(float32_t *a, float32x4_t b) {
return vld1q_lane_f32(a, b, 3);
}
-// CHECK-LABEL: define{{.*}} <2 x double> @test_vld1q_lane_f64(ptr noundef %a, <2 x double> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
-// CHECK: [[TMP4:%.*]] = load double, ptr %a
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <2 x double> [[TMP2]], double [[TMP4]], i32 1
-// CHECK: ret <2 x double> [[VLD1_LANE]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vld1q_lane_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load double, ptr [[A]], align 8
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <2 x double> [[B]], double [[TMP0]], i64 1
+// CHECK-NEXT: ret <2 x double> [[VLD1_LANE]]
+//
float64x2_t test_vld1q_lane_f64(float64_t *a, float64x2_t b) {
return vld1q_lane_f64(a, b, 1);
}
-// CHECK-LABEL: define{{.*}} <16 x i8> @test_vld1q_lane_p8(ptr noundef %a, <16 x i8> noundef %b) #0 {
-// CHECK: [[TMP0:%.*]] = load i8, ptr %a
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <16 x i8> %b, i8 [[TMP0]], i32 15
-// CHECK: ret <16 x i8> [[VLD1_LANE]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vld1q_lane_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <16 x i8> [[B]], i8 [[TMP0]], i64 15
+// CHECK-NEXT: ret <16 x i8> [[VLD1_LANE]]
+//
poly8x16_t test_vld1q_lane_p8(poly8_t *a, poly8x16_t b) {
return vld1q_lane_p8(a, b, 15);
}
-// CHECK-LABEL: define{{.*}} <8 x i16> @test_vld1q_lane_p16(ptr noundef %a, <8 x i16> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[TMP4:%.*]] = load i16, ptr %a
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7
-// CHECK: ret <8 x i16> [[VLD1_LANE]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vld1q_lane_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A]], align 2
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[B]], i16 [[TMP0]], i64 7
+// CHECK-NEXT: ret <8 x i16> [[VLD1_LANE]]
+//
poly16x8_t test_vld1q_lane_p16(poly16_t *a, poly16x8_t b) {
return vld1q_lane_p16(a, b, 7);
}
-// CHECK-LABEL: define{{.*}} <2 x i64> @test_vld1q_lane_p64(ptr noundef %a, <2 x i64> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[TMP4:%.*]] = load i64, ptr %a
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[TMP4]], i32 1
-// CHECK: ret <2 x i64> [[VLD1_LANE]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vld1q_lane_p64(
+// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A]], align 8
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <2 x i64> [[B]], i64 [[TMP0]], i64 1
+// CHECK-NEXT: ret <2 x i64> [[VLD1_LANE]]
+//
poly64x2_t test_vld1q_lane_p64(poly64_t *a, poly64x2_t b) {
return vld1q_lane_p64(a, b, 1);
}
-// CHECK-LABEL: define{{.*}} <8 x i8> @test_vld1_lane_u8(ptr noundef %a, <8 x i8> noundef %b) #0 {
-// CHECK: [[TMP0:%.*]] = load i8, ptr %a
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i8> %b, i8 [[TMP0]], i32 7
-// CHECK: ret <8 x i8> [[VLD1_LANE]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vld1_lane_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <8 x i8> [[B]], i8 [[TMP0]], i64 7
+// CHECK-NEXT: ret <8 x i8> [[VLD1_LANE]]
+//
uint8x8_t test_vld1_lane_u8(uint8_t *a, uint8x8_t b) {
return vld1_lane_u8(a, b, 7);
}
-// CHECK-LABEL: define{{.*}} <4 x i16> @test_vld1_lane_u16(ptr noundef %a, <4 x i16> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[TMP4:%.*]] = load i16, ptr %a
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3
-// CHECK: ret <4 x i16> [[VLD1_LANE]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vld1_lane_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A]], align 2
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[B]], i16 [[TMP0]], i64 3
+// CHECK-NEXT: ret <4 x i16> [[VLD1_LANE]]
+//
uint16x4_t test_vld1_lane_u16(uint16_t *a, uint16x4_t b) {
return vld1_lane_u16(a, b, 3);
}
-// CHECK-LABEL: define{{.*}} <2 x i32> @test_vld1_lane_u32(ptr noundef %a, <2 x i32> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[TMP4:%.*]] = load i32, ptr %a
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[TMP4]], i32 1
-// CHECK: ret <2 x i32> [[VLD1_LANE]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vld1_lane_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <2 x i32> [[B]], i32 [[TMP0]], i64 1
+// CHECK-NEXT: ret <2 x i32> [[VLD1_LANE]]
+//
uint32x2_t test_vld1_lane_u32(uint32_t *a, uint32x2_t b) {
return vld1_lane_u32(a, b, 1);
}
-// CHECK-LABEL: define{{.*}} <1 x i64> @test_vld1_lane_u64(ptr noundef %a, <1 x i64> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[TMP4:%.*]] = load i64, ptr %a
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <1 x i64> [[TMP2]], i64 [[TMP4]], i32 0
-// CHECK: ret <1 x i64> [[VLD1_LANE]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vld1_lane_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A]], align 8
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <1 x i64> poison, i64 [[TMP0]], i64 0
+// CHECK-NEXT: ret <1 x i64> [[VLD1_LANE]]
+//
uint64x1_t test_vld1_lane_u64(uint64_t *a, uint64x1_t b) {
return vld1_lane_u64(a, b, 0);
}
-// CHECK-LABEL: define{{.*}} <8 x i8> @test_vld1_lane_s8(ptr noundef %a, <8 x i8> noundef %b) #0 {
-// CHECK: [[TMP0:%.*]] = load i8, ptr %a
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i8> %b, i8 [[TMP0]], i32 7
-// CHECK: ret <8 x i8> [[VLD1_LANE]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vld1_lane_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <8 x i8> [[B]], i8 [[TMP0]], i64 7
+// CHECK-NEXT: ret <8 x i8> [[VLD1_LANE]]
+//
int8x8_t test_vld1_lane_s8(int8_t *a, int8x8_t b) {
return vld1_lane_s8(a, b, 7);
}
-// CHECK-LABEL: define{{.*}} <4 x i16> @test_vld1_lane_s16(ptr noundef %a, <4 x i16> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[TMP4:%.*]] = load i16, ptr %a
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3
-// CHECK: ret <4 x i16> [[VLD1_LANE]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vld1_lane_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A]], align 2
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[B]], i16 [[TMP0]], i64 3
+// CHECK-NEXT: ret <4 x i16> [[VLD1_LANE]]
+//
int16x4_t test_vld1_lane_s16(int16_t *a, int16x4_t b) {
return vld1_lane_s16(a, b, 3);
}
-// CHECK-LABEL: define{{.*}} <2 x i32> @test_vld1_lane_s32(ptr noundef %a, <2 x i32> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[TMP4:%.*]] = load i32, ptr %a
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[TMP4]], i32 1
-// CHECK: ret <2 x i32> [[VLD1_LANE]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vld1_lane_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <2 x i32> [[B]], i32 [[TMP0]], i64 1
+// CHECK-NEXT: ret <2 x i32> [[VLD1_LANE]]
+//
int32x2_t test_vld1_lane_s32(int32_t *a, int32x2_t b) {
return vld1_lane_s32(a, b, 1);
}
-// CHECK-LABEL: define{{.*}} <1 x i64> @test_vld1_lane_s64(ptr noundef %a, <1 x i64> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[TMP4:%.*]] = load i64, ptr %a
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <1 x i64> [[TMP2]], i64 [[TMP4]], i32 0
-// CHECK: ret <1 x i64> [[VLD1_LANE]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vld1_lane_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A]], align 8
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <1 x i64> poison, i64 [[TMP0]], i64 0
+// CHECK-NEXT: ret <1 x i64> [[VLD1_LANE]]
+//
int64x1_t test_vld1_lane_s64(int64_t *a, int64x1_t b) {
return vld1_lane_s64(a, b, 0);
}
-// CHECK-LABEL: define{{.*}} <4 x half> @test_vld1_lane_f16(ptr noundef %a, <4 x half> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
-// CHECK: [[TMP4:%.*]] = load half, ptr %a
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x half> [[TMP2]], half [[TMP4]], i32 3
-// CHECK: ret <4 x half> [[VLD1_LANE]]
+// CHECK-LABEL: define dso_local <4 x half> @test_vld1_lane_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load half, ptr [[A]], align 2
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <4 x half> [[B]], half [[TMP0]], i64 3
+// CHECK-NEXT: ret <4 x half> [[VLD1_LANE]]
+//
float16x4_t test_vld1_lane_f16(float16_t *a, float16x4_t b) {
return vld1_lane_f16(a, b, 3);
}
-// CHECK-LABEL: define{{.*}} <2 x float> @test_vld1_lane_f32(ptr noundef %a, <2 x float> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
-// CHECK: [[TMP4:%.*]] = load float, ptr %a
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP4]], i32 1
-// CHECK: ret <2 x float> [[VLD1_LANE]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vld1_lane_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[A]], align 4
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <2 x float> [[B]], float [[TMP0]], i64 1
+// CHECK-NEXT: ret <2 x float> [[VLD1_LANE]]
+//
float32x2_t test_vld1_lane_f32(float32_t *a, float32x2_t b) {
return vld1_lane_f32(a, b, 1);
}
-// CHECK-LABEL: define{{.*}} <1 x double> @test_vld1_lane_f64(ptr noundef %a, <1 x double> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
-// CHECK: [[TMP4:%.*]] = load double, ptr %a
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <1 x double> [[TMP2]], double [[TMP4]], i32 0
-// CHECK: ret <1 x double> [[VLD1_LANE]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vld1_lane_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load double, ptr [[A]], align 8
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <1 x double> poison, double [[TMP0]], i64 0
+// CHECK-NEXT: ret <1 x double> [[VLD1_LANE]]
+//
float64x1_t test_vld1_lane_f64(float64_t *a, float64x1_t b) {
return vld1_lane_f64(a, b, 0);
}
-// CHECK-LABEL: define{{.*}} <8 x i8> @test_vld1_lane_p8(ptr noundef %a, <8 x i8> noundef %b) #0 {
-// CHECK: [[TMP0:%.*]] = load i8, ptr %a
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i8> %b, i8 [[TMP0]], i32 7
-// CHECK: ret <8 x i8> [[VLD1_LANE]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vld1_lane_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <8 x i8> [[B]], i8 [[TMP0]], i64 7
+// CHECK-NEXT: ret <8 x i8> [[VLD1_LANE]]
+//
poly8x8_t test_vld1_lane_p8(poly8_t *a, poly8x8_t b) {
return vld1_lane_p8(a, b, 7);
}
-// CHECK-LABEL: define{{.*}} <4 x i16> @test_vld1_lane_p16(ptr noundef %a, <4 x i16> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[TMP4:%.*]] = load i16, ptr %a
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3
-// CHECK: ret <4 x i16> [[VLD1_LANE]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vld1_lane_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A]], align 2
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[B]], i16 [[TMP0]], i64 3
+// CHECK-NEXT: ret <4 x i16> [[VLD1_LANE]]
+//
poly16x4_t test_vld1_lane_p16(poly16_t *a, poly16x4_t b) {
return vld1_lane_p16(a, b, 3);
}
-// CHECK-LABEL: define{{.*}} <1 x i64> @test_vld1_lane_p64(ptr noundef %a, <1 x i64> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[TMP4:%.*]] = load i64, ptr %a
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <1 x i64> [[TMP2]], i64 [[TMP4]], i32 0
-// CHECK: ret <1 x i64> [[VLD1_LANE]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vld1_lane_p64(
+// CHECK-SAME: ptr noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A]], align 8
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <1 x i64> poison, i64 [[TMP0]], i64 0
+// CHECK-NEXT: ret <1 x i64> [[VLD1_LANE]]
+//
poly64x1_t test_vld1_lane_p64(poly64_t *a, poly64x1_t b) {
return vld1_lane_p64(a, b, 0);
}
-// CHECK-LABEL: define{{.*}} %struct.int8x16x2_t @test_vld2q_lane_s8(ptr noundef %ptr, [2 x <16 x i8>] alignstack(16) %src.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x2_t, align 16
-// CHECK: [[SRC:%.*]] = alloca %struct.int8x16x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x16x2_t, ptr [[SRC]], i32 0, i32 0
-// CHECK: store [2 x <16 x i8>] [[SRC]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[SRC]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x16x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x16x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[VLD2_LANE:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2lane.v16i8.p0(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i64 15, ptr %ptr)
-// CHECK: store { <16 x i8>, <16 x i8> } [[VLD2_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP8:%.*]] = load %struct.int8x16x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.int8x16x2_t [[TMP8]]
+// CHECK-LABEL: define dso_local %struct.int8x16x2_t @test_vld2q_lane_s8(
+// CHECK-SAME: ptr noundef [[PTR:%.*]], [2 x <16 x i8>] alignstack(16) [[SRC_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT8X16X2_T:%.*]], align 16
+// CHECK-NEXT: [[SRC:%.*]] = alloca [[STRUCT_INT8X16X2_T]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT8X16X2_T]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT8X16X2_T]], align 16
+// CHECK-NEXT: [[SRC_COERCE_ELT:%.*]] = extractvalue [2 x <16 x i8>] [[SRC_COERCE]], 0
+// CHECK-NEXT: store <16 x i8> [[SRC_COERCE_ELT]], ptr [[SRC]], align 16
+// CHECK-NEXT: [[SRC_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[SRC]], i64 16
+// CHECK-NEXT: [[SRC_COERCE_ELT2:%.*]] = extractvalue [2 x <16 x i8>] [[SRC_COERCE]], 1
+// CHECK-NEXT: store <16 x i8> [[SRC_COERCE_ELT2]], ptr [[SRC_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[__S1]], ptr noundef nonnull align 16 dereferenceable(32) [[SRC]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[VLD2_LANE:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2lane.v16i8.p0(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i64 15, ptr [[PTR]])
+// CHECK-NEXT: [[VLD2_LANE_ELT:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[VLD2_LANE]], 0
+// CHECK-NEXT: store <16 x i8> [[VLD2_LANE_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD2_LANE_ELT4:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[VLD2_LANE]], 1
+// CHECK-NEXT: store <16 x i8> [[VLD2_LANE_ELT4]], ptr [[__RET_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <16 x i8>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [2 x <16 x i8>] poison, <16 x i8> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <16 x i8>, ptr [[DOTUNPACK_ELT5]], align 16
+// CHECK-NEXT: [[DOTUNPACK7:%.*]] = insertvalue [2 x <16 x i8>] [[TMP2]], <16 x i8> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_INT8X16X2_T]] poison, [2 x <16 x i8>] [[DOTUNPACK7]], 0
+// CHECK-NEXT: ret [[STRUCT_INT8X16X2_T]] [[TMP3]]
+//
int8x16x2_t test_vld2q_lane_s8(int8_t const * ptr, int8x16x2_t src) {
return vld2q_lane_s8(ptr, src, 15);
}
-// CHECK-LABEL: define{{.*}} %struct.uint8x16x2_t @test_vld2q_lane_u8(ptr noundef %ptr, [2 x <16 x i8>] alignstack(16) %src.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x16x2_t, align 16
-// CHECK: [[SRC:%.*]] = alloca %struct.uint8x16x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x16x2_t, ptr [[SRC]], i32 0, i32 0
-// CHECK: store [2 x <16 x i8>] [[SRC]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[SRC]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x16x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x16x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[VLD2_LANE:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2lane.v16i8.p0(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i64 15, ptr %ptr)
-// CHECK: store { <16 x i8>, <16 x i8> } [[VLD2_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP8:%.*]] = load %struct.uint8x16x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.uint8x16x2_t [[TMP8]]
+// CHECK-LABEL: define dso_local %struct.uint8x16x2_t @test_vld2q_lane_u8(
+// CHECK-SAME: ptr noundef [[PTR:%.*]], [2 x <16 x i8>] alignstack(16) [[SRC_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT8X16X2_T:%.*]], align 16
+// CHECK-NEXT: [[SRC:%.*]] = alloca [[STRUCT_UINT8X16X2_T]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT8X16X2_T]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT8X16X2_T]], align 16
+// CHECK-NEXT: [[SRC_COERCE_ELT:%.*]] = extractvalue [2 x <16 x i8>] [[SRC_COERCE]], 0
+// CHECK-NEXT: store <16 x i8> [[SRC_COERCE_ELT]], ptr [[SRC]], align 16
+// CHECK-NEXT: [[SRC_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[SRC]], i64 16
+// CHECK-NEXT: [[SRC_COERCE_ELT2:%.*]] = extractvalue [2 x <16 x i8>] [[SRC_COERCE]], 1
+// CHECK-NEXT: store <16 x i8> [[SRC_COERCE_ELT2]], ptr [[SRC_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[__S1]], ptr noundef nonnull align 16 dereferenceable(32) [[SRC]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[VLD2_LANE:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2lane.v16i8.p0(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i64 15, ptr [[PTR]])
+// CHECK-NEXT: [[VLD2_LANE_ELT:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[VLD2_LANE]], 0
+// CHECK-NEXT: store <16 x i8> [[VLD2_LANE_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD2_LANE_ELT4:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[VLD2_LANE]], 1
+// CHECK-NEXT: store <16 x i8> [[VLD2_LANE_ELT4]], ptr [[__RET_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <16 x i8>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [2 x <16 x i8>] poison, <16 x i8> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <16 x i8>, ptr [[DOTUNPACK_ELT5]], align 16
+// CHECK-NEXT: [[DOTUNPACK7:%.*]] = insertvalue [2 x <16 x i8>] [[TMP2]], <16 x i8> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_UINT8X16X2_T]] poison, [2 x <16 x i8>] [[DOTUNPACK7]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT8X16X2_T]] [[TMP3]]
+//
uint8x16x2_t test_vld2q_lane_u8(uint8_t const * ptr, uint8x16x2_t src) {
return vld2q_lane_u8(ptr, src, 15);
}
-// CHECK-LABEL: define{{.*}} %struct.poly8x16x2_t @test_vld2q_lane_p8(ptr noundef %ptr, [2 x <16 x i8>] alignstack(16) %src.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x2_t, align 16
-// CHECK: [[SRC:%.*]] = alloca %struct.poly8x16x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x16x2_t, ptr [[SRC]], i32 0, i32 0
-// CHECK: store [2 x <16 x i8>] [[SRC]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[SRC]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x16x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x16x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[VLD2_LANE:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2lane.v16i8.p0(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i64 15, ptr %ptr)
-// CHECK: store { <16 x i8>, <16 x i8> } [[VLD2_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP8:%.*]] = load %struct.poly8x16x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.poly8x16x2_t [[TMP8]]
+// CHECK-LABEL: define dso_local %struct.poly8x16x2_t @test_vld2q_lane_p8(
+// CHECK-SAME: ptr noundef [[PTR:%.*]], [2 x <16 x i8>] alignstack(16) [[SRC_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY8X16X2_T:%.*]], align 16
+// CHECK-NEXT: [[SRC:%.*]] = alloca [[STRUCT_POLY8X16X2_T]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY8X16X2_T]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY8X16X2_T]], align 16
+// CHECK-NEXT: [[SRC_COERCE_ELT:%.*]] = extractvalue [2 x <16 x i8>] [[SRC_COERCE]], 0
+// CHECK-NEXT: store <16 x i8> [[SRC_COERCE_ELT]], ptr [[SRC]], align 16
+// CHECK-NEXT: [[SRC_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[SRC]], i64 16
+// CHECK-NEXT: [[SRC_COERCE_ELT2:%.*]] = extractvalue [2 x <16 x i8>] [[SRC_COERCE]], 1
+// CHECK-NEXT: store <16 x i8> [[SRC_COERCE_ELT2]], ptr [[SRC_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[__S1]], ptr noundef nonnull align 16 dereferenceable(32) [[SRC]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[VLD2_LANE:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2lane.v16i8.p0(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i64 15, ptr [[PTR]])
+// CHECK-NEXT: [[VLD2_LANE_ELT:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[VLD2_LANE]], 0
+// CHECK-NEXT: store <16 x i8> [[VLD2_LANE_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD2_LANE_ELT4:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[VLD2_LANE]], 1
+// CHECK-NEXT: store <16 x i8> [[VLD2_LANE_ELT4]], ptr [[__RET_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <16 x i8>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [2 x <16 x i8>] poison, <16 x i8> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <16 x i8>, ptr [[DOTUNPACK_ELT5]], align 16
+// CHECK-NEXT: [[DOTUNPACK7:%.*]] = insertvalue [2 x <16 x i8>] [[TMP2]], <16 x i8> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_POLY8X16X2_T]] poison, [2 x <16 x i8>] [[DOTUNPACK7]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY8X16X2_T]] [[TMP3]]
+//
poly8x16x2_t test_vld2q_lane_p8(poly8_t const * ptr, poly8x16x2_t src) {
return vld2q_lane_p8(ptr, src, 15);
}
-// CHECK-LABEL: define{{.*}} %struct.int8x16x3_t @test_vld3q_lane_s8(ptr noundef %ptr, [3 x <16 x i8>] alignstack(16) %src.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x3_t, align 16
-// CHECK: [[SRC:%.*]] = alloca %struct.int8x16x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x16x3_t, ptr [[SRC]], i32 0, i32 0
-// CHECK: store [3 x <16 x i8>] [[SRC]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[SRC]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP5:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[VLD3_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3lane.v16i8.p0(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i64 15, ptr %ptr)
-// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
-// CHECK: [[TMP9:%.*]] = load %struct.int8x16x3_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.int8x16x3_t [[TMP9]]
+// CHECK-LABEL: define dso_local %struct.int8x16x3_t @test_vld3q_lane_s8(
+// CHECK-SAME: ptr noundef [[PTR:%.*]], [3 x <16 x i8>] alignstack(16) [[SRC_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT8X16X3_T:%.*]], align 16
+// CHECK-NEXT: [[SRC:%.*]] = alloca [[STRUCT_INT8X16X3_T]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT8X16X3_T]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT8X16X3_T]], align 16
+// CHECK-NEXT: [[SRC_COERCE_ELT:%.*]] = extractvalue [3 x <16 x i8>] [[SRC_COERCE]], 0
+// CHECK-NEXT: store <16 x i8> [[SRC_COERCE_ELT]], ptr [[SRC]], align 16
+// CHECK-NEXT: [[SRC_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[SRC]], i64 16
+// CHECK-NEXT: [[SRC_COERCE_ELT2:%.*]] = extractvalue [3 x <16 x i8>] [[SRC_COERCE]], 1
+// CHECK-NEXT: store <16 x i8> [[SRC_COERCE_ELT2]], ptr [[SRC_REPACK1]], align 16
+// CHECK-NEXT: [[SRC_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[SRC]], i64 32
+// CHECK-NEXT: [[SRC_COERCE_ELT4:%.*]] = extractvalue [3 x <16 x i8>] [[SRC_COERCE]], 2
+// CHECK-NEXT: store <16 x i8> [[SRC_COERCE_ELT4]], ptr [[SRC_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[__S1]], ptr noundef nonnull align 16 dereferenceable(48) [[SRC]], i64 48, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[VLD3_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3lane.v16i8.p0(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], i64 15, ptr [[PTR]])
+// CHECK-NEXT: [[VLD3_LANE_ELT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3_LANE]], 0
+// CHECK-NEXT: store <16 x i8> [[VLD3_LANE_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_LANE_ELT6:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3_LANE]], 1
+// CHECK-NEXT: store <16 x i8> [[VLD3_LANE_ELT6]], ptr [[__RET_REPACK5]], align 16
+// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD3_LANE_ELT8:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3_LANE]], 2
+// CHECK-NEXT: store <16 x i8> [[VLD3_LANE_ELT8]], ptr [[__RET_REPACK7]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(48) [[__RET]], i64 48, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <16 x i8>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [3 x <16 x i8>] poison, <16 x i8> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <16 x i8>, ptr [[DOTUNPACK_ELT9]], align 16
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [3 x <16 x i8>] [[TMP3]], <16 x i8> [[DOTUNPACK_UNPACK10]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <16 x i8>, ptr [[DOTUNPACK_ELT11]], align 16
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [3 x <16 x i8>] [[TMP4]], <16 x i8> [[DOTUNPACK_UNPACK12]], 2
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [[STRUCT_INT8X16X3_T]] poison, [3 x <16 x i8>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_INT8X16X3_T]] [[TMP5]]
+//
int8x16x3_t test_vld3q_lane_s8(int8_t const * ptr, int8x16x3_t src) {
return vld3q_lane_s8(ptr, src, 15);
}
-// CHECK-LABEL: define{{.*}} %struct.uint8x16x3_t @test_vld3q_lane_u8(ptr noundef %ptr, [3 x <16 x i8>] alignstack(16) %src.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x16x3_t, align 16
-// CHECK: [[SRC:%.*]] = alloca %struct.uint8x16x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x16x3_t, ptr [[SRC]], i32 0, i32 0
-// CHECK: store [3 x <16 x i8>] [[SRC]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[SRC]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP5:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[VLD3_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3lane.v16i8.p0(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i64 15, ptr %ptr)
-// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
-// CHECK: [[TMP9:%.*]] = load %struct.uint8x16x3_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.uint8x16x3_t [[TMP9]]
+// CHECK-LABEL: define dso_local %struct.uint8x16x3_t @test_vld3q_lane_u8(
+// CHECK-SAME: ptr noundef [[PTR:%.*]], [3 x <16 x i8>] alignstack(16) [[SRC_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT8X16X3_T:%.*]], align 16
+// CHECK-NEXT: [[SRC:%.*]] = alloca [[STRUCT_UINT8X16X3_T]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT8X16X3_T]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT8X16X3_T]], align 16
+// CHECK-NEXT: [[SRC_COERCE_ELT:%.*]] = extractvalue [3 x <16 x i8>] [[SRC_COERCE]], 0
+// CHECK-NEXT: store <16 x i8> [[SRC_COERCE_ELT]], ptr [[SRC]], align 16
+// CHECK-NEXT: [[SRC_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[SRC]], i64 16
+// CHECK-NEXT: [[SRC_COERCE_ELT2:%.*]] = extractvalue [3 x <16 x i8>] [[SRC_COERCE]], 1
+// CHECK-NEXT: store <16 x i8> [[SRC_COERCE_ELT2]], ptr [[SRC_REPACK1]], align 16
+// CHECK-NEXT: [[SRC_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[SRC]], i64 32
+// CHECK-NEXT: [[SRC_COERCE_ELT4:%.*]] = extractvalue [3 x <16 x i8>] [[SRC_COERCE]], 2
+// CHECK-NEXT: store <16 x i8> [[SRC_COERCE_ELT4]], ptr [[SRC_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[__S1]], ptr noundef nonnull align 16 dereferenceable(48) [[SRC]], i64 48, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[VLD3_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3lane.v16i8.p0(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], i64 15, ptr [[PTR]])
+// CHECK-NEXT: [[VLD3_LANE_ELT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3_LANE]], 0
+// CHECK-NEXT: store <16 x i8> [[VLD3_LANE_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_LANE_ELT6:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3_LANE]], 1
+// CHECK-NEXT: store <16 x i8> [[VLD3_LANE_ELT6]], ptr [[__RET_REPACK5]], align 16
+// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD3_LANE_ELT8:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3_LANE]], 2
+// CHECK-NEXT: store <16 x i8> [[VLD3_LANE_ELT8]], ptr [[__RET_REPACK7]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(48) [[__RET]], i64 48, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <16 x i8>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [3 x <16 x i8>] poison, <16 x i8> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <16 x i8>, ptr [[DOTUNPACK_ELT9]], align 16
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [3 x <16 x i8>] [[TMP3]], <16 x i8> [[DOTUNPACK_UNPACK10]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <16 x i8>, ptr [[DOTUNPACK_ELT11]], align 16
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [3 x <16 x i8>] [[TMP4]], <16 x i8> [[DOTUNPACK_UNPACK12]], 2
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [[STRUCT_UINT8X16X3_T]] poison, [3 x <16 x i8>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT8X16X3_T]] [[TMP5]]
+//
uint8x16x3_t test_vld3q_lane_u8(uint8_t const * ptr, uint8x16x3_t src) {
return vld3q_lane_u8(ptr, src, 15);
}
-// CHECK-LABEL: define{{.*}} %struct.uint16x8x2_t @test_vld2q_lane_u16(ptr noundef %a, [2 x <8 x i16>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK: [[B:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
-// CHECK: [[VLD2_LANE:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2lane.v8i16.p0(<8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i64 7, ptr %a)
-// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP13:%.*]] = load %struct.uint16x8x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.uint16x8x2_t [[TMP13]]
+// CHECK-LABEL: define dso_local %struct.uint16x8x2_t @test_vld2q_lane_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT16X8X2_T:%.*]], align 16
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT16X8X2_T]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT16X8X2_T]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT16X8X2_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <8 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <8 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[__S1]], ptr noundef nonnull align 16 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[VLD2_LANE:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2lane.v8i16.p0(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], i64 7, ptr [[A]])
+// CHECK-NEXT: [[VLD2_LANE_ELT:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[VLD2_LANE]], 0
+// CHECK-NEXT: store <8 x i16> [[VLD2_LANE_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD2_LANE_ELT4:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[VLD2_LANE]], 1
+// CHECK-NEXT: store <8 x i16> [[VLD2_LANE_ELT4]], ptr [[__RET_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <8 x i16>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [2 x <8 x i16>] poison, <8 x i16> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <8 x i16>, ptr [[DOTUNPACK_ELT5]], align 16
+// CHECK-NEXT: [[DOTUNPACK7:%.*]] = insertvalue [2 x <8 x i16>] [[TMP2]], <8 x i16> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_UINT16X8X2_T]] poison, [2 x <8 x i16>] [[DOTUNPACK7]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT16X8X2_T]] [[TMP3]]
+//
uint16x8x2_t test_vld2q_lane_u16(uint16_t *a, uint16x8x2_t b) {
return vld2q_lane_u16(a, b, 7);
}
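
For reference, `vld2q_lane_u16` performs a single-lane structured load: it reads two consecutive `uint16_t` values from memory and inserts the first into lane 7 of `b.val[0]` and the second into lane 7 of `b.val[1]`, leaving all other lanes unchanged. The regenerated checks show the optimized form of the ABI traffic: the `[2 x <8 x i16>]` coercion is now repacked per element with `extractvalue`/`store` pairs instead of a whole-aggregate store, and the `<16 x i8>` bitcast round-trips from the old output are gone. A minimal usage sketch (the helper name is ours, not part of the patch):

#include <arm_neon.h>

// Refresh lane 7 of both vectors in `acc` from two adjacent
// uint16_t values at `src`; all other lanes pass through.
uint16x8x2_t refresh_lane7(const uint16_t *src, uint16x8x2_t acc) {
  return vld2q_lane_u16(src, acc, 7);
}
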
-// CHECK-LABEL: define{{.*}} %struct.uint32x4x2_t @test_vld2q_lane_u32(ptr noundef %a, [2 x <4 x i32>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK: [[B:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x4x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
-// CHECK: [[VLD2_LANE:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2lane.v4i32.p0(<4 x i32> [[TMP8]], <4 x i32> [[TMP9]], i64 3, ptr %a)
-// CHECK: store { <4 x i32>, <4 x i32> } [[VLD2_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP13:%.*]] = load %struct.uint32x4x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.uint32x4x2_t [[TMP13]]
+// CHECK-LABEL: define dso_local %struct.uint32x4x2_t @test_vld2q_lane_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <4 x i32>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT32X4X2_T:%.*]], align 16
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT32X4X2_T]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT32X4X2_T]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT32X4X2_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <4 x i32>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <4 x i32>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[__S1]], ptr noundef nonnull align 16 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[VLD2_LANE:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2lane.v4i32.p0(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], i64 3, ptr [[A]])
+// CHECK-NEXT: [[VLD2_LANE_ELT:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLD2_LANE]], 0
+// CHECK-NEXT: store <4 x i32> [[VLD2_LANE_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD2_LANE_ELT4:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLD2_LANE]], 1
+// CHECK-NEXT: store <4 x i32> [[VLD2_LANE_ELT4]], ptr [[__RET_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <4 x i32>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [2 x <4 x i32>] poison, <4 x i32> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <4 x i32>, ptr [[DOTUNPACK_ELT5]], align 16
+// CHECK-NEXT: [[DOTUNPACK7:%.*]] = insertvalue [2 x <4 x i32>] [[TMP2]], <4 x i32> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_UINT32X4X2_T]] poison, [2 x <4 x i32>] [[DOTUNPACK7]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT32X4X2_T]] [[TMP3]]
+//
uint32x4x2_t test_vld2q_lane_u32(uint32_t *a, uint32x4x2_t b) {
return vld2q_lane_u32(a, b, 3);
}
-// CHECK-LABEL: define{{.*}} %struct.uint64x2x2_t @test_vld2q_lane_u64(ptr noundef %a, [2 x <2 x i64>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x2x2_t, align 16
-// CHECK: [[B:%.*]] = alloca %struct.uint64x2x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.uint64x2x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x2x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
-// CHECK: [[VLD2_LANE:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2lane.v2i64.p0(<2 x i64> [[TMP8]], <2 x i64> [[TMP9]], i64 1, ptr %a)
-// CHECK: store { <2 x i64>, <2 x i64> } [[VLD2_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP13:%.*]] = load %struct.uint64x2x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.uint64x2x2_t [[TMP13]]
+// CHECK-LABEL: define dso_local %struct.uint64x2x2_t @test_vld2q_lane_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT64X2X2_T:%.*]], align 16
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT64X2X2_T]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT64X2X2_T]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT64X2X2_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <2 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <2 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[__S1]], ptr noundef nonnull align 16 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[VLD2_LANE:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2lane.v2i64.p0(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], i64 1, ptr [[A]])
+// CHECK-NEXT: [[VLD2_LANE_ELT:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[VLD2_LANE]], 0
+// CHECK-NEXT: store <2 x i64> [[VLD2_LANE_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD2_LANE_ELT4:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[VLD2_LANE]], 1
+// CHECK-NEXT: store <2 x i64> [[VLD2_LANE_ELT4]], ptr [[__RET_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [2 x <2 x i64>] poison, <2 x i64> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT5]], align 16
+// CHECK-NEXT: [[DOTUNPACK7:%.*]] = insertvalue [2 x <2 x i64>] [[TMP2]], <2 x i64> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_UINT64X2X2_T]] poison, [2 x <2 x i64>] [[DOTUNPACK7]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT64X2X2_T]] [[TMP3]]
+//
uint64x2x2_t test_vld2q_lane_u64(uint64_t *a, uint64x2x2_t b) {
return vld2q_lane_u64(a, b, 1);
}
-// CHECK-LABEL: define{{.*}} %struct.int16x8x2_t @test_vld2q_lane_s16(ptr noundef %a, [2 x <8 x i16>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x2_t, align 16
-// CHECK: [[B:%.*]] = alloca %struct.int16x8x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
-// CHECK: [[VLD2_LANE:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2lane.v8i16.p0(<8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i64 7, ptr %a)
-// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP13:%.*]] = load %struct.int16x8x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.int16x8x2_t [[TMP13]]
+// CHECK-LABEL: define dso_local %struct.int16x8x2_t @test_vld2q_lane_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT16X8X2_T:%.*]], align 16
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT16X8X2_T]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT16X8X2_T]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT16X8X2_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <8 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <8 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[__S1]], ptr noundef nonnull align 16 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[VLD2_LANE:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2lane.v8i16.p0(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], i64 7, ptr [[A]])
+// CHECK-NEXT: [[VLD2_LANE_ELT:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[VLD2_LANE]], 0
+// CHECK-NEXT: store <8 x i16> [[VLD2_LANE_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD2_LANE_ELT4:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[VLD2_LANE]], 1
+// CHECK-NEXT: store <8 x i16> [[VLD2_LANE_ELT4]], ptr [[__RET_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <8 x i16>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [2 x <8 x i16>] poison, <8 x i16> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <8 x i16>, ptr [[DOTUNPACK_ELT5]], align 16
+// CHECK-NEXT: [[DOTUNPACK7:%.*]] = insertvalue [2 x <8 x i16>] [[TMP2]], <8 x i16> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_INT16X8X2_T]] poison, [2 x <8 x i16>] [[DOTUNPACK7]], 0
+// CHECK-NEXT: ret [[STRUCT_INT16X8X2_T]] [[TMP3]]
+//
int16x8x2_t test_vld2q_lane_s16(int16_t *a, int16x8x2_t b) {
return vld2q_lane_s16(a, b, 7);
}
-// CHECK-LABEL: define{{.*}} %struct.int32x4x2_t @test_vld2q_lane_s32(ptr noundef %a, [2 x <4 x i32>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x2_t, align 16
-// CHECK: [[B:%.*]] = alloca %struct.int32x4x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x4x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
-// CHECK: [[VLD2_LANE:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2lane.v4i32.p0(<4 x i32> [[TMP8]], <4 x i32> [[TMP9]], i64 3, ptr %a)
-// CHECK: store { <4 x i32>, <4 x i32> } [[VLD2_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP13:%.*]] = load %struct.int32x4x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.int32x4x2_t [[TMP13]]
+// CHECK-LABEL: define dso_local %struct.int32x4x2_t @test_vld2q_lane_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <4 x i32>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT32X4X2_T:%.*]], align 16
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT32X4X2_T]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT32X4X2_T]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT32X4X2_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <4 x i32>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <4 x i32>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[__S1]], ptr noundef nonnull align 16 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[VLD2_LANE:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2lane.v4i32.p0(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], i64 3, ptr [[A]])
+// CHECK-NEXT: [[VLD2_LANE_ELT:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLD2_LANE]], 0
+// CHECK-NEXT: store <4 x i32> [[VLD2_LANE_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD2_LANE_ELT4:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLD2_LANE]], 1
+// CHECK-NEXT: store <4 x i32> [[VLD2_LANE_ELT4]], ptr [[__RET_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <4 x i32>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [2 x <4 x i32>] poison, <4 x i32> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <4 x i32>, ptr [[DOTUNPACK_ELT5]], align 16
+// CHECK-NEXT: [[DOTUNPACK7:%.*]] = insertvalue [2 x <4 x i32>] [[TMP2]], <4 x i32> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_INT32X4X2_T]] poison, [2 x <4 x i32>] [[DOTUNPACK7]], 0
+// CHECK-NEXT: ret [[STRUCT_INT32X4X2_T]] [[TMP3]]
+//
int32x4x2_t test_vld2q_lane_s32(int32_t *a, int32x4x2_t b) {
return vld2q_lane_s32(a, b, 3);
}
-// CHECK-LABEL: define{{.*}} %struct.int64x2x2_t @test_vld2q_lane_s64(ptr noundef %a, [2 x <2 x i64>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x2x2_t, align 16
-// CHECK: [[B:%.*]] = alloca %struct.int64x2x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int64x2x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int64x2x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x2x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
-// CHECK: [[VLD2_LANE:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2lane.v2i64.p0(<2 x i64> [[TMP8]], <2 x i64> [[TMP9]], i64 1, ptr %a)
-// CHECK: store { <2 x i64>, <2 x i64> } [[VLD2_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP13:%.*]] = load %struct.int64x2x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.int64x2x2_t [[TMP13]]
+// CHECK-LABEL: define dso_local %struct.int64x2x2_t @test_vld2q_lane_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT64X2X2_T:%.*]], align 16
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT64X2X2_T]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT64X2X2_T]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT64X2X2_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <2 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <2 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[__S1]], ptr noundef nonnull align 16 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[VLD2_LANE:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2lane.v2i64.p0(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], i64 1, ptr [[A]])
+// CHECK-NEXT: [[VLD2_LANE_ELT:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[VLD2_LANE]], 0
+// CHECK-NEXT: store <2 x i64> [[VLD2_LANE_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD2_LANE_ELT4:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[VLD2_LANE]], 1
+// CHECK-NEXT: store <2 x i64> [[VLD2_LANE_ELT4]], ptr [[__RET_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [2 x <2 x i64>] poison, <2 x i64> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT5]], align 16
+// CHECK-NEXT: [[DOTUNPACK7:%.*]] = insertvalue [2 x <2 x i64>] [[TMP2]], <2 x i64> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_INT64X2X2_T]] poison, [2 x <2 x i64>] [[DOTUNPACK7]], 0
+// CHECK-NEXT: ret [[STRUCT_INT64X2X2_T]] [[TMP3]]
+//
int64x2x2_t test_vld2q_lane_s64(int64_t *a, int64x2x2_t b) {
return vld2q_lane_s64(a, b, 1);
}
-// CHECK-LABEL: define{{.*}} %struct.float16x8x2_t @test_vld2q_lane_f16(ptr noundef %a, [2 x <8 x half>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x8x2_t, align 16
-// CHECK: [[B:%.*]] = alloca %struct.float16x8x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <8 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x half>
-// CHECK: [[VLD2_LANE:%.*]] = call { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2lane.v8f16.p0(<8 x half> [[TMP8]], <8 x half> [[TMP9]], i64 7, ptr %a)
-// CHECK: store { <8 x half>, <8 x half> } [[VLD2_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP13:%.*]] = load %struct.float16x8x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.float16x8x2_t [[TMP13]]
+// CHECK-LABEL: define dso_local %struct.float16x8x2_t @test_vld2q_lane_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <8 x half>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT16X8X2_T:%.*]], align 16
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT16X8X2_T]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT16X8X2_T]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X8X2_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <8 x half>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x half> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <8 x half>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x half> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[__S1]], ptr noundef nonnull align 16 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x half>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[VLD2_LANE:%.*]] = call { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2lane.v8f16.p0(<8 x half> [[TMP0]], <8 x half> [[TMP1]], i64 7, ptr [[A]])
+// CHECK-NEXT: [[VLD2_LANE_ELT:%.*]] = extractvalue { <8 x half>, <8 x half> } [[VLD2_LANE]], 0
+// CHECK-NEXT: store <8 x half> [[VLD2_LANE_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD2_LANE_ELT4:%.*]] = extractvalue { <8 x half>, <8 x half> } [[VLD2_LANE]], 1
+// CHECK-NEXT: store <8 x half> [[VLD2_LANE_ELT4]], ptr [[__RET_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <8 x half>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [2 x <8 x half>] poison, <8 x half> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <8 x half>, ptr [[DOTUNPACK_ELT5]], align 16
+// CHECK-NEXT: [[DOTUNPACK7:%.*]] = insertvalue [2 x <8 x half>] [[TMP2]], <8 x half> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_FLOAT16X8X2_T]] poison, [2 x <8 x half>] [[DOTUNPACK7]], 0
+// CHECK-NEXT: ret [[STRUCT_FLOAT16X8X2_T]] [[TMP3]]
+//
float16x8x2_t test_vld2q_lane_f16(float16_t *a, float16x8x2_t b) {
return vld2q_lane_f16(a, b, 7);
}
-// CHECK-LABEL: define{{.*}} %struct.float32x4x2_t @test_vld2q_lane_f32(ptr noundef %a, [2 x <4 x float>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x2_t, align 16
-// CHECK: [[B:%.*]] = alloca %struct.float32x4x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x4x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <4 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
-// CHECK: [[VLD2_LANE:%.*]] = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2lane.v4f32.p0(<4 x float> [[TMP8]], <4 x float> [[TMP9]], i64 3, ptr %a)
-// CHECK: store { <4 x float>, <4 x float> } [[VLD2_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP13:%.*]] = load %struct.float32x4x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.float32x4x2_t [[TMP13]]
+// CHECK-LABEL: define dso_local %struct.float32x4x2_t @test_vld2q_lane_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <4 x float>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT32X4X2_T:%.*]], align 16
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT32X4X2_T]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT32X4X2_T]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X4X2_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <4 x float>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x float> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <4 x float>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x float> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[__S1]], ptr noundef nonnull align 16 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[VLD2_LANE:%.*]] = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2lane.v4f32.p0(<4 x float> [[TMP0]], <4 x float> [[TMP1]], i64 3, ptr [[A]])
+// CHECK-NEXT: [[VLD2_LANE_ELT:%.*]] = extractvalue { <4 x float>, <4 x float> } [[VLD2_LANE]], 0
+// CHECK-NEXT: store <4 x float> [[VLD2_LANE_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD2_LANE_ELT4:%.*]] = extractvalue { <4 x float>, <4 x float> } [[VLD2_LANE]], 1
+// CHECK-NEXT: store <4 x float> [[VLD2_LANE_ELT4]], ptr [[__RET_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <4 x float>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [2 x <4 x float>] poison, <4 x float> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <4 x float>, ptr [[DOTUNPACK_ELT5]], align 16
+// CHECK-NEXT: [[DOTUNPACK7:%.*]] = insertvalue [2 x <4 x float>] [[TMP2]], <4 x float> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_FLOAT32X4X2_T]] poison, [2 x <4 x float>] [[DOTUNPACK7]], 0
+// CHECK-NEXT: ret [[STRUCT_FLOAT32X4X2_T]] [[TMP3]]
+//
float32x4x2_t test_vld2q_lane_f32(float32_t *a, float32x4x2_t b) {
return vld2q_lane_f32(a, b, 3);
}
-// CHECK-LABEL: define{{.*}} %struct.float64x2x2_t @test_vld2q_lane_f64(ptr noundef %a, [2 x <2 x double>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x2x2_t, align 16
-// CHECK: [[B:%.*]] = alloca %struct.float64x2x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.float64x2x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float64x2x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x2x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <2 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x double>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <2 x double>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <2 x double> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x double>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <2 x double> [[TMP6]] to <16 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x double>
-// CHECK: [[VLD2_LANE:%.*]] = call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2lane.v2f64.p0(<2 x double> [[TMP8]], <2 x double> [[TMP9]], i64 1, ptr %a)
-// CHECK: store { <2 x double>, <2 x double> } [[VLD2_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP13:%.*]] = load %struct.float64x2x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.float64x2x2_t [[TMP13]]
+// CHECK-LABEL: define dso_local %struct.float64x2x2_t @test_vld2q_lane_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <2 x double>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT64X2X2_T:%.*]], align 16
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT64X2X2_T]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT64X2X2_T]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT64X2X2_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <2 x double>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x double> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <2 x double>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x double> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[__S1]], ptr noundef nonnull align 16 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[VLD2_LANE:%.*]] = call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2lane.v2f64.p0(<2 x double> [[TMP0]], <2 x double> [[TMP1]], i64 1, ptr [[A]])
+// CHECK-NEXT: [[VLD2_LANE_ELT:%.*]] = extractvalue { <2 x double>, <2 x double> } [[VLD2_LANE]], 0
+// CHECK-NEXT: store <2 x double> [[VLD2_LANE_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD2_LANE_ELT4:%.*]] = extractvalue { <2 x double>, <2 x double> } [[VLD2_LANE]], 1
+// CHECK-NEXT: store <2 x double> [[VLD2_LANE_ELT4]], ptr [[__RET_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x double>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [2 x <2 x double>] poison, <2 x double> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <2 x double>, ptr [[DOTUNPACK_ELT5]], align 16
+// CHECK-NEXT: [[DOTUNPACK7:%.*]] = insertvalue [2 x <2 x double>] [[TMP2]], <2 x double> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_FLOAT64X2X2_T]] poison, [2 x <2 x double>] [[DOTUNPACK7]], 0
+// CHECK-NEXT: ret [[STRUCT_FLOAT64X2X2_T]] [[TMP3]]
+//
float64x2x2_t test_vld2q_lane_f64(float64_t *a, float64x2x2_t b) {
return vld2q_lane_f64(a, b, 1);
}
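
The same lowering repeats for every element type; only the vector shape and the lane-immediate range change. For the two-element 128-bit types (`u64`, `s64`, `f64`, `p64`) the only valid lanes are 0 and 1, which is why these tests pass `i64 1` to `@llvm.aarch64.neon.ld2lane.*`; clang rejects an out-of-range lane constant at compile time. A hedged illustration (the function name is hypothetical):

#include <arm_neon.h>

float64x2x2_t load_pair_lane1(const float64_t *p, float64x2x2_t v) {
  return vld2q_lane_f64(p, v, 1);  // lanes 0..1 are valid here
  // vld2q_lane_f64(p, v, 2) would be rejected: lane out of range
}
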
-// CHECK-LABEL: define{{.*}} %struct.poly16x8x2_t @test_vld2q_lane_p16(ptr noundef %a, [2 x <8 x i16>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x2_t, align 16
-// CHECK: [[B:%.*]] = alloca %struct.poly16x8x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
-// CHECK: [[VLD2_LANE:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2lane.v8i16.p0(<8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i64 7, ptr %a)
-// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP13:%.*]] = load %struct.poly16x8x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.poly16x8x2_t [[TMP13]]
+// CHECK-LABEL: define dso_local %struct.poly16x8x2_t @test_vld2q_lane_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY16X8X2_T:%.*]], align 16
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY16X8X2_T]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY16X8X2_T]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY16X8X2_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <8 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <8 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[__S1]], ptr noundef nonnull align 16 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[VLD2_LANE:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2lane.v8i16.p0(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], i64 7, ptr [[A]])
+// CHECK-NEXT: [[VLD2_LANE_ELT:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[VLD2_LANE]], 0
+// CHECK-NEXT: store <8 x i16> [[VLD2_LANE_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD2_LANE_ELT4:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[VLD2_LANE]], 1
+// CHECK-NEXT: store <8 x i16> [[VLD2_LANE_ELT4]], ptr [[__RET_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <8 x i16>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [2 x <8 x i16>] poison, <8 x i16> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <8 x i16>, ptr [[DOTUNPACK_ELT5]], align 16
+// CHECK-NEXT: [[DOTUNPACK7:%.*]] = insertvalue [2 x <8 x i16>] [[TMP2]], <8 x i16> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_POLY16X8X2_T]] poison, [2 x <8 x i16>] [[DOTUNPACK7]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY16X8X2_T]] [[TMP3]]
+//
poly16x8x2_t test_vld2q_lane_p16(poly16_t *a, poly16x8x2_t b) {
return vld2q_lane_p16(a, b, 7);
}
-// CHECK-LABEL: define{{.*}} %struct.poly64x2x2_t @test_vld2q_lane_p64(ptr noundef %a, [2 x <2 x i64>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x2x2_t, align 16
-// CHECK: [[B:%.*]] = alloca %struct.poly64x2x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.poly64x2x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly64x2x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x2x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly64x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
-// CHECK: [[VLD2_LANE:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2lane.v2i64.p0(<2 x i64> [[TMP8]], <2 x i64> [[TMP9]], i64 1, ptr %a)
-// CHECK: store { <2 x i64>, <2 x i64> } [[VLD2_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP13:%.*]] = load %struct.poly64x2x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.poly64x2x2_t [[TMP13]]
+// CHECK-LABEL: define dso_local %struct.poly64x2x2_t @test_vld2q_lane_p64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY64X2X2_T:%.*]], align 16
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY64X2X2_T]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY64X2X2_T]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY64X2X2_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <2 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <2 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[__S1]], ptr noundef nonnull align 16 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[VLD2_LANE:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2lane.v2i64.p0(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], i64 1, ptr [[A]])
+// CHECK-NEXT: [[VLD2_LANE_ELT:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[VLD2_LANE]], 0
+// CHECK-NEXT: store <2 x i64> [[VLD2_LANE_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD2_LANE_ELT4:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[VLD2_LANE]], 1
+// CHECK-NEXT: store <2 x i64> [[VLD2_LANE_ELT4]], ptr [[__RET_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [2 x <2 x i64>] poison, <2 x i64> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT5]], align 16
+// CHECK-NEXT: [[DOTUNPACK7:%.*]] = insertvalue [2 x <2 x i64>] [[TMP2]], <2 x i64> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_POLY64X2X2_T]] poison, [2 x <2 x i64>] [[DOTUNPACK7]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY64X2X2_T]] [[TMP3]]
+//
poly64x2x2_t test_vld2q_lane_p64(poly64_t *a, poly64x2x2_t b) {
return vld2q_lane_p64(a, b, 1);
}
-// CHECK-LABEL: define{{.*}} %struct.uint8x8x2_t @test_vld2_lane_u8(ptr noundef %a, [2 x <8 x i8>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[VLD2_LANE:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2lane.v8i8.p0(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, ptr %a)
-// CHECK: store { <8 x i8>, <8 x i8> } [[VLD2_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
-// CHECK: [[TMP8:%.*]] = load %struct.uint8x8x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.uint8x8x2_t [[TMP8]]
+// CHECK-LABEL: define dso_local %struct.uint8x8x2_t @test_vld2_lane_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT8X8X2_T:%.*]], align 8
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT8X8X2_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT8X8X2_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT8X8X2_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <8 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <8 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[__S1]], ptr noundef nonnull align 8 dereferenceable(16) [[B]], i64 16, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[VLD2_LANE:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2lane.v8i8.p0(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], i64 7, ptr [[A]])
+// CHECK-NEXT: [[VLD2_LANE_ELT:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLD2_LANE]], 0
+// CHECK-NEXT: store <8 x i8> [[VLD2_LANE_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD2_LANE_ELT4:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLD2_LANE]], 1
+// CHECK-NEXT: store <8 x i8> [[VLD2_LANE_ELT4]], ptr [[__RET_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(16) [[__RET]], i64 16, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <8 x i8>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [2 x <8 x i8>] poison, <8 x i8> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <8 x i8>, ptr [[DOTUNPACK_ELT5]], align 8
+// CHECK-NEXT: [[DOTUNPACK7:%.*]] = insertvalue [2 x <8 x i8>] [[TMP2]], <8 x i8> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_UINT8X8X2_T]] poison, [2 x <8 x i8>] [[DOTUNPACK7]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT8X8X2_T]] [[TMP3]]
+//
uint8x8x2_t test_vld2_lane_u8(uint8_t *a, uint8x8x2_t b) {
return vld2_lane_u8(a, b, 7);
}
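
With `test_vld2_lane_u8` the checks switch to the 64-bit `vld2_lane_*` forms. The IR pattern is identical to the q-register variants, but the aggregates are 16 bytes, so the allocas, repack offsets, and memcpys shrink from align 16 / 32 bytes to align 8 / 16 bytes, and the intrinsics (`@llvm.aarch64.neon.ld2lane.v8i8.p0` and friends) take the narrower d-register vector types. A minimal sketch of the narrow form (helper name is hypothetical):

#include <arm_neon.h>

// d-register form: two <8 x i8> vectors, valid lanes 0..7.
uint8x8x2_t refresh_lane7_d(const uint8_t *src, uint8x8x2_t acc) {
  return vld2_lane_u8(src, acc, 7);
}
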
-// CHECK-LABEL: define{{.*}} %struct.uint16x4x2_t @test_vld2_lane_u16(ptr noundef %a, [2 x <4 x i16>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x2_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.uint16x4x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x4x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
-// CHECK: [[VLD2_LANE:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2lane.v4i16.p0(<4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i64 3, ptr %a)
-// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
-// CHECK: [[TMP13:%.*]] = load %struct.uint16x4x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.uint16x4x2_t [[TMP13]]
+// CHECK-LABEL: define dso_local %struct.uint16x4x2_t @test_vld2_lane_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT16X4X2_T:%.*]], align 8
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT16X4X2_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT16X4X2_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT16X4X2_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <4 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <4 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[__S1]], ptr noundef nonnull align 8 dereferenceable(16) [[B]], i64 16, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[VLD2_LANE:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2lane.v4i16.p0(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], i64 3, ptr [[A]])
+// CHECK-NEXT: [[VLD2_LANE_ELT:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[VLD2_LANE]], 0
+// CHECK-NEXT: store <4 x i16> [[VLD2_LANE_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD2_LANE_ELT4:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[VLD2_LANE]], 1
+// CHECK-NEXT: store <4 x i16> [[VLD2_LANE_ELT4]], ptr [[__RET_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(16) [[__RET]], i64 16, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <4 x i16>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [2 x <4 x i16>] poison, <4 x i16> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <4 x i16>, ptr [[DOTUNPACK_ELT5]], align 8
+// CHECK-NEXT: [[DOTUNPACK7:%.*]] = insertvalue [2 x <4 x i16>] [[TMP2]], <4 x i16> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_UINT16X4X2_T]] poison, [2 x <4 x i16>] [[DOTUNPACK7]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT16X4X2_T]] [[TMP3]]
+//
uint16x4x2_t test_vld2_lane_u16(uint16_t *a, uint16x4x2_t b) {
return vld2_lane_u16(a, b, 3);
}
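
(For reviewers following the regenerated checks above: a minimal usage sketch of vld2_lane_u16. The helper name and data below are illustrative assumptions, not part of the patch. The intrinsic loads two consecutive u16 values from memory and de-interleaves them into lane 3 of the two input vectors, leaving the other lanes unchanged, which corresponds to the @llvm.aarch64.neon.ld2lane.v4i16.p0 call in the IR.)

    #include <arm_neon.h>

    // Sketch only: replaces lane 3 of each vector in `pair` with the two
    // consecutive u16 values at `src`; all other lanes pass through.
    uint16x4x2_t replace_lane3(const uint16_t *src, uint16x4x2_t pair) {
      return vld2_lane_u16(src, pair, 3);
    }
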
-// CHECK-LABEL: define{{.*}} %struct.uint32x2x2_t @test_vld2_lane_u32(ptr noundef %a, [2 x <2 x i32>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x2_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.uint32x2x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x2x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
-// CHECK: [[VLD2_LANE:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2lane.v2i32.p0(<2 x i32> [[TMP8]], <2 x i32> [[TMP9]], i64 1, ptr %a)
-// CHECK: store { <2 x i32>, <2 x i32> } [[VLD2_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
-// CHECK: [[TMP13:%.*]] = load %struct.uint32x2x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.uint32x2x2_t [[TMP13]]
+// CHECK-LABEL: define dso_local %struct.uint32x2x2_t @test_vld2_lane_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <2 x i32>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT32X2X2_T:%.*]], align 8
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT32X2X2_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT32X2X2_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT32X2X2_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <2 x i32>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <2 x i32>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[__S1]], ptr noundef nonnull align 8 dereferenceable(16) [[B]], i64 16, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[VLD2_LANE:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2lane.v2i32.p0(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]], i64 1, ptr [[A]])
+// CHECK-NEXT: [[VLD2_LANE_ELT:%.*]] = extractvalue { <2 x i32>, <2 x i32> } [[VLD2_LANE]], 0
+// CHECK-NEXT: store <2 x i32> [[VLD2_LANE_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD2_LANE_ELT4:%.*]] = extractvalue { <2 x i32>, <2 x i32> } [[VLD2_LANE]], 1
+// CHECK-NEXT: store <2 x i32> [[VLD2_LANE_ELT4]], ptr [[__RET_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(16) [[__RET]], i64 16, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x i32>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [2 x <2 x i32>] poison, <2 x i32> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <2 x i32>, ptr [[DOTUNPACK_ELT5]], align 8
+// CHECK-NEXT: [[DOTUNPACK7:%.*]] = insertvalue [2 x <2 x i32>] [[TMP2]], <2 x i32> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_UINT32X2X2_T]] poison, [2 x <2 x i32>] [[DOTUNPACK7]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT32X2X2_T]] [[TMP3]]
+//
uint32x2x2_t test_vld2_lane_u32(uint32_t *a, uint32x2x2_t b) {
return vld2_lane_u32(a, b, 1);
}
-// CHECK-LABEL: define{{.*}} %struct.uint64x1x2_t @test_vld2_lane_u64(ptr noundef %a, [2 x <1 x i64>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x1x2_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.uint64x1x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x1x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x1x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x1x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
-// CHECK: [[VLD2_LANE:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2lane.v1i64.p0(<1 x i64> [[TMP8]], <1 x i64> [[TMP9]], i64 0, ptr %a)
-// CHECK: store { <1 x i64>, <1 x i64> } [[VLD2_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
-// CHECK: [[TMP13:%.*]] = load %struct.uint64x1x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.uint64x1x2_t [[TMP13]]
+// CHECK-LABEL: define dso_local %struct.uint64x1x2_t @test_vld2_lane_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT64X1X2_T:%.*]], align 8
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT64X1X2_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT64X1X2_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT64X1X2_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <1 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <1 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[__S1]], ptr noundef nonnull align 8 dereferenceable(16) [[B]], i64 16, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[VLD2_LANE:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2lane.v1i64.p0(<1 x i64> [[TMP0]], <1 x i64> [[TMP1]], i64 0, ptr [[A]])
+// CHECK-NEXT: [[VLD2_LANE_ELT:%.*]] = extractvalue { <1 x i64>, <1 x i64> } [[VLD2_LANE]], 0
+// CHECK-NEXT: store <1 x i64> [[VLD2_LANE_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD2_LANE_ELT4:%.*]] = extractvalue { <1 x i64>, <1 x i64> } [[VLD2_LANE]], 1
+// CHECK-NEXT: store <1 x i64> [[VLD2_LANE_ELT4]], ptr [[__RET_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(16) [[__RET]], i64 16, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <1 x i64>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [2 x <1 x i64>] poison, <1 x i64> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT5]], align 8
+// CHECK-NEXT: [[DOTUNPACK7:%.*]] = insertvalue [2 x <1 x i64>] [[TMP2]], <1 x i64> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_UINT64X1X2_T]] poison, [2 x <1 x i64>] [[DOTUNPACK7]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT64X1X2_T]] [[TMP3]]
+//
uint64x1x2_t test_vld2_lane_u64(uint64_t *a, uint64x1x2_t b) {
return vld2_lane_u64(a, b, 0);
}
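
(Side note on the 64-bit variants tested here: the vectors are <1 x i64>, so 0 is the only valid lane immediate. A sketch, with an assumed helper name:)

    #include <arm_neon.h>

    // Sketch only: with single-lane vectors, vld2_lane_u64 loads two
    // consecutive u64 values into the two vectors; the lane must be 0.
    uint64x1x2_t load_pair_u64(const uint64_t *src, uint64x1x2_t pair) {
      return vld2_lane_u64(src, pair, 0);
    }
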
-// CHECK-LABEL: define{{.*}} %struct.int8x8x2_t @test_vld2_lane_s8(ptr noundef %a, [2 x <8 x i8>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[VLD2_LANE:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2lane.v8i8.p0(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, ptr %a)
-// CHECK: store { <8 x i8>, <8 x i8> } [[VLD2_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
-// CHECK: [[TMP8:%.*]] = load %struct.int8x8x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.int8x8x2_t [[TMP8]]
+// CHECK-LABEL: define dso_local %struct.int8x8x2_t @test_vld2_lane_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT8X8X2_T:%.*]], align 8
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT8X8X2_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT8X8X2_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT8X8X2_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <8 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <8 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[__S1]], ptr noundef nonnull align 8 dereferenceable(16) [[B]], i64 16, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[VLD2_LANE:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2lane.v8i8.p0(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], i64 7, ptr [[A]])
+// CHECK-NEXT: [[VLD2_LANE_ELT:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLD2_LANE]], 0
+// CHECK-NEXT: store <8 x i8> [[VLD2_LANE_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD2_LANE_ELT4:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLD2_LANE]], 1
+// CHECK-NEXT: store <8 x i8> [[VLD2_LANE_ELT4]], ptr [[__RET_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(16) [[__RET]], i64 16, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <8 x i8>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [2 x <8 x i8>] poison, <8 x i8> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <8 x i8>, ptr [[DOTUNPACK_ELT5]], align 8
+// CHECK-NEXT: [[DOTUNPACK7:%.*]] = insertvalue [2 x <8 x i8>] [[TMP2]], <8 x i8> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_INT8X8X2_T]] poison, [2 x <8 x i8>] [[DOTUNPACK7]], 0
+// CHECK-NEXT: ret [[STRUCT_INT8X8X2_T]] [[TMP3]]
+//
int8x8x2_t test_vld2_lane_s8(int8_t *a, int8x8x2_t b) {
return vld2_lane_s8(a, b, 7);
}
-// CHECK-LABEL: define{{.*}} %struct.int16x4x2_t @test_vld2_lane_s16(ptr noundef %a, [2 x <4 x i16>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x2_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.int16x4x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x4x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
-// CHECK: [[VLD2_LANE:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2lane.v4i16.p0(<4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i64 3, ptr %a)
-// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
-// CHECK: [[TMP13:%.*]] = load %struct.int16x4x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.int16x4x2_t [[TMP13]]
+// CHECK-LABEL: define dso_local %struct.int16x4x2_t @test_vld2_lane_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT16X4X2_T:%.*]], align 8
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT16X4X2_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT16X4X2_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT16X4X2_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <4 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <4 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[__S1]], ptr noundef nonnull align 8 dereferenceable(16) [[B]], i64 16, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[VLD2_LANE:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2lane.v4i16.p0(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], i64 3, ptr [[A]])
+// CHECK-NEXT: [[VLD2_LANE_ELT:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[VLD2_LANE]], 0
+// CHECK-NEXT: store <4 x i16> [[VLD2_LANE_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD2_LANE_ELT4:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[VLD2_LANE]], 1
+// CHECK-NEXT: store <4 x i16> [[VLD2_LANE_ELT4]], ptr [[__RET_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(16) [[__RET]], i64 16, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <4 x i16>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [2 x <4 x i16>] poison, <4 x i16> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <4 x i16>, ptr [[DOTUNPACK_ELT5]], align 8
+// CHECK-NEXT: [[DOTUNPACK7:%.*]] = insertvalue [2 x <4 x i16>] [[TMP2]], <4 x i16> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_INT16X4X2_T]] poison, [2 x <4 x i16>] [[DOTUNPACK7]], 0
+// CHECK-NEXT: ret [[STRUCT_INT16X4X2_T]] [[TMP3]]
+//
int16x4x2_t test_vld2_lane_s16(int16_t *a, int16x4x2_t b) {
return vld2_lane_s16(a, b, 3);
}
-// CHECK-LABEL: define{{.*}} %struct.int32x2x2_t @test_vld2_lane_s32(ptr noundef %a, [2 x <2 x i32>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x2_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.int32x2x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x2x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
-// CHECK: [[VLD2_LANE:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2lane.v2i32.p0(<2 x i32> [[TMP8]], <2 x i32> [[TMP9]], i64 1, ptr %a)
-// CHECK: store { <2 x i32>, <2 x i32> } [[VLD2_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
-// CHECK: [[TMP13:%.*]] = load %struct.int32x2x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.int32x2x2_t [[TMP13]]
+// CHECK-LABEL: define dso_local %struct.int32x2x2_t @test_vld2_lane_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <2 x i32>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT32X2X2_T:%.*]], align 8
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT32X2X2_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT32X2X2_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT32X2X2_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <2 x i32>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <2 x i32>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[__S1]], ptr noundef nonnull align 8 dereferenceable(16) [[B]], i64 16, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[VLD2_LANE:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2lane.v2i32.p0(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]], i64 1, ptr [[A]])
+// CHECK-NEXT: [[VLD2_LANE_ELT:%.*]] = extractvalue { <2 x i32>, <2 x i32> } [[VLD2_LANE]], 0
+// CHECK-NEXT: store <2 x i32> [[VLD2_LANE_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD2_LANE_ELT4:%.*]] = extractvalue { <2 x i32>, <2 x i32> } [[VLD2_LANE]], 1
+// CHECK-NEXT: store <2 x i32> [[VLD2_LANE_ELT4]], ptr [[__RET_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(16) [[__RET]], i64 16, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x i32>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [2 x <2 x i32>] poison, <2 x i32> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <2 x i32>, ptr [[DOTUNPACK_ELT5]], align 8
+// CHECK-NEXT: [[DOTUNPACK7:%.*]] = insertvalue [2 x <2 x i32>] [[TMP2]], <2 x i32> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_INT32X2X2_T]] poison, [2 x <2 x i32>] [[DOTUNPACK7]], 0
+// CHECK-NEXT: ret [[STRUCT_INT32X2X2_T]] [[TMP3]]
+//
int32x2x2_t test_vld2_lane_s32(int32_t *a, int32x2x2_t b) {
return vld2_lane_s32(a, b, 1);
}
-// CHECK-LABEL: define{{.*}} %struct.int64x1x2_t @test_vld2_lane_s64(ptr noundef %a, [2 x <1 x i64>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x1x2_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.int64x1x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x1x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x1x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x1x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
-// CHECK: [[VLD2_LANE:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2lane.v1i64.p0(<1 x i64> [[TMP8]], <1 x i64> [[TMP9]], i64 0, ptr %a)
-// CHECK: store { <1 x i64>, <1 x i64> } [[VLD2_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
-// CHECK: [[TMP13:%.*]] = load %struct.int64x1x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.int64x1x2_t [[TMP13]]
+// CHECK-LABEL: define dso_local %struct.int64x1x2_t @test_vld2_lane_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT64X1X2_T:%.*]], align 8
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT64X1X2_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT64X1X2_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT64X1X2_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <1 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <1 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[__S1]], ptr noundef nonnull align 8 dereferenceable(16) [[B]], i64 16, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[VLD2_LANE:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2lane.v1i64.p0(<1 x i64> [[TMP0]], <1 x i64> [[TMP1]], i64 0, ptr [[A]])
+// CHECK-NEXT: [[VLD2_LANE_ELT:%.*]] = extractvalue { <1 x i64>, <1 x i64> } [[VLD2_LANE]], 0
+// CHECK-NEXT: store <1 x i64> [[VLD2_LANE_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD2_LANE_ELT4:%.*]] = extractvalue { <1 x i64>, <1 x i64> } [[VLD2_LANE]], 1
+// CHECK-NEXT: store <1 x i64> [[VLD2_LANE_ELT4]], ptr [[__RET_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(16) [[__RET]], i64 16, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <1 x i64>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [2 x <1 x i64>] poison, <1 x i64> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT5]], align 8
+// CHECK-NEXT: [[DOTUNPACK7:%.*]] = insertvalue [2 x <1 x i64>] [[TMP2]], <1 x i64> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_INT64X1X2_T]] poison, [2 x <1 x i64>] [[DOTUNPACK7]], 0
+// CHECK-NEXT: ret [[STRUCT_INT64X1X2_T]] [[TMP3]]
+//
int64x1x2_t test_vld2_lane_s64(int64_t *a, int64x1x2_t b) {
return vld2_lane_s64(a, b, 0);
}
-// CHECK-LABEL: define{{.*}} %struct.float16x4x2_t @test_vld2_lane_f16(ptr noundef %a, [2 x <4 x half>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x4x2_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.float16x4x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x4x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <4 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half>
-// CHECK: [[VLD2_LANE:%.*]] = call { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2lane.v4f16.p0(<4 x half> [[TMP8]], <4 x half> [[TMP9]], i64 3, ptr %a)
-// CHECK: store { <4 x half>, <4 x half> } [[VLD2_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
-// CHECK: [[TMP13:%.*]] = load %struct.float16x4x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.float16x4x2_t [[TMP13]]
+// CHECK-LABEL: define dso_local %struct.float16x4x2_t @test_vld2_lane_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <4 x half>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT16X4X2_T:%.*]], align 8
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT16X4X2_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT16X4X2_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X4X2_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <4 x half>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x half> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <4 x half>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x half> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[__S1]], ptr noundef nonnull align 8 dereferenceable(16) [[B]], i64 16, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x half>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[VLD2_LANE:%.*]] = call { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2lane.v4f16.p0(<4 x half> [[TMP0]], <4 x half> [[TMP1]], i64 3, ptr [[A]])
+// CHECK-NEXT: [[VLD2_LANE_ELT:%.*]] = extractvalue { <4 x half>, <4 x half> } [[VLD2_LANE]], 0
+// CHECK-NEXT: store <4 x half> [[VLD2_LANE_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD2_LANE_ELT4:%.*]] = extractvalue { <4 x half>, <4 x half> } [[VLD2_LANE]], 1
+// CHECK-NEXT: store <4 x half> [[VLD2_LANE_ELT4]], ptr [[__RET_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(16) [[__RET]], i64 16, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <4 x half>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [2 x <4 x half>] poison, <4 x half> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <4 x half>, ptr [[DOTUNPACK_ELT5]], align 8
+// CHECK-NEXT: [[DOTUNPACK7:%.*]] = insertvalue [2 x <4 x half>] [[TMP2]], <4 x half> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_FLOAT16X4X2_T]] poison, [2 x <4 x half>] [[DOTUNPACK7]], 0
+// CHECK-NEXT: ret [[STRUCT_FLOAT16X4X2_T]] [[TMP3]]
+//
float16x4x2_t test_vld2_lane_f16(float16_t *a, float16x4x2_t b) {
return vld2_lane_f16(a, b, 3);
}
-// CHECK-LABEL: define{{.*}} %struct.float32x2x2_t @test_vld2_lane_f32(ptr noundef %a, [2 x <2 x float>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x2_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.float32x2x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x2x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <2 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x float>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
-// CHECK: [[VLD2_LANE:%.*]] = call { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld2lane.v2f32.p0(<2 x float> [[TMP8]], <2 x float> [[TMP9]], i64 1, ptr %a)
-// CHECK: store { <2 x float>, <2 x float> } [[VLD2_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
-// CHECK: [[TMP13:%.*]] = load %struct.float32x2x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.float32x2x2_t [[TMP13]]
+// CHECK-LABEL: define dso_local %struct.float32x2x2_t @test_vld2_lane_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <2 x float>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT32X2X2_T:%.*]], align 8
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT32X2X2_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT32X2X2_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X2X2_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <2 x float>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x float> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <2 x float>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x float> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[__S1]], ptr noundef nonnull align 8 dereferenceable(16) [[B]], i64 16, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[VLD2_LANE:%.*]] = call { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld2lane.v2f32.p0(<2 x float> [[TMP0]], <2 x float> [[TMP1]], i64 1, ptr [[A]])
+// CHECK-NEXT: [[VLD2_LANE_ELT:%.*]] = extractvalue { <2 x float>, <2 x float> } [[VLD2_LANE]], 0
+// CHECK-NEXT: store <2 x float> [[VLD2_LANE_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD2_LANE_ELT4:%.*]] = extractvalue { <2 x float>, <2 x float> } [[VLD2_LANE]], 1
+// CHECK-NEXT: store <2 x float> [[VLD2_LANE_ELT4]], ptr [[__RET_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(16) [[__RET]], i64 16, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x float>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [2 x <2 x float>] poison, <2 x float> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <2 x float>, ptr [[DOTUNPACK_ELT5]], align 8
+// CHECK-NEXT: [[DOTUNPACK7:%.*]] = insertvalue [2 x <2 x float>] [[TMP2]], <2 x float> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_FLOAT32X2X2_T]] poison, [2 x <2 x float>] [[DOTUNPACK7]], 0
+// CHECK-NEXT: ret [[STRUCT_FLOAT32X2X2_T]] [[TMP3]]
+//
float32x2x2_t test_vld2_lane_f32(float32_t *a, float32x2x2_t b) {
return vld2_lane_f32(a, b, 1);
}
-// CHECK-LABEL: define{{.*}} %struct.float64x1x2_t @test_vld2_lane_f64(ptr noundef %a, [2 x <1 x double>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x1x2_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.float64x1x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.float64x1x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float64x1x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x1x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <1 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x1x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x double>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <1 x double>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <1 x double> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x1x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x double>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <1 x double>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <1 x double> [[TMP6]] to <8 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x double>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x double>
-// CHECK: [[VLD2_LANE:%.*]] = call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld2lane.v1f64.p0(<1 x double> [[TMP8]], <1 x double> [[TMP9]], i64 0, ptr %a)
-// CHECK: store { <1 x double>, <1 x double> } [[VLD2_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
-// CHECK: [[TMP13:%.*]] = load %struct.float64x1x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.float64x1x2_t [[TMP13]]
+// CHECK-LABEL: define dso_local %struct.float64x1x2_t @test_vld2_lane_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <1 x double>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT64X1X2_T:%.*]], align 8
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT64X1X2_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT64X1X2_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT64X1X2_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <1 x double>] [[B_COERCE]], 0
+// CHECK-NEXT: store <1 x double> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <1 x double>] [[B_COERCE]], 1
+// CHECK-NEXT: store <1 x double> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[__S1]], ptr noundef nonnull align 8 dereferenceable(16) [[B]], i64 16, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x double>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <1 x double>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[VLD2_LANE:%.*]] = call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld2lane.v1f64.p0(<1 x double> [[TMP0]], <1 x double> [[TMP1]], i64 0, ptr [[A]])
+// CHECK-NEXT: [[VLD2_LANE_ELT:%.*]] = extractvalue { <1 x double>, <1 x double> } [[VLD2_LANE]], 0
+// CHECK-NEXT: store <1 x double> [[VLD2_LANE_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD2_LANE_ELT4:%.*]] = extractvalue { <1 x double>, <1 x double> } [[VLD2_LANE]], 1
+// CHECK-NEXT: store <1 x double> [[VLD2_LANE_ELT4]], ptr [[__RET_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(16) [[__RET]], i64 16, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <1 x double>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [2 x <1 x double>] poison, <1 x double> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <1 x double>, ptr [[DOTUNPACK_ELT5]], align 8
+// CHECK-NEXT: [[DOTUNPACK7:%.*]] = insertvalue [2 x <1 x double>] [[TMP2]], <1 x double> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_FLOAT64X1X2_T]] poison, [2 x <1 x double>] [[DOTUNPACK7]], 0
+// CHECK-NEXT: ret [[STRUCT_FLOAT64X1X2_T]] [[TMP3]]
+//
float64x1x2_t test_vld2_lane_f64(float64_t *a, float64x1x2_t b) {
return vld2_lane_f64(a, b, 0);
}
-// CHECK-LABEL: define{{.*}} %struct.poly8x8x2_t @test_vld2_lane_p8(ptr noundef %a, [2 x <8 x i8>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[VLD2_LANE:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2lane.v8i8.p0(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, ptr %a)
-// CHECK: store { <8 x i8>, <8 x i8> } [[VLD2_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
-// CHECK: [[TMP8:%.*]] = load %struct.poly8x8x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.poly8x8x2_t [[TMP8]]
+// CHECK-LABEL: define dso_local %struct.poly8x8x2_t @test_vld2_lane_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY8X8X2_T:%.*]], align 8
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY8X8X2_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY8X8X2_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY8X8X2_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <8 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <8 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[__S1]], ptr noundef nonnull align 8 dereferenceable(16) [[B]], i64 16, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[VLD2_LANE:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2lane.v8i8.p0(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], i64 7, ptr [[A]])
+// CHECK-NEXT: [[VLD2_LANE_ELT:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLD2_LANE]], 0
+// CHECK-NEXT: store <8 x i8> [[VLD2_LANE_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD2_LANE_ELT4:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLD2_LANE]], 1
+// CHECK-NEXT: store <8 x i8> [[VLD2_LANE_ELT4]], ptr [[__RET_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(16) [[__RET]], i64 16, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <8 x i8>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [2 x <8 x i8>] poison, <8 x i8> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <8 x i8>, ptr [[DOTUNPACK_ELT5]], align 8
+// CHECK-NEXT: [[DOTUNPACK7:%.*]] = insertvalue [2 x <8 x i8>] [[TMP2]], <8 x i8> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_POLY8X8X2_T]] poison, [2 x <8 x i8>] [[DOTUNPACK7]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY8X8X2_T]] [[TMP3]]
+//
poly8x8x2_t test_vld2_lane_p8(poly8_t *a, poly8x8x2_t b) {
return vld2_lane_p8(a, b, 7);
}
-// CHECK-LABEL: define{{.*}} %struct.poly16x4x2_t @test_vld2_lane_p16(ptr noundef %a, [2 x <4 x i16>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x2_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.poly16x4x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x4x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
-// CHECK: [[VLD2_LANE:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2lane.v4i16.p0(<4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i64 3, ptr %a)
-// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
-// CHECK: [[TMP13:%.*]] = load %struct.poly16x4x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.poly16x4x2_t [[TMP13]]
+// CHECK-LABEL: define dso_local %struct.poly16x4x2_t @test_vld2_lane_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY16X4X2_T:%.*]], align 8
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY16X4X2_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY16X4X2_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY16X4X2_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <4 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <4 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[__S1]], ptr noundef nonnull align 8 dereferenceable(16) [[B]], i64 16, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[VLD2_LANE:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2lane.v4i16.p0(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], i64 3, ptr [[A]])
+// CHECK-NEXT: [[VLD2_LANE_ELT:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[VLD2_LANE]], 0
+// CHECK-NEXT: store <4 x i16> [[VLD2_LANE_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD2_LANE_ELT4:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[VLD2_LANE]], 1
+// CHECK-NEXT: store <4 x i16> [[VLD2_LANE_ELT4]], ptr [[__RET_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(16) [[__RET]], i64 16, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <4 x i16>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [2 x <4 x i16>] poison, <4 x i16> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <4 x i16>, ptr [[DOTUNPACK_ELT5]], align 8
+// CHECK-NEXT: [[DOTUNPACK7:%.*]] = insertvalue [2 x <4 x i16>] [[TMP2]], <4 x i16> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_POLY16X4X2_T]] poison, [2 x <4 x i16>] [[DOTUNPACK7]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY16X4X2_T]] [[TMP3]]
+//
poly16x4x2_t test_vld2_lane_p16(poly16_t *a, poly16x4x2_t b) {
return vld2_lane_p16(a, b, 3);
}
-// CHECK-LABEL: define{{.*}} %struct.poly64x1x2_t @test_vld2_lane_p64(ptr noundef %a, [2 x <1 x i64>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x1x2_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.poly64x1x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.poly64x1x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly64x1x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x1x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly64x1x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x1x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
-// CHECK: [[VLD2_LANE:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2lane.v1i64.p0(<1 x i64> [[TMP8]], <1 x i64> [[TMP9]], i64 0, ptr %a)
-// CHECK: store { <1 x i64>, <1 x i64> } [[VLD2_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
-// CHECK: [[TMP13:%.*]] = load %struct.poly64x1x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.poly64x1x2_t [[TMP13]]
+// CHECK-LABEL: define dso_local %struct.poly64x1x2_t @test_vld2_lane_p64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY64X1X2_T:%.*]], align 8
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY64X1X2_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY64X1X2_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY64X1X2_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <1 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <1 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[__S1]], ptr noundef nonnull align 8 dereferenceable(16) [[B]], i64 16, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[VLD2_LANE:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2lane.v1i64.p0(<1 x i64> [[TMP0]], <1 x i64> [[TMP1]], i64 0, ptr [[A]])
+// CHECK-NEXT: [[VLD2_LANE_ELT:%.*]] = extractvalue { <1 x i64>, <1 x i64> } [[VLD2_LANE]], 0
+// CHECK-NEXT: store <1 x i64> [[VLD2_LANE_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD2_LANE_ELT4:%.*]] = extractvalue { <1 x i64>, <1 x i64> } [[VLD2_LANE]], 1
+// CHECK-NEXT: store <1 x i64> [[VLD2_LANE_ELT4]], ptr [[__RET_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(16) [[__RET]], i64 16, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <1 x i64>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [2 x <1 x i64>] poison, <1 x i64> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT5]], align 8
+// CHECK-NEXT: [[DOTUNPACK7:%.*]] = insertvalue [2 x <1 x i64>] [[TMP2]], <1 x i64> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_POLY64X1X2_T]] poison, [2 x <1 x i64>] [[DOTUNPACK7]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY64X1X2_T]] [[TMP3]]
+//
poly64x1x2_t test_vld2_lane_p64(poly64_t *a, poly64x1x2_t b) {
return vld2_lane_p64(a, b, 0);
}
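For context while comparing the old and regenerated assertions: vld2_lane_p64 loads two consecutive poly64_t elements from `a` and inserts one into lane 0 of each of the two vectors in `b`; the new checks simply follow the coerced [2 x <1 x i64>] argument through per-element stores rather than matching one aggregate store. A rough scalar sketch of the semantics, not part of the test file, assuming clang's vector-subscript extension:

// Sketch only; mirrors what the @llvm.aarch64.neon.ld2lane.v1i64.p0 call
// above computes for lane 0 of a two-vector poly64 pair.
static inline poly64x1x2_t sketch_vld2_lane_p64(const poly64_t *a,
                                                poly64x1x2_t b) {
  b.val[0][0] = a[0]; // de-interleaved element for the first vector
  b.val[1][0] = a[1]; // de-interleaved element for the second vector
  return b;
}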
-// CHECK-LABEL: define{{.*}} %struct.uint16x8x3_t @test_vld3q_lane_u16(ptr noundef %a, [3 x <8 x i16>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x3_t, align 16
-// CHECK: [[B:%.*]] = alloca %struct.uint16x8x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
-// CHECK: [[VLD3_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3lane.v8i16.p0(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i64 7, ptr %a)
-// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
-// CHECK: [[TMP16:%.*]] = load %struct.uint16x8x3_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.uint16x8x3_t [[TMP16]]
+// CHECK-LABEL: define dso_local %struct.uint16x8x3_t @test_vld3q_lane_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT16X8X3_T:%.*]], align 16
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT16X8X3_T]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT16X8X3_T]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT16X8X3_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <8 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <8 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <8 x i16>] [[B_COERCE]], 2
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[__S1]], ptr noundef nonnull align 16 dereferenceable(48) [[B]], i64 48, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[VLD3_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3lane.v8i16.p0(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], i64 7, ptr [[A]])
+// CHECK-NEXT: [[VLD3_LANE_ELT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], 0
+// CHECK-NEXT: store <8 x i16> [[VLD3_LANE_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_LANE_ELT6:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], 1
+// CHECK-NEXT: store <8 x i16> [[VLD3_LANE_ELT6]], ptr [[__RET_REPACK5]], align 16
+// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD3_LANE_ELT8:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], 2
+// CHECK-NEXT: store <8 x i16> [[VLD3_LANE_ELT8]], ptr [[__RET_REPACK7]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(48) [[__RET]], i64 48, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <8 x i16>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [3 x <8 x i16>] poison, <8 x i16> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <8 x i16>, ptr [[DOTUNPACK_ELT9]], align 16
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [3 x <8 x i16>] [[TMP3]], <8 x i16> [[DOTUNPACK_UNPACK10]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <8 x i16>, ptr [[DOTUNPACK_ELT11]], align 16
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [3 x <8 x i16>] [[TMP4]], <8 x i16> [[DOTUNPACK_UNPACK12]], 2
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [[STRUCT_UINT16X8X3_T]] poison, [3 x <8 x i16>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT16X8X3_T]] [[TMP5]]
+//
uint16x8x3_t test_vld3q_lane_u16(uint16_t *a, uint16x8x3_t b) {
return vld3q_lane_u16(a, b, 7);
}
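The vld3q_lane tests that follow all share this shape: three consecutive elements are loaded from `a` into lane `n` of the three q-register vectors of `b`, and every other lane passes through unchanged. A rough scalar sketch, again not part of the test file, assuming clang's vector-subscript extension (the real intrinsic requires the lane to be a compile-time constant; the test uses 7, the last lane of an 8 x u16 vector):

// Sketch only; approximates the @llvm.aarch64.neon.ld3lane.v8i16.p0 call.
static inline uint16x8x3_t sketch_vld3q_lane_u16(const uint16_t *a,
                                                 uint16x8x3_t b, int lane) {
  b.val[0][lane] = a[0]; // first loaded element
  b.val[1][lane] = a[1]; // second loaded element
  b.val[2][lane] = a[2]; // third loaded element
  return b;
}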
-// CHECK-LABEL: define{{.*}} %struct.uint32x4x3_t @test_vld3q_lane_u32(ptr noundef %a, [3 x <4 x i32>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x3_t, align 16
-// CHECK: [[B:%.*]] = alloca %struct.uint32x4x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
-// CHECK: [[VLD3_LANE:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3lane.v4i32.p0(<4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], i64 3, ptr %a)
-// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
-// CHECK: [[TMP16:%.*]] = load %struct.uint32x4x3_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.uint32x4x3_t [[TMP16]]
+// CHECK-LABEL: define dso_local %struct.uint32x4x3_t @test_vld3q_lane_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <4 x i32>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT32X4X3_T:%.*]], align 16
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT32X4X3_T]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT32X4X3_T]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT32X4X3_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <4 x i32>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <4 x i32>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <4 x i32>] [[B_COERCE]], 2
+// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[__S1]], ptr noundef nonnull align 16 dereferenceable(48) [[B]], i64 48, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[VLD3_LANE:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3lane.v4i32.p0(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], i64 3, ptr [[A]])
+// CHECK-NEXT: [[VLD3_LANE_ELT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3_LANE]], 0
+// CHECK-NEXT: store <4 x i32> [[VLD3_LANE_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_LANE_ELT6:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3_LANE]], 1
+// CHECK-NEXT: store <4 x i32> [[VLD3_LANE_ELT6]], ptr [[__RET_REPACK5]], align 16
+// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD3_LANE_ELT8:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3_LANE]], 2
+// CHECK-NEXT: store <4 x i32> [[VLD3_LANE_ELT8]], ptr [[__RET_REPACK7]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(48) [[__RET]], i64 48, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <4 x i32>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [3 x <4 x i32>] poison, <4 x i32> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <4 x i32>, ptr [[DOTUNPACK_ELT9]], align 16
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [3 x <4 x i32>] [[TMP3]], <4 x i32> [[DOTUNPACK_UNPACK10]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <4 x i32>, ptr [[DOTUNPACK_ELT11]], align 16
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [3 x <4 x i32>] [[TMP4]], <4 x i32> [[DOTUNPACK_UNPACK12]], 2
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [[STRUCT_UINT32X4X3_T]] poison, [3 x <4 x i32>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT32X4X3_T]] [[TMP5]]
+//
uint32x4x3_t test_vld3q_lane_u32(uint32_t *a, uint32x4x3_t b) {
return vld3q_lane_u32(a, b, 3);
}
-// CHECK-LABEL: define{{.*}} %struct.uint64x2x3_t @test_vld3q_lane_u64(ptr noundef %a, [3 x <2 x i64>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x2x3_t, align 16
-// CHECK: [[B:%.*]] = alloca %struct.uint64x2x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.uint64x2x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x2x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <2 x i64> [[TMP8]] to <16 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x i64>
-// CHECK: [[VLD3_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3lane.v2i64.p0(<2 x i64> [[TMP10]], <2 x i64> [[TMP11]], <2 x i64> [[TMP12]], i64 1, ptr %a)
-// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
-// CHECK: [[TMP16:%.*]] = load %struct.uint64x2x3_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.uint64x2x3_t [[TMP16]]
+// CHECK-LABEL: define dso_local %struct.uint64x2x3_t @test_vld3q_lane_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT64X2X3_T:%.*]], align 16
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT64X2X3_T]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT64X2X3_T]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT64X2X3_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <2 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <2 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <2 x i64>] [[B_COERCE]], 2
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[__S1]], ptr noundef nonnull align 16 dereferenceable(48) [[B]], i64 48, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[VLD3_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3lane.v2i64.p0(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], i64 1, ptr [[A]])
+// CHECK-NEXT: [[VLD3_LANE_ELT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3_LANE]], 0
+// CHECK-NEXT: store <2 x i64> [[VLD3_LANE_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_LANE_ELT6:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3_LANE]], 1
+// CHECK-NEXT: store <2 x i64> [[VLD3_LANE_ELT6]], ptr [[__RET_REPACK5]], align 16
+// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD3_LANE_ELT8:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3_LANE]], 2
+// CHECK-NEXT: store <2 x i64> [[VLD3_LANE_ELT8]], ptr [[__RET_REPACK7]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(48) [[__RET]], i64 48, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [3 x <2 x i64>] poison, <2 x i64> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT9]], align 16
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [3 x <2 x i64>] [[TMP3]], <2 x i64> [[DOTUNPACK_UNPACK10]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT11]], align 16
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [3 x <2 x i64>] [[TMP4]], <2 x i64> [[DOTUNPACK_UNPACK12]], 2
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [[STRUCT_UINT64X2X3_T]] poison, [3 x <2 x i64>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT64X2X3_T]] [[TMP5]]
+//
uint64x2x3_t test_vld3q_lane_u64(uint64_t *a, uint64x2x3_t b) {
return vld3q_lane_u64(a, b, 1);
}
-// CHECK-LABEL: define{{.*}} %struct.int16x8x3_t @test_vld3q_lane_s16(ptr noundef %a, [3 x <8 x i16>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x3_t, align 16
-// CHECK: [[B:%.*]] = alloca %struct.int16x8x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
-// CHECK: [[VLD3_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3lane.v8i16.p0(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i64 7, ptr %a)
-// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
-// CHECK: [[TMP16:%.*]] = load %struct.int16x8x3_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.int16x8x3_t [[TMP16]]
+// CHECK-LABEL: define dso_local %struct.int16x8x3_t @test_vld3q_lane_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT16X8X3_T:%.*]], align 16
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT16X8X3_T]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT16X8X3_T]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT16X8X3_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <8 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <8 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <8 x i16>] [[B_COERCE]], 2
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[__S1]], ptr noundef nonnull align 16 dereferenceable(48) [[B]], i64 48, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[VLD3_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3lane.v8i16.p0(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], i64 7, ptr [[A]])
+// CHECK-NEXT: [[VLD3_LANE_ELT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], 0
+// CHECK-NEXT: store <8 x i16> [[VLD3_LANE_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_LANE_ELT6:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], 1
+// CHECK-NEXT: store <8 x i16> [[VLD3_LANE_ELT6]], ptr [[__RET_REPACK5]], align 16
+// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD3_LANE_ELT8:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], 2
+// CHECK-NEXT: store <8 x i16> [[VLD3_LANE_ELT8]], ptr [[__RET_REPACK7]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(48) [[__RET]], i64 48, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <8 x i16>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [3 x <8 x i16>] poison, <8 x i16> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <8 x i16>, ptr [[DOTUNPACK_ELT9]], align 16
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [3 x <8 x i16>] [[TMP3]], <8 x i16> [[DOTUNPACK_UNPACK10]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <8 x i16>, ptr [[DOTUNPACK_ELT11]], align 16
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [3 x <8 x i16>] [[TMP4]], <8 x i16> [[DOTUNPACK_UNPACK12]], 2
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [[STRUCT_INT16X8X3_T]] poison, [3 x <8 x i16>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_INT16X8X3_T]] [[TMP5]]
+//
int16x8x3_t test_vld3q_lane_s16(int16_t *a, int16x8x3_t b) {
return vld3q_lane_s16(a, b, 7);
}
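Note that the signed-integer tests produce IR structurally identical to the unsigned ones: signedness exists only at the C type level, and both variants lower to the same type-agnostic @llvm.aarch64.neon.ld3lane.* intrinsic. A small sketch, not from the test file, showing the equivalence via reinterpret casts:

// Sketch only: routing an s16 lane-load through the u16 intrinsic yields
// the same bits, since the underlying IR intrinsic call is identical.
static inline int16x8x3_t s16_via_u16(const int16_t *a, int16x8x3_t b) {
  uint16x8x3_t ub = {{ vreinterpretq_u16_s16(b.val[0]),
                       vreinterpretq_u16_s16(b.val[1]),
                       vreinterpretq_u16_s16(b.val[2]) }};
  uint16x8x3_t r = vld3q_lane_u16((const uint16_t *)a, ub, 7);
  int16x8x3_t out = {{ vreinterpretq_s16_u16(r.val[0]),
                       vreinterpretq_s16_u16(r.val[1]),
                       vreinterpretq_s16_u16(r.val[2]) }};
  return out;
}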
-// CHECK-LABEL: define{{.*}} %struct.int32x4x3_t @test_vld3q_lane_s32(ptr noundef %a, [3 x <4 x i32>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x3_t, align 16
-// CHECK: [[B:%.*]] = alloca %struct.int32x4x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
-// CHECK: [[VLD3_LANE:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3lane.v4i32.p0(<4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], i64 3, ptr %a)
-// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
-// CHECK: [[TMP16:%.*]] = load %struct.int32x4x3_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.int32x4x3_t [[TMP16]]
+// CHECK-LABEL: define dso_local %struct.int32x4x3_t @test_vld3q_lane_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <4 x i32>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT32X4X3_T:%.*]], align 16
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT32X4X3_T]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT32X4X3_T]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT32X4X3_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <4 x i32>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <4 x i32>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <4 x i32>] [[B_COERCE]], 2
+// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[__S1]], ptr noundef nonnull align 16 dereferenceable(48) [[B]], i64 48, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[VLD3_LANE:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3lane.v4i32.p0(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], i64 3, ptr [[A]])
+// CHECK-NEXT: [[VLD3_LANE_ELT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3_LANE]], 0
+// CHECK-NEXT: store <4 x i32> [[VLD3_LANE_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_LANE_ELT6:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3_LANE]], 1
+// CHECK-NEXT: store <4 x i32> [[VLD3_LANE_ELT6]], ptr [[__RET_REPACK5]], align 16
+// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD3_LANE_ELT8:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3_LANE]], 2
+// CHECK-NEXT: store <4 x i32> [[VLD3_LANE_ELT8]], ptr [[__RET_REPACK7]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(48) [[__RET]], i64 48, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <4 x i32>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [3 x <4 x i32>] poison, <4 x i32> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <4 x i32>, ptr [[DOTUNPACK_ELT9]], align 16
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [3 x <4 x i32>] [[TMP3]], <4 x i32> [[DOTUNPACK_UNPACK10]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <4 x i32>, ptr [[DOTUNPACK_ELT11]], align 16
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [3 x <4 x i32>] [[TMP4]], <4 x i32> [[DOTUNPACK_UNPACK12]], 2
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [[STRUCT_INT32X4X3_T]] poison, [3 x <4 x i32>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_INT32X4X3_T]] [[TMP5]]
+//
int32x4x3_t test_vld3q_lane_s32(int32_t *a, int32x4x3_t b) {
return vld3q_lane_s32(a, b, 3);
}
-// CHECK-LABEL: define{{.*}} %struct.int64x2x3_t @test_vld3q_lane_s64(ptr noundef %a, [3 x <2 x i64>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x2x3_t, align 16
-// CHECK: [[B:%.*]] = alloca %struct.int64x2x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int64x2x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int64x2x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x2x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <2 x i64> [[TMP8]] to <16 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x i64>
-// CHECK: [[VLD3_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3lane.v2i64.p0(<2 x i64> [[TMP10]], <2 x i64> [[TMP11]], <2 x i64> [[TMP12]], i64 1, ptr %a)
-// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
-// CHECK: [[TMP16:%.*]] = load %struct.int64x2x3_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.int64x2x3_t [[TMP16]]
+// CHECK-LABEL: define dso_local %struct.int64x2x3_t @test_vld3q_lane_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT64X2X3_T:%.*]], align 16
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT64X2X3_T]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT64X2X3_T]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT64X2X3_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <2 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <2 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <2 x i64>] [[B_COERCE]], 2
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[__S1]], ptr noundef nonnull align 16 dereferenceable(48) [[B]], i64 48, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[VLD3_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3lane.v2i64.p0(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], i64 1, ptr [[A]])
+// CHECK-NEXT: [[VLD3_LANE_ELT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3_LANE]], 0
+// CHECK-NEXT: store <2 x i64> [[VLD3_LANE_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_LANE_ELT6:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3_LANE]], 1
+// CHECK-NEXT: store <2 x i64> [[VLD3_LANE_ELT6]], ptr [[__RET_REPACK5]], align 16
+// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD3_LANE_ELT8:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3_LANE]], 2
+// CHECK-NEXT: store <2 x i64> [[VLD3_LANE_ELT8]], ptr [[__RET_REPACK7]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(48) [[__RET]], i64 48, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [3 x <2 x i64>] poison, <2 x i64> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT9]], align 16
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [3 x <2 x i64>] [[TMP3]], <2 x i64> [[DOTUNPACK_UNPACK10]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT11]], align 16
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [3 x <2 x i64>] [[TMP4]], <2 x i64> [[DOTUNPACK_UNPACK12]], 2
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [[STRUCT_INT64X2X3_T]] poison, [3 x <2 x i64>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_INT64X2X3_T]] [[TMP5]]
+//
int64x2x3_t test_vld3q_lane_s64(int64_t *a, int64x2x3_t b) {
return vld3q_lane_s64(a, b, 1);
}
-// CHECK-LABEL: define{{.*}} %struct.float16x8x3_t @test_vld3q_lane_f16(ptr noundef %a, [3 x <8 x half>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x8x3_t, align 16
-// CHECK: [[B:%.*]] = alloca %struct.float16x8x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <8 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <8 x half>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <16 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x half>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x half>
-// CHECK: [[VLD3_LANE:%.*]] = call { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3lane.v8f16.p0(<8 x half> [[TMP10]], <8 x half> [[TMP11]], <8 x half> [[TMP12]], i64 7, ptr %a)
-// CHECK: store { <8 x half>, <8 x half>, <8 x half> } [[VLD3_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
-// CHECK: [[TMP16:%.*]] = load %struct.float16x8x3_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.float16x8x3_t [[TMP16]]
+// CHECK-LABEL: define dso_local %struct.float16x8x3_t @test_vld3q_lane_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <8 x half>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT16X8X3_T:%.*]], align 16
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT16X8X3_T]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT16X8X3_T]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X8X3_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <8 x half>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x half> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <8 x half>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x half> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <8 x half>] [[B_COERCE]], 2
+// CHECK-NEXT: store <8 x half> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[__S1]], ptr noundef nonnull align 16 dereferenceable(48) [[B]], i64 48, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x half>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x half>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[VLD3_LANE:%.*]] = call { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3lane.v8f16.p0(<8 x half> [[TMP0]], <8 x half> [[TMP1]], <8 x half> [[TMP2]], i64 7, ptr [[A]])
+// CHECK-NEXT: [[VLD3_LANE_ELT:%.*]] = extractvalue { <8 x half>, <8 x half>, <8 x half> } [[VLD3_LANE]], 0
+// CHECK-NEXT: store <8 x half> [[VLD3_LANE_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_LANE_ELT6:%.*]] = extractvalue { <8 x half>, <8 x half>, <8 x half> } [[VLD3_LANE]], 1
+// CHECK-NEXT: store <8 x half> [[VLD3_LANE_ELT6]], ptr [[__RET_REPACK5]], align 16
+// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD3_LANE_ELT8:%.*]] = extractvalue { <8 x half>, <8 x half>, <8 x half> } [[VLD3_LANE]], 2
+// CHECK-NEXT: store <8 x half> [[VLD3_LANE_ELT8]], ptr [[__RET_REPACK7]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(48) [[__RET]], i64 48, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <8 x half>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [3 x <8 x half>] poison, <8 x half> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <8 x half>, ptr [[DOTUNPACK_ELT9]], align 16
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [3 x <8 x half>] [[TMP3]], <8 x half> [[DOTUNPACK_UNPACK10]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <8 x half>, ptr [[DOTUNPACK_ELT11]], align 16
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [3 x <8 x half>] [[TMP4]], <8 x half> [[DOTUNPACK_UNPACK12]], 2
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [[STRUCT_FLOAT16X8X3_T]] poison, [3 x <8 x half>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_FLOAT16X8X3_T]] [[TMP5]]
+//
float16x8x3_t test_vld3q_lane_f16(float16_t *a, float16x8x3_t b) {
return vld3q_lane_f16(a, b, 7);
}
-// CHECK-LABEL: define{{.*}} %struct.float32x4x3_t @test_vld3q_lane_f32(ptr noundef %a, [3 x <4 x float>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x3_t, align 16
-// CHECK: [[B:%.*]] = alloca %struct.float32x4x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <4 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <4 x float>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <4 x float> [[TMP8]] to <16 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x float>
-// CHECK: [[VLD3_LANE:%.*]] = call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3lane.v4f32.p0(<4 x float> [[TMP10]], <4 x float> [[TMP11]], <4 x float> [[TMP12]], i64 3, ptr %a)
-// CHECK: store { <4 x float>, <4 x float>, <4 x float> } [[VLD3_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
-// CHECK: [[TMP16:%.*]] = load %struct.float32x4x3_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.float32x4x3_t [[TMP16]]
+// CHECK-LABEL: define dso_local %struct.float32x4x3_t @test_vld3q_lane_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <4 x float>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT32X4X3_T:%.*]], align 16
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT32X4X3_T]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT32X4X3_T]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X4X3_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <4 x float>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x float> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <4 x float>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x float> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <4 x float>] [[B_COERCE]], 2
+// CHECK-NEXT: store <4 x float> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[__S1]], ptr noundef nonnull align 16 dereferenceable(48) [[B]], i64 48, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[VLD3_LANE:%.*]] = call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3lane.v4f32.p0(<4 x float> [[TMP0]], <4 x float> [[TMP1]], <4 x float> [[TMP2]], i64 3, ptr [[A]])
+// CHECK-NEXT: [[VLD3_LANE_ELT:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float> } [[VLD3_LANE]], 0
+// CHECK-NEXT: store <4 x float> [[VLD3_LANE_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_LANE_ELT6:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float> } [[VLD3_LANE]], 1
+// CHECK-NEXT: store <4 x float> [[VLD3_LANE_ELT6]], ptr [[__RET_REPACK5]], align 16
+// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD3_LANE_ELT8:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float> } [[VLD3_LANE]], 2
+// CHECK-NEXT: store <4 x float> [[VLD3_LANE_ELT8]], ptr [[__RET_REPACK7]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(48) [[__RET]], i64 48, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <4 x float>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [3 x <4 x float>] poison, <4 x float> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <4 x float>, ptr [[DOTUNPACK_ELT9]], align 16
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [3 x <4 x float>] [[TMP3]], <4 x float> [[DOTUNPACK_UNPACK10]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <4 x float>, ptr [[DOTUNPACK_ELT11]], align 16
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [3 x <4 x float>] [[TMP4]], <4 x float> [[DOTUNPACK_UNPACK12]], 2
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [[STRUCT_FLOAT32X4X3_T]] poison, [3 x <4 x float>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_FLOAT32X4X3_T]] [[TMP5]]
+//
float32x4x3_t test_vld3q_lane_f32(float32_t *a, float32x4x3_t b) {
return vld3q_lane_f32(a, b, 3);
}
-// CHECK-LABEL: define{{.*}} %struct.float64x2x3_t @test_vld3q_lane_f64(ptr noundef %a, [3 x <2 x double>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x2x3_t, align 16
-// CHECK: [[B:%.*]] = alloca %struct.float64x2x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.float64x2x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float64x2x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x2x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <2 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x double>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <2 x double>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <2 x double> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x double>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <2 x double> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x double>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <2 x double>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <2 x double> [[TMP8]] to <16 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x double>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x double>
-// CHECK: [[VLD3_LANE:%.*]] = call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld3lane.v2f64.p0(<2 x double> [[TMP10]], <2 x double> [[TMP11]], <2 x double> [[TMP12]], i64 1, ptr %a)
-// CHECK: store { <2 x double>, <2 x double>, <2 x double> } [[VLD3_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
-// CHECK: [[TMP16:%.*]] = load %struct.float64x2x3_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.float64x2x3_t [[TMP16]]
+// CHECK-LABEL: define dso_local %struct.float64x2x3_t @test_vld3q_lane_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <2 x double>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT64X2X3_T:%.*]], align 16
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT64X2X3_T]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT64X2X3_T]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT64X2X3_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <2 x double>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x double> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <2 x double>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x double> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <2 x double>] [[B_COERCE]], 2
+// CHECK-NEXT: store <2 x double> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[__S1]], ptr noundef nonnull align 16 dereferenceable(48) [[B]], i64 48, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[VLD3_LANE:%.*]] = call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld3lane.v2f64.p0(<2 x double> [[TMP0]], <2 x double> [[TMP1]], <2 x double> [[TMP2]], i64 1, ptr [[A]])
+// CHECK-NEXT: [[VLD3_LANE_ELT:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double> } [[VLD3_LANE]], 0
+// CHECK-NEXT: store <2 x double> [[VLD3_LANE_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_LANE_ELT6:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double> } [[VLD3_LANE]], 1
+// CHECK-NEXT: store <2 x double> [[VLD3_LANE_ELT6]], ptr [[__RET_REPACK5]], align 16
+// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD3_LANE_ELT8:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double> } [[VLD3_LANE]], 2
+// CHECK-NEXT: store <2 x double> [[VLD3_LANE_ELT8]], ptr [[__RET_REPACK7]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(48) [[__RET]], i64 48, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x double>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [3 x <2 x double>] poison, <2 x double> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <2 x double>, ptr [[DOTUNPACK_ELT9]], align 16
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [3 x <2 x double>] [[TMP3]], <2 x double> [[DOTUNPACK_UNPACK10]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <2 x double>, ptr [[DOTUNPACK_ELT11]], align 16
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [3 x <2 x double>] [[TMP4]], <2 x double> [[DOTUNPACK_UNPACK12]], 2
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [[STRUCT_FLOAT64X2X3_T]] poison, [3 x <2 x double>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_FLOAT64X2X3_T]] [[TMP5]]
+//
float64x2x3_t test_vld3q_lane_f64(float64_t *a, float64x2x3_t b) {
return vld3q_lane_f64(a, b, 1);
}
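(A minimal usage sketch of the intrinsic exercised above, assuming an
AArch64 target with <arm_neon.h>; the helper name is hypothetical and
not part of the patch. vld3q_lane_f64 reads three consecutive float64_t
values from memory into the given lane of the three result vectors and
passes every other lane through from the accumulator argument.)

    #include <arm_neon.h>

    // Hypothetical helper: refresh lane 1 of a de-interleaved triple.
    // The lane index must be an integer constant in [0, 1] for these
    // 2-element vectors; clang rejects out-of-range values.
    float64x2x3_t refresh_lane1(const float64_t *ptr, float64x2x3_t acc) {
      return vld3q_lane_f64(ptr, acc, 1);
    }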
-// CHECK-LABEL: define{{.*}} %struct.poly8x16x3_t @test_vld3q_lane_p8(ptr noundef %a, [3 x <16 x i8>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x3_t, align 16
-// CHECK: [[B:%.*]] = alloca %struct.poly8x16x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x16x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP5:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[VLD3_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3lane.v16i8.p0(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i64 15, ptr %a)
-// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
-// CHECK: [[TMP9:%.*]] = load %struct.poly8x16x3_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.poly8x16x3_t [[TMP9]]
+// CHECK-LABEL: define dso_local %struct.poly8x16x3_t @test_vld3q_lane_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY8X16X3_T:%.*]], align 16
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY8X16X3_T]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY8X16X3_T]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY8X16X3_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <16 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <16 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <16 x i8>] [[B_COERCE]], 2
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[__S1]], ptr noundef nonnull align 16 dereferenceable(48) [[B]], i64 48, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[VLD3_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3lane.v16i8.p0(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], i64 15, ptr [[A]])
+// CHECK-NEXT: [[VLD3_LANE_ELT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3_LANE]], 0
+// CHECK-NEXT: store <16 x i8> [[VLD3_LANE_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_LANE_ELT6:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3_LANE]], 1
+// CHECK-NEXT: store <16 x i8> [[VLD3_LANE_ELT6]], ptr [[__RET_REPACK5]], align 16
+// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD3_LANE_ELT8:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3_LANE]], 2
+// CHECK-NEXT: store <16 x i8> [[VLD3_LANE_ELT8]], ptr [[__RET_REPACK7]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(48) [[__RET]], i64 48, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <16 x i8>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [3 x <16 x i8>] poison, <16 x i8> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <16 x i8>, ptr [[DOTUNPACK_ELT9]], align 16
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [3 x <16 x i8>] [[TMP3]], <16 x i8> [[DOTUNPACK_UNPACK10]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <16 x i8>, ptr [[DOTUNPACK_ELT11]], align 16
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [3 x <16 x i8>] [[TMP4]], <16 x i8> [[DOTUNPACK_UNPACK12]], 2
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [[STRUCT_POLY8X16X3_T]] poison, [3 x <16 x i8>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY8X16X3_T]] [[TMP5]]
+//
poly8x16x3_t test_vld3q_lane_p8(poly8_t *a, poly8x16x3_t b) {
return vld3q_lane_p8(a, b, 15);
}
-// CHECK-LABEL: define{{.*}} %struct.poly16x8x3_t @test_vld3q_lane_p16(ptr noundef %a, [3 x <8 x i16>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x3_t, align 16
-// CHECK: [[B:%.*]] = alloca %struct.poly16x8x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
-// CHECK: [[VLD3_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3lane.v8i16.p0(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i64 7, ptr %a)
-// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
-// CHECK: [[TMP16:%.*]] = load %struct.poly16x8x3_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.poly16x8x3_t [[TMP16]]
+// CHECK-LABEL: define dso_local %struct.poly16x8x3_t @test_vld3q_lane_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY16X8X3_T:%.*]], align 16
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY16X8X3_T]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY16X8X3_T]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY16X8X3_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <8 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <8 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <8 x i16>] [[B_COERCE]], 2
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[__S1]], ptr noundef nonnull align 16 dereferenceable(48) [[B]], i64 48, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[VLD3_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3lane.v8i16.p0(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], i64 7, ptr [[A]])
+// CHECK-NEXT: [[VLD3_LANE_ELT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], 0
+// CHECK-NEXT: store <8 x i16> [[VLD3_LANE_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_LANE_ELT6:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], 1
+// CHECK-NEXT: store <8 x i16> [[VLD3_LANE_ELT6]], ptr [[__RET_REPACK5]], align 16
+// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD3_LANE_ELT8:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], 2
+// CHECK-NEXT: store <8 x i16> [[VLD3_LANE_ELT8]], ptr [[__RET_REPACK7]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(48) [[__RET]], i64 48, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <8 x i16>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [3 x <8 x i16>] poison, <8 x i16> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <8 x i16>, ptr [[DOTUNPACK_ELT9]], align 16
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [3 x <8 x i16>] [[TMP3]], <8 x i16> [[DOTUNPACK_UNPACK10]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <8 x i16>, ptr [[DOTUNPACK_ELT11]], align 16
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [3 x <8 x i16>] [[TMP4]], <8 x i16> [[DOTUNPACK_UNPACK12]], 2
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [[STRUCT_POLY16X8X3_T]] poison, [3 x <8 x i16>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY16X8X3_T]] [[TMP5]]
+//
poly16x8x3_t test_vld3q_lane_p16(poly16_t *a, poly16x8x3_t b) {
return vld3q_lane_p16(a, b, 7);
}
-// CHECK-LABEL: define{{.*}} %struct.poly64x2x3_t @test_vld3q_lane_p64(ptr noundef %a, [3 x <2 x i64>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x2x3_t, align 16
-// CHECK: [[B:%.*]] = alloca %struct.poly64x2x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.poly64x2x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly64x2x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x2x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <2 x i64> [[TMP8]] to <16 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x i64>
-// CHECK: [[VLD3_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3lane.v2i64.p0(<2 x i64> [[TMP10]], <2 x i64> [[TMP11]], <2 x i64> [[TMP12]], i64 1, ptr %a)
-// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
-// CHECK: [[TMP16:%.*]] = load %struct.poly64x2x3_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.poly64x2x3_t [[TMP16]]
+// CHECK-LABEL: define dso_local %struct.poly64x2x3_t @test_vld3q_lane_p64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY64X2X3_T:%.*]], align 16
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY64X2X3_T]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY64X2X3_T]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY64X2X3_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <2 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <2 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <2 x i64>] [[B_COERCE]], 2
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[__S1]], ptr noundef nonnull align 16 dereferenceable(48) [[B]], i64 48, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[VLD3_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3lane.v2i64.p0(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], i64 1, ptr [[A]])
+// CHECK-NEXT: [[VLD3_LANE_ELT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3_LANE]], 0
+// CHECK-NEXT: store <2 x i64> [[VLD3_LANE_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_LANE_ELT6:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3_LANE]], 1
+// CHECK-NEXT: store <2 x i64> [[VLD3_LANE_ELT6]], ptr [[__RET_REPACK5]], align 16
+// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD3_LANE_ELT8:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3_LANE]], 2
+// CHECK-NEXT: store <2 x i64> [[VLD3_LANE_ELT8]], ptr [[__RET_REPACK7]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(48) [[__RET]], i64 48, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [3 x <2 x i64>] poison, <2 x i64> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT9]], align 16
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [3 x <2 x i64>] [[TMP3]], <2 x i64> [[DOTUNPACK_UNPACK10]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT11]], align 16
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [3 x <2 x i64>] [[TMP4]], <2 x i64> [[DOTUNPACK_UNPACK12]], 2
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [[STRUCT_POLY64X2X3_T]] poison, [3 x <2 x i64>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY64X2X3_T]] [[TMP5]]
+//
poly64x2x3_t test_vld3q_lane_p64(poly64_t *a, poly64x2x3_t b) {
return vld3q_lane_p64(a, b, 1);
}
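(Note on the lane immediates in the checks above: the index must be an
integer constant expression whose range scales inversely with element
size, 0..15 for <16 x i8> but only 0..1 for the 2-element 64-bit
q-forms. A sketch under the same assumptions as above, again with a
hypothetical helper name:)

    #include <arm_neon.h>

    poly64x2x3_t load_top_p64_lane(const poly64_t *ptr, poly64x2x3_t acc) {
      return vld3q_lane_p64(ptr, acc, 1); // 1 is the highest valid lane here
    }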
-// CHECK-LABEL: define{{.*}} %struct.uint8x8x3_t @test_vld3_lane_u8(ptr noundef %a, [3 x <8 x i8>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x3_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[VLD3_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3lane.v8i8.p0(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, ptr %a)
-// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
-// CHECK: [[TMP9:%.*]] = load %struct.uint8x8x3_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.uint8x8x3_t [[TMP9]]
+// CHECK-LABEL: define dso_local %struct.uint8x8x3_t @test_vld3_lane_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT8X8X3_T:%.*]], align 8
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT8X8X3_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT8X8X3_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT8X8X3_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 2
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[__S1]], ptr noundef nonnull align 8 dereferenceable(24) [[B]], i64 24, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[VLD3_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3lane.v8i8.p0(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], i64 7, ptr [[A]])
+// CHECK-NEXT: [[VLD3_LANE_ELT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE]], 0
+// CHECK-NEXT: store <8 x i8> [[VLD3_LANE_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD3_LANE_ELT6:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE]], 1
+// CHECK-NEXT: store <8 x i8> [[VLD3_LANE_ELT6]], ptr [[__RET_REPACK5]], align 8
+// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_LANE_ELT8:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE]], 2
+// CHECK-NEXT: store <8 x i8> [[VLD3_LANE_ELT8]], ptr [[__RET_REPACK7]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(24) [[__RET]], i64 24, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <8 x i8>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [3 x <8 x i8>] poison, <8 x i8> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <8 x i8>, ptr [[DOTUNPACK_ELT9]], align 8
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [3 x <8 x i8>] [[TMP3]], <8 x i8> [[DOTUNPACK_UNPACK10]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <8 x i8>, ptr [[DOTUNPACK_ELT11]], align 8
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [3 x <8 x i8>] [[TMP4]], <8 x i8> [[DOTUNPACK_UNPACK12]], 2
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [[STRUCT_UINT8X8X3_T]] poison, [3 x <8 x i8>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT8X8X3_T]] [[TMP5]]
+//
uint8x8x3_t test_vld3_lane_u8(uint8_t *a, uint8x8x3_t b) {
return vld3_lane_u8(a, b, 7);
}
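(The 64-bit d-forms behave the same way on narrower vectors: the load
touches exactly three consecutive elements in memory, one for each
result vector. A sketch under the same assumptions, hypothetical helper:)

    #include <arm_neon.h>

    // Patch lane 7 of each vector from an interleaved byte buffer,
    // e.g. one RGB pixel; valid lanes for <8 x i8> are 0..7.
    uint8x8x3_t patch_lane7(const uint8_t *ptr, uint8x8x3_t src) {
      return vld3_lane_u8(ptr, src, 7);
    }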
-// CHECK-LABEL: define{{.*}} %struct.uint16x4x3_t @test_vld3_lane_u16(ptr noundef %a, [3 x <4 x i16>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x3_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.uint16x4x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
-// CHECK: [[VLD3_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3lane.v4i16.p0(<4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i64 3, ptr %a)
-// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
-// CHECK: [[TMP16:%.*]] = load %struct.uint16x4x3_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.uint16x4x3_t [[TMP16]]
+// CHECK-LABEL: define dso_local %struct.uint16x4x3_t @test_vld3_lane_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT16X4X3_T:%.*]], align 8
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT16X4X3_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT16X4X3_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT16X4X3_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <4 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <4 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <4 x i16>] [[B_COERCE]], 2
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[__S1]], ptr noundef nonnull align 8 dereferenceable(24) [[B]], i64 24, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[VLD3_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3lane.v4i16.p0(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]], i64 3, ptr [[A]])
+// CHECK-NEXT: [[VLD3_LANE_ELT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], 0
+// CHECK-NEXT: store <4 x i16> [[VLD3_LANE_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD3_LANE_ELT6:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], 1
+// CHECK-NEXT: store <4 x i16> [[VLD3_LANE_ELT6]], ptr [[__RET_REPACK5]], align 8
+// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_LANE_ELT8:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], 2
+// CHECK-NEXT: store <4 x i16> [[VLD3_LANE_ELT8]], ptr [[__RET_REPACK7]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(24) [[__RET]], i64 24, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <4 x i16>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [3 x <4 x i16>] poison, <4 x i16> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <4 x i16>, ptr [[DOTUNPACK_ELT9]], align 8
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [3 x <4 x i16>] [[TMP3]], <4 x i16> [[DOTUNPACK_UNPACK10]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <4 x i16>, ptr [[DOTUNPACK_ELT11]], align 8
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [3 x <4 x i16>] [[TMP4]], <4 x i16> [[DOTUNPACK_UNPACK12]], 2
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [[STRUCT_UINT16X4X3_T]] poison, [3 x <4 x i16>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT16X4X3_T]] [[TMP5]]
+//
uint16x4x3_t test_vld3_lane_u16(uint16_t *a, uint16x4x3_t b) {
return vld3_lane_u16(a, b, 3);
}
-// CHECK-LABEL: define{{.*}} %struct.uint32x2x3_t @test_vld3_lane_u32(ptr noundef %a, [3 x <2 x i32>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x3_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.uint32x2x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
-// CHECK: [[VLD3_LANE:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3lane.v2i32.p0(<2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], i64 1, ptr %a)
-// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
-// CHECK: [[TMP16:%.*]] = load %struct.uint32x2x3_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.uint32x2x3_t [[TMP16]]
+// CHECK-LABEL: define dso_local %struct.uint32x2x3_t @test_vld3_lane_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <2 x i32>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT32X2X3_T:%.*]], align 8
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT32X2X3_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT32X2X3_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT32X2X3_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <2 x i32>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <2 x i32>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <2 x i32>] [[B_COERCE]], 2
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[__S1]], ptr noundef nonnull align 8 dereferenceable(24) [[B]], i64 24, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[VLD3_LANE:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3lane.v2i32.p0(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], i64 1, ptr [[A]])
+// CHECK-NEXT: [[VLD3_LANE_ELT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_LANE]], 0
+// CHECK-NEXT: store <2 x i32> [[VLD3_LANE_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD3_LANE_ELT6:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_LANE]], 1
+// CHECK-NEXT: store <2 x i32> [[VLD3_LANE_ELT6]], ptr [[__RET_REPACK5]], align 8
+// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_LANE_ELT8:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_LANE]], 2
+// CHECK-NEXT: store <2 x i32> [[VLD3_LANE_ELT8]], ptr [[__RET_REPACK7]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(24) [[__RET]], i64 24, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x i32>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [3 x <2 x i32>] poison, <2 x i32> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <2 x i32>, ptr [[DOTUNPACK_ELT9]], align 8
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [3 x <2 x i32>] [[TMP3]], <2 x i32> [[DOTUNPACK_UNPACK10]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <2 x i32>, ptr [[DOTUNPACK_ELT11]], align 8
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [3 x <2 x i32>] [[TMP4]], <2 x i32> [[DOTUNPACK_UNPACK12]], 2
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [[STRUCT_UINT32X2X3_T]] poison, [3 x <2 x i32>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT32X2X3_T]] [[TMP5]]
+//
uint32x2x3_t test_vld3_lane_u32(uint32_t *a, uint32x2x3_t b) {
return vld3_lane_u32(a, b, 1);
}
-// CHECK-LABEL: define{{.*}} %struct.uint64x1x3_t @test_vld3_lane_u64(ptr noundef %a, [3 x <1 x i64>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x1x3_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.uint64x1x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x1x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x1x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x1x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint64x1x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
-// CHECK: [[VLD3_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3lane.v1i64.p0(<1 x i64> [[TMP10]], <1 x i64> [[TMP11]], <1 x i64> [[TMP12]], i64 0, ptr %a)
-// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
-// CHECK: [[TMP16:%.*]] = load %struct.uint64x1x3_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.uint64x1x3_t [[TMP16]]
+// CHECK-LABEL: define dso_local %struct.uint64x1x3_t @test_vld3_lane_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT64X1X3_T:%.*]], align 8
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT64X1X3_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT64X1X3_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT64X1X3_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <1 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <1 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <1 x i64>] [[B_COERCE]], 2
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[__S1]], ptr noundef nonnull align 8 dereferenceable(24) [[B]], i64 24, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[VLD3_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3lane.v1i64.p0(<1 x i64> [[TMP0]], <1 x i64> [[TMP1]], <1 x i64> [[TMP2]], i64 0, ptr [[A]])
+// CHECK-NEXT: [[VLD3_LANE_ELT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_LANE]], 0
+// CHECK-NEXT: store <1 x i64> [[VLD3_LANE_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD3_LANE_ELT6:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_LANE]], 1
+// CHECK-NEXT: store <1 x i64> [[VLD3_LANE_ELT6]], ptr [[__RET_REPACK5]], align 8
+// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_LANE_ELT8:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_LANE]], 2
+// CHECK-NEXT: store <1 x i64> [[VLD3_LANE_ELT8]], ptr [[__RET_REPACK7]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(24) [[__RET]], i64 24, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <1 x i64>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [3 x <1 x i64>] poison, <1 x i64> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT9]], align 8
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [3 x <1 x i64>] [[TMP3]], <1 x i64> [[DOTUNPACK_UNPACK10]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT11]], align 8
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [3 x <1 x i64>] [[TMP4]], <1 x i64> [[DOTUNPACK_UNPACK12]], 2
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [[STRUCT_UINT64X1X3_T]] poison, [3 x <1 x i64>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT64X1X3_T]] [[TMP5]]
+//
uint64x1x3_t test_vld3_lane_u64(uint64_t *a, uint64x1x3_t b) {
return vld3_lane_u64(a, b, 0);
}
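(As the `i64 0` immediate above shows, the <1 x i64> d-form has a single
lane, so 0 is the only accepted index. Sketch, same assumptions and a
hypothetical helper name:)

    #include <arm_neon.h>

    uint64x1x3_t reload_u64_triple(const uint64_t *ptr, uint64x1x3_t src) {
      return vld3_lane_u64(ptr, src, 0); // lane 0 is the only valid choice
    }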
-// CHECK-LABEL: define{{.*}} %struct.int8x8x3_t @test_vld3_lane_s8(ptr noundef %a, [3 x <8 x i8>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x3_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.int8x8x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[VLD3_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3lane.v8i8.p0(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, ptr %a)
-// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
-// CHECK: [[TMP9:%.*]] = load %struct.int8x8x3_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.int8x8x3_t [[TMP9]]
+// CHECK-LABEL: define dso_local %struct.int8x8x3_t @test_vld3_lane_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT8X8X3_T:%.*]], align 8
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT8X8X3_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT8X8X3_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT8X8X3_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 2
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[__S1]], ptr noundef nonnull align 8 dereferenceable(24) [[B]], i64 24, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[VLD3_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3lane.v8i8.p0(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], i64 7, ptr [[A]])
+// CHECK-NEXT: [[VLD3_LANE_ELT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE]], 0
+// CHECK-NEXT: store <8 x i8> [[VLD3_LANE_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD3_LANE_ELT6:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE]], 1
+// CHECK-NEXT: store <8 x i8> [[VLD3_LANE_ELT6]], ptr [[__RET_REPACK5]], align 8
+// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_LANE_ELT8:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE]], 2
+// CHECK-NEXT: store <8 x i8> [[VLD3_LANE_ELT8]], ptr [[__RET_REPACK7]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(24) [[__RET]], i64 24, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <8 x i8>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [3 x <8 x i8>] poison, <8 x i8> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <8 x i8>, ptr [[DOTUNPACK_ELT9]], align 8
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [3 x <8 x i8>] [[TMP3]], <8 x i8> [[DOTUNPACK_UNPACK10]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <8 x i8>, ptr [[DOTUNPACK_ELT11]], align 8
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [3 x <8 x i8>] [[TMP4]], <8 x i8> [[DOTUNPACK_UNPACK12]], 2
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [[STRUCT_INT8X8X3_T]] poison, [3 x <8 x i8>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_INT8X8X3_T]] [[TMP5]]
+//
int8x8x3_t test_vld3_lane_s8(int8_t *a, int8x8x3_t b) {
return vld3_lane_s8(a, b, 7);
}
-// CHECK-LABEL: define{{.*}} %struct.int16x4x3_t @test_vld3_lane_s16(ptr noundef %a, [3 x <4 x i16>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x3_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.int16x4x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
-// CHECK: [[VLD3_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3lane.v4i16.p0(<4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i64 3, ptr %a)
-// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
-// CHECK: [[TMP16:%.*]] = load %struct.int16x4x3_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.int16x4x3_t [[TMP16]]
+// CHECK-LABEL: define dso_local %struct.int16x4x3_t @test_vld3_lane_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT16X4X3_T:%.*]], align 8
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT16X4X3_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT16X4X3_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT16X4X3_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <4 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <4 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <4 x i16>] [[B_COERCE]], 2
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[__S1]], ptr noundef nonnull align 8 dereferenceable(24) [[B]], i64 24, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[VLD3_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3lane.v4i16.p0(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]], i64 3, ptr [[A]])
+// CHECK-NEXT: [[VLD3_LANE_ELT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], 0
+// CHECK-NEXT: store <4 x i16> [[VLD3_LANE_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD3_LANE_ELT6:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], 1
+// CHECK-NEXT: store <4 x i16> [[VLD3_LANE_ELT6]], ptr [[__RET_REPACK5]], align 8
+// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_LANE_ELT8:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], 2
+// CHECK-NEXT: store <4 x i16> [[VLD3_LANE_ELT8]], ptr [[__RET_REPACK7]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(24) [[__RET]], i64 24, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <4 x i16>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [3 x <4 x i16>] poison, <4 x i16> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <4 x i16>, ptr [[DOTUNPACK_ELT9]], align 8
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [3 x <4 x i16>] [[TMP3]], <4 x i16> [[DOTUNPACK_UNPACK10]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <4 x i16>, ptr [[DOTUNPACK_ELT11]], align 8
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [3 x <4 x i16>] [[TMP4]], <4 x i16> [[DOTUNPACK_UNPACK12]], 2
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [[STRUCT_INT16X4X3_T]] poison, [3 x <4 x i16>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_INT16X4X3_T]] [[TMP5]]
+//
int16x4x3_t test_vld3_lane_s16(int16_t *a, int16x4x3_t b) {
return vld3_lane_s16(a, b, 3);
}
-// CHECK-LABEL: define{{.*}} %struct.int32x2x3_t @test_vld3_lane_s32(ptr noundef %a, [3 x <2 x i32>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x3_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.int32x2x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
-// CHECK: [[VLD3_LANE:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3lane.v2i32.p0(<2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], i64 1, ptr %a)
-// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
-// CHECK: [[TMP16:%.*]] = load %struct.int32x2x3_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.int32x2x3_t [[TMP16]]
+// CHECK-LABEL: define dso_local %struct.int32x2x3_t @test_vld3_lane_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <2 x i32>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT32X2X3_T:%.*]], align 8
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT32X2X3_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT32X2X3_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT32X2X3_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <2 x i32>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <2 x i32>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <2 x i32>] [[B_COERCE]], 2
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[__S1]], ptr noundef nonnull align 8 dereferenceable(24) [[B]], i64 24, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[VLD3_LANE:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3lane.v2i32.p0(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], i64 1, ptr [[A]])
+// CHECK-NEXT: [[VLD3_LANE_ELT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_LANE]], 0
+// CHECK-NEXT: store <2 x i32> [[VLD3_LANE_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD3_LANE_ELT6:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_LANE]], 1
+// CHECK-NEXT: store <2 x i32> [[VLD3_LANE_ELT6]], ptr [[__RET_REPACK5]], align 8
+// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_LANE_ELT8:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_LANE]], 2
+// CHECK-NEXT: store <2 x i32> [[VLD3_LANE_ELT8]], ptr [[__RET_REPACK7]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(24) [[__RET]], i64 24, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x i32>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [3 x <2 x i32>] poison, <2 x i32> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <2 x i32>, ptr [[DOTUNPACK_ELT9]], align 8
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [3 x <2 x i32>] [[TMP3]], <2 x i32> [[DOTUNPACK_UNPACK10]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <2 x i32>, ptr [[DOTUNPACK_ELT11]], align 8
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [3 x <2 x i32>] [[TMP4]], <2 x i32> [[DOTUNPACK_UNPACK12]], 2
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [[STRUCT_INT32X2X3_T]] poison, [3 x <2 x i32>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_INT32X2X3_T]] [[TMP5]]
+//
int32x2x3_t test_vld3_lane_s32(int32_t *a, int32x2x3_t b) {
return vld3_lane_s32(a, b, 1);
}
-// CHECK-LABEL: define{{.*}} %struct.int64x1x3_t @test_vld3_lane_s64(ptr noundef %a, [3 x <1 x i64>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x1x3_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.int64x1x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x1x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x1x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x1x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int64x1x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
-// CHECK: [[VLD3_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3lane.v1i64.p0(<1 x i64> [[TMP10]], <1 x i64> [[TMP11]], <1 x i64> [[TMP12]], i64 0, ptr %a)
-// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
-// CHECK: [[TMP16:%.*]] = load %struct.int64x1x3_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.int64x1x3_t [[TMP16]]
+// CHECK-LABEL: define dso_local %struct.int64x1x3_t @test_vld3_lane_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT64X1X3_T:%.*]], align 8
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT64X1X3_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT64X1X3_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT64X1X3_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <1 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <1 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <1 x i64>] [[B_COERCE]], 2
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[__S1]], ptr noundef nonnull align 8 dereferenceable(24) [[B]], i64 24, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[VLD3_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3lane.v1i64.p0(<1 x i64> [[TMP0]], <1 x i64> [[TMP1]], <1 x i64> [[TMP2]], i64 0, ptr [[A]])
+// CHECK-NEXT: [[VLD3_LANE_ELT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_LANE]], 0
+// CHECK-NEXT: store <1 x i64> [[VLD3_LANE_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD3_LANE_ELT6:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_LANE]], 1
+// CHECK-NEXT: store <1 x i64> [[VLD3_LANE_ELT6]], ptr [[__RET_REPACK5]], align 8
+// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_LANE_ELT8:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_LANE]], 2
+// CHECK-NEXT: store <1 x i64> [[VLD3_LANE_ELT8]], ptr [[__RET_REPACK7]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(24) [[__RET]], i64 24, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <1 x i64>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [3 x <1 x i64>] poison, <1 x i64> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT9]], align 8
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [3 x <1 x i64>] [[TMP3]], <1 x i64> [[DOTUNPACK_UNPACK10]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT11]], align 8
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [3 x <1 x i64>] [[TMP4]], <1 x i64> [[DOTUNPACK_UNPACK12]], 2
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [[STRUCT_INT64X1X3_T]] poison, [3 x <1 x i64>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_INT64X1X3_T]] [[TMP5]]
+//
int64x1x3_t test_vld3_lane_s64(int64_t *a, int64x1x3_t b) {
return vld3_lane_s64(a, b, 0);
}
-// CHECK-LABEL: define{{.*}} %struct.float16x4x3_t @test_vld3_lane_f16(ptr noundef %a, [3 x <4 x half>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x4x3_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.float16x4x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <4 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <4 x half>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <4 x half> [[TMP8]] to <8 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x half>
-// CHECK: [[VLD3_LANE:%.*]] = call { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3lane.v4f16.p0(<4 x half> [[TMP10]], <4 x half> [[TMP11]], <4 x half> [[TMP12]], i64 3, ptr %a)
-// CHECK: store { <4 x half>, <4 x half>, <4 x half> } [[VLD3_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
-// CHECK: [[TMP16:%.*]] = load %struct.float16x4x3_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.float16x4x3_t [[TMP16]]
+// CHECK-LABEL: define dso_local %struct.float16x4x3_t @test_vld3_lane_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <4 x half>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT16X4X3_T:%.*]], align 8
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT16X4X3_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT16X4X3_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X4X3_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <4 x half>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x half> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <4 x half>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x half> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <4 x half>] [[B_COERCE]], 2
+// CHECK-NEXT: store <4 x half> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[__S1]], ptr noundef nonnull align 8 dereferenceable(24) [[B]], i64 24, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x half>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x half>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[VLD3_LANE:%.*]] = call { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3lane.v4f16.p0(<4 x half> [[TMP0]], <4 x half> [[TMP1]], <4 x half> [[TMP2]], i64 3, ptr [[A]])
+// CHECK-NEXT: [[VLD3_LANE_ELT:%.*]] = extractvalue { <4 x half>, <4 x half>, <4 x half> } [[VLD3_LANE]], 0
+// CHECK-NEXT: store <4 x half> [[VLD3_LANE_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD3_LANE_ELT6:%.*]] = extractvalue { <4 x half>, <4 x half>, <4 x half> } [[VLD3_LANE]], 1
+// CHECK-NEXT: store <4 x half> [[VLD3_LANE_ELT6]], ptr [[__RET_REPACK5]], align 8
+// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_LANE_ELT8:%.*]] = extractvalue { <4 x half>, <4 x half>, <4 x half> } [[VLD3_LANE]], 2
+// CHECK-NEXT: store <4 x half> [[VLD3_LANE_ELT8]], ptr [[__RET_REPACK7]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(24) [[__RET]], i64 24, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <4 x half>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [3 x <4 x half>] poison, <4 x half> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <4 x half>, ptr [[DOTUNPACK_ELT9]], align 8
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [3 x <4 x half>] [[TMP3]], <4 x half> [[DOTUNPACK_UNPACK10]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <4 x half>, ptr [[DOTUNPACK_ELT11]], align 8
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [3 x <4 x half>] [[TMP4]], <4 x half> [[DOTUNPACK_UNPACK12]], 2
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [[STRUCT_FLOAT16X4X3_T]] poison, [3 x <4 x half>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_FLOAT16X4X3_T]] [[TMP5]]
+//
float16x4x3_t test_vld3_lane_f16(float16_t *a, float16x4x3_t b) {
return vld3_lane_f16(a, b, 3);
}
-// CHECK-LABEL: define{{.*}} %struct.float32x2x3_t @test_vld3_lane_f32(ptr noundef %a, [3 x <2 x float>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x3_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.float32x2x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <2 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <2 x float> [[TMP8]] to <8 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x float>
-// CHECK: [[VLD3_LANE:%.*]] = call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld3lane.v2f32.p0(<2 x float> [[TMP10]], <2 x float> [[TMP11]], <2 x float> [[TMP12]], i64 1, ptr %a)
-// CHECK: store { <2 x float>, <2 x float>, <2 x float> } [[VLD3_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
-// CHECK: [[TMP16:%.*]] = load %struct.float32x2x3_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.float32x2x3_t [[TMP16]]
+// CHECK-LABEL: define dso_local %struct.float32x2x3_t @test_vld3_lane_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <2 x float>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT32X2X3_T:%.*]], align 8
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT32X2X3_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT32X2X3_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X2X3_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <2 x float>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x float> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <2 x float>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x float> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <2 x float>] [[B_COERCE]], 2
+// CHECK-NEXT: store <2 x float> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[__S1]], ptr noundef nonnull align 8 dereferenceable(24) [[B]], i64 24, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[VLD3_LANE:%.*]] = call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld3lane.v2f32.p0(<2 x float> [[TMP0]], <2 x float> [[TMP1]], <2 x float> [[TMP2]], i64 1, ptr [[A]])
+// CHECK-NEXT: [[VLD3_LANE_ELT:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float> } [[VLD3_LANE]], 0
+// CHECK-NEXT: store <2 x float> [[VLD3_LANE_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD3_LANE_ELT6:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float> } [[VLD3_LANE]], 1
+// CHECK-NEXT: store <2 x float> [[VLD3_LANE_ELT6]], ptr [[__RET_REPACK5]], align 8
+// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_LANE_ELT8:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float> } [[VLD3_LANE]], 2
+// CHECK-NEXT: store <2 x float> [[VLD3_LANE_ELT8]], ptr [[__RET_REPACK7]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(24) [[__RET]], i64 24, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x float>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [3 x <2 x float>] poison, <2 x float> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <2 x float>, ptr [[DOTUNPACK_ELT9]], align 8
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [3 x <2 x float>] [[TMP3]], <2 x float> [[DOTUNPACK_UNPACK10]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <2 x float>, ptr [[DOTUNPACK_ELT11]], align 8
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [3 x <2 x float>] [[TMP4]], <2 x float> [[DOTUNPACK_UNPACK12]], 2
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [[STRUCT_FLOAT32X2X3_T]] poison, [3 x <2 x float>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_FLOAT32X2X3_T]] [[TMP5]]
+//
float32x2x3_t test_vld3_lane_f32(float32_t *a, float32x2x3_t b) {
return vld3_lane_f32(a, b, 1);
}
-// CHECK-LABEL: define{{.*}} %struct.float64x1x3_t @test_vld3_lane_f64(ptr noundef %a, [3 x <1 x double>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x1x3_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.float64x1x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.float64x1x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float64x1x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x1x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <1 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x1x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x double>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <1 x double>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <1 x double> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x1x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x double>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <1 x double>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <1 x double> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float64x1x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x double>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <1 x double>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <1 x double> [[TMP8]] to <8 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x double>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x double>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x double>
-// CHECK: [[VLD3_LANE:%.*]] = call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld3lane.v1f64.p0(<1 x double> [[TMP10]], <1 x double> [[TMP11]], <1 x double> [[TMP12]], i64 0, ptr %a)
-// CHECK: store { <1 x double>, <1 x double>, <1 x double> } [[VLD3_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
-// CHECK: [[TMP16:%.*]] = load %struct.float64x1x3_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.float64x1x3_t [[TMP16]]
+// CHECK-LABEL: define dso_local %struct.float64x1x3_t @test_vld3_lane_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <1 x double>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT64X1X3_T:%.*]], align 8
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT64X1X3_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT64X1X3_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT64X1X3_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <1 x double>] [[B_COERCE]], 0
+// CHECK-NEXT: store <1 x double> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <1 x double>] [[B_COERCE]], 1
+// CHECK-NEXT: store <1 x double> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <1 x double>] [[B_COERCE]], 2
+// CHECK-NEXT: store <1 x double> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[__S1]], ptr noundef nonnull align 8 dereferenceable(24) [[B]], i64 24, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x double>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <1 x double>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <1 x double>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[VLD3_LANE:%.*]] = call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld3lane.v1f64.p0(<1 x double> [[TMP0]], <1 x double> [[TMP1]], <1 x double> [[TMP2]], i64 0, ptr [[A]])
+// CHECK-NEXT: [[VLD3_LANE_ELT:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double> } [[VLD3_LANE]], 0
+// CHECK-NEXT: store <1 x double> [[VLD3_LANE_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD3_LANE_ELT6:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double> } [[VLD3_LANE]], 1
+// CHECK-NEXT: store <1 x double> [[VLD3_LANE_ELT6]], ptr [[__RET_REPACK5]], align 8
+// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_LANE_ELT8:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double> } [[VLD3_LANE]], 2
+// CHECK-NEXT: store <1 x double> [[VLD3_LANE_ELT8]], ptr [[__RET_REPACK7]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(24) [[__RET]], i64 24, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <1 x double>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [3 x <1 x double>] poison, <1 x double> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <1 x double>, ptr [[DOTUNPACK_ELT9]], align 8
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [3 x <1 x double>] [[TMP3]], <1 x double> [[DOTUNPACK_UNPACK10]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <1 x double>, ptr [[DOTUNPACK_ELT11]], align 8
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [3 x <1 x double>] [[TMP4]], <1 x double> [[DOTUNPACK_UNPACK12]], 2
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [[STRUCT_FLOAT64X1X3_T]] poison, [3 x <1 x double>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_FLOAT64X1X3_T]] [[TMP5]]
+//
float64x1x3_t test_vld3_lane_f64(float64_t *a, float64x1x3_t b) {
return vld3_lane_f64(a, b, 0);
}
-// CHECK-LABEL: define{{.*}} %struct.poly8x8x3_t @test_vld3_lane_p8(ptr noundef %a, [3 x <8 x i8>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x3_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[VLD3_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3lane.v8i8.p0(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, ptr %a)
-// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
-// CHECK: [[TMP9:%.*]] = load %struct.poly8x8x3_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.poly8x8x3_t [[TMP9]]
+// CHECK-LABEL: define dso_local %struct.poly8x8x3_t @test_vld3_lane_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY8X8X3_T:%.*]], align 8
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY8X8X3_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY8X8X3_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY8X8X3_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 2
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[__S1]], ptr noundef nonnull align 8 dereferenceable(24) [[B]], i64 24, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[VLD3_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3lane.v8i8.p0(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], i64 7, ptr [[A]])
+// CHECK-NEXT: [[VLD3_LANE_ELT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE]], 0
+// CHECK-NEXT: store <8 x i8> [[VLD3_LANE_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD3_LANE_ELT6:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE]], 1
+// CHECK-NEXT: store <8 x i8> [[VLD3_LANE_ELT6]], ptr [[__RET_REPACK5]], align 8
+// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_LANE_ELT8:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE]], 2
+// CHECK-NEXT: store <8 x i8> [[VLD3_LANE_ELT8]], ptr [[__RET_REPACK7]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(24) [[__RET]], i64 24, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <8 x i8>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [3 x <8 x i8>] poison, <8 x i8> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <8 x i8>, ptr [[DOTUNPACK_ELT9]], align 8
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [3 x <8 x i8>] [[TMP3]], <8 x i8> [[DOTUNPACK_UNPACK10]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <8 x i8>, ptr [[DOTUNPACK_ELT11]], align 8
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [3 x <8 x i8>] [[TMP4]], <8 x i8> [[DOTUNPACK_UNPACK12]], 2
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [[STRUCT_POLY8X8X3_T]] poison, [3 x <8 x i8>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY8X8X3_T]] [[TMP5]]
+//
poly8x8x3_t test_vld3_lane_p8(poly8_t *a, poly8x8x3_t b) {
return vld3_lane_p8(a, b, 7);
}
-// CHECK-LABEL: define{{.*}} %struct.poly16x4x3_t @test_vld3_lane_p16(ptr noundef %a, [3 x <4 x i16>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x3_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.poly16x4x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
-// CHECK: [[VLD3_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3lane.v4i16.p0(<4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i64 3, ptr %a)
-// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
-// CHECK: [[TMP16:%.*]] = load %struct.poly16x4x3_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.poly16x4x3_t [[TMP16]]
+// CHECK-LABEL: define dso_local %struct.poly16x4x3_t @test_vld3_lane_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY16X4X3_T:%.*]], align 8
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY16X4X3_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY16X4X3_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY16X4X3_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <4 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <4 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <4 x i16>] [[B_COERCE]], 2
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[__S1]], ptr noundef nonnull align 8 dereferenceable(24) [[B]], i64 24, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[VLD3_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3lane.v4i16.p0(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]], i64 3, ptr [[A]])
+// CHECK-NEXT: [[VLD3_LANE_ELT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], 0
+// CHECK-NEXT: store <4 x i16> [[VLD3_LANE_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD3_LANE_ELT6:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], 1
+// CHECK-NEXT: store <4 x i16> [[VLD3_LANE_ELT6]], ptr [[__RET_REPACK5]], align 8
+// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_LANE_ELT8:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], 2
+// CHECK-NEXT: store <4 x i16> [[VLD3_LANE_ELT8]], ptr [[__RET_REPACK7]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(24) [[__RET]], i64 24, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <4 x i16>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [3 x <4 x i16>] poison, <4 x i16> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <4 x i16>, ptr [[DOTUNPACK_ELT9]], align 8
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [3 x <4 x i16>] [[TMP3]], <4 x i16> [[DOTUNPACK_UNPACK10]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <4 x i16>, ptr [[DOTUNPACK_ELT11]], align 8
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [3 x <4 x i16>] [[TMP4]], <4 x i16> [[DOTUNPACK_UNPACK12]], 2
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [[STRUCT_POLY16X4X3_T]] poison, [3 x <4 x i16>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY16X4X3_T]] [[TMP5]]
+//
poly16x4x3_t test_vld3_lane_p16(poly16_t *a, poly16x4x3_t b) {
return vld3_lane_p16(a, b, 3);
}
-// CHECK-LABEL: define{{.*}} %struct.poly64x1x3_t @test_vld3_lane_p64(ptr noundef %a, [3 x <1 x i64>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x1x3_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.poly64x1x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.poly64x1x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly64x1x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x1x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly64x1x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x1x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly64x1x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
-// CHECK: [[VLD3_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3lane.v1i64.p0(<1 x i64> [[TMP10]], <1 x i64> [[TMP11]], <1 x i64> [[TMP12]], i64 0, ptr %a)
-// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
-// CHECK: [[TMP16:%.*]] = load %struct.poly64x1x3_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.poly64x1x3_t [[TMP16]]
+// CHECK-LABEL: define dso_local %struct.poly64x1x3_t @test_vld3_lane_p64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY64X1X3_T:%.*]], align 8
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY64X1X3_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY64X1X3_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY64X1X3_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <1 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <1 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <1 x i64>] [[B_COERCE]], 2
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[__S1]], ptr noundef nonnull align 8 dereferenceable(24) [[B]], i64 24, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[VLD3_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3lane.v1i64.p0(<1 x i64> [[TMP0]], <1 x i64> [[TMP1]], <1 x i64> [[TMP2]], i64 0, ptr [[A]])
+// CHECK-NEXT: [[VLD3_LANE_ELT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_LANE]], 0
+// CHECK-NEXT: store <1 x i64> [[VLD3_LANE_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD3_LANE_ELT6:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_LANE]], 1
+// CHECK-NEXT: store <1 x i64> [[VLD3_LANE_ELT6]], ptr [[__RET_REPACK5]], align 8
+// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_LANE_ELT8:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_LANE]], 2
+// CHECK-NEXT: store <1 x i64> [[VLD3_LANE_ELT8]], ptr [[__RET_REPACK7]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(24) [[__RET]], i64 24, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <1 x i64>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [3 x <1 x i64>] poison, <1 x i64> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT9]], align 8
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [3 x <1 x i64>] [[TMP3]], <1 x i64> [[DOTUNPACK_UNPACK10]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT11]], align 8
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [3 x <1 x i64>] [[TMP4]], <1 x i64> [[DOTUNPACK_UNPACK12]], 2
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [[STRUCT_POLY64X1X3_T]] poison, [3 x <1 x i64>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY64X1X3_T]] [[TMP5]]
+//
poly64x1x3_t test_vld3_lane_p64(poly64_t *a, poly64x1x3_t b) {
return vld3_lane_p64(a, b, 0);
}
-// CHECK-LABEL: define{{.*}} %struct.uint8x16x4_t @test_vld4q_lane_u8(ptr noundef %a, [4 x <16 x i8>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x16x4_t, align 16
-// CHECK: [[B:%.*]] = alloca %struct.uint8x16x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP5:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP6:%.*]] = load <16 x i8>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[VLD4_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4lane.v16i8.p0(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], <16 x i8> [[TMP6]], i64 15, ptr %a)
-// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
-// CHECK: [[TMP10:%.*]] = load %struct.uint8x16x4_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.uint8x16x4_t [[TMP10]]
+// CHECK-LABEL: define dso_local %struct.uint8x16x4_t @test_vld4q_lane_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT8X16X4_T:%.*]], align 16
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT8X16X4_T]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT8X16X4_T]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT8X16X4_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 2
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 48
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 3
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[__S1]], ptr noundef nonnull align 16 dereferenceable(64) [[B]], i64 64, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 48
+// CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT: [[VLD4_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4lane.v16i8.p0(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i64 15, ptr [[A]])
+// CHECK-NEXT: [[VLD4_LANE_ELT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4_LANE]], 0
+// CHECK-NEXT: store <16 x i8> [[VLD4_LANE_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_LANE_ELT8:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4_LANE]], 1
+// CHECK-NEXT: store <16 x i8> [[VLD4_LANE_ELT8]], ptr [[__RET_REPACK7]], align 16
+// CHECK-NEXT: [[__RET_REPACK9:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD4_LANE_ELT10:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4_LANE]], 2
+// CHECK-NEXT: store <16 x i8> [[VLD4_LANE_ELT10]], ptr [[__RET_REPACK9]], align 16
+// CHECK-NEXT: [[__RET_REPACK11:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 48
+// CHECK-NEXT: [[VLD4_LANE_ELT12:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4_LANE]], 3
+// CHECK-NEXT: store <16 x i8> [[VLD4_LANE_ELT12]], ptr [[__RET_REPACK11]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(64) [[__RET]], i64 64, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <16 x i8>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [4 x <16 x i8>] poison, <16 x i8> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT13:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK14:%.*]] = load <16 x i8>, ptr [[DOTUNPACK_ELT13]], align 16
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [4 x <16 x i8>] [[TMP4]], <16 x i8> [[DOTUNPACK_UNPACK14]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT15:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK16:%.*]] = load <16 x i8>, ptr [[DOTUNPACK_ELT15]], align 16
+// CHECK-NEXT: [[TMP6:%.*]] = insertvalue [4 x <16 x i8>] [[TMP5]], <16 x i8> [[DOTUNPACK_UNPACK16]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT17:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 48
+// CHECK-NEXT: [[DOTUNPACK_UNPACK18:%.*]] = load <16 x i8>, ptr [[DOTUNPACK_ELT17]], align 16
+// CHECK-NEXT: [[DOTUNPACK19:%.*]] = insertvalue [4 x <16 x i8>] [[TMP6]], <16 x i8> [[DOTUNPACK_UNPACK18]], 3
+// CHECK-NEXT: [[TMP7:%.*]] = insertvalue [[STRUCT_UINT8X16X4_T]] poison, [4 x <16 x i8>] [[DOTUNPACK19]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT8X16X4_T]] [[TMP7]]
+//
uint8x16x4_t test_vld4q_lane_u8(uint8_t *a, uint8x16x4_t b) {
return vld4q_lane_u8(a, b, 15);
}
-// CHECK-LABEL: define{{.*}} %struct.uint16x8x4_t @test_vld4q_lane_u16(ptr noundef %a, [4 x <8 x i16>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x4_t, align 16
-// CHECK: [[B:%.*]] = alloca %struct.uint16x8x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP10:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
-// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
-// CHECK: [[VLD4_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4lane.v8i16.p0(<8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i64 7, ptr %a)
-// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
-// CHECK: [[TMP19:%.*]] = load %struct.uint16x8x4_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.uint16x8x4_t [[TMP19]]
+// CHECK-LABEL: define dso_local %struct.uint16x8x4_t @test_vld4q_lane_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT16X8X4_T:%.*]], align 16
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT16X8X4_T]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT16X8X4_T]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT16X8X4_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 2
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 48
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 3
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[__S1]], ptr noundef nonnull align 16 dereferenceable(64) [[B]], i64 64, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 48
+// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT: [[VLD4_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4lane.v8i16.p0(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], i64 7, ptr [[A]])
+// CHECK-NEXT: [[VLD4_LANE_ELT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], 0
+// CHECK-NEXT: store <8 x i16> [[VLD4_LANE_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_LANE_ELT8:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], 1
+// CHECK-NEXT: store <8 x i16> [[VLD4_LANE_ELT8]], ptr [[__RET_REPACK7]], align 16
+// CHECK-NEXT: [[__RET_REPACK9:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD4_LANE_ELT10:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], 2
+// CHECK-NEXT: store <8 x i16> [[VLD4_LANE_ELT10]], ptr [[__RET_REPACK9]], align 16
+// CHECK-NEXT: [[__RET_REPACK11:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 48
+// CHECK-NEXT: [[VLD4_LANE_ELT12:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], 3
+// CHECK-NEXT: store <8 x i16> [[VLD4_LANE_ELT12]], ptr [[__RET_REPACK11]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(64) [[__RET]], i64 64, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <8 x i16>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [4 x <8 x i16>] poison, <8 x i16> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT13:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK14:%.*]] = load <8 x i16>, ptr [[DOTUNPACK_ELT13]], align 16
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [4 x <8 x i16>] [[TMP4]], <8 x i16> [[DOTUNPACK_UNPACK14]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT15:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK16:%.*]] = load <8 x i16>, ptr [[DOTUNPACK_ELT15]], align 16
+// CHECK-NEXT: [[TMP6:%.*]] = insertvalue [4 x <8 x i16>] [[TMP5]], <8 x i16> [[DOTUNPACK_UNPACK16]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT17:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 48
+// CHECK-NEXT: [[DOTUNPACK_UNPACK18:%.*]] = load <8 x i16>, ptr [[DOTUNPACK_ELT17]], align 16
+// CHECK-NEXT: [[DOTUNPACK19:%.*]] = insertvalue [4 x <8 x i16>] [[TMP6]], <8 x i16> [[DOTUNPACK_UNPACK18]], 3
+// CHECK-NEXT: [[TMP7:%.*]] = insertvalue [[STRUCT_UINT16X8X4_T]] poison, [4 x <8 x i16>] [[DOTUNPACK19]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT16X8X4_T]] [[TMP7]]
+//
uint16x8x4_t test_vld4q_lane_u16(uint16_t *a, uint16x8x4_t b) {
return vld4q_lane_u16(a, b, 7);
}
-// CHECK-LABEL: define{{.*}} %struct.uint32x4x4_t @test_vld4q_lane_u32(ptr noundef %a, [4 x <4 x i32>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x4_t, align 16
-// CHECK: [[B:%.*]] = alloca %struct.uint32x4x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP10:%.*]] = load <4 x i32>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
-// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x i32>
-// CHECK: [[VLD4_LANE:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4lane.v4i32.p0(<4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], i64 3, ptr %a)
-// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
-// CHECK: [[TMP19:%.*]] = load %struct.uint32x4x4_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.uint32x4x4_t [[TMP19]]
+// CHECK-LABEL: define dso_local %struct.uint32x4x4_t @test_vld4q_lane_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <4 x i32>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT32X4X4_T:%.*]], align 16
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT32X4X4_T]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT32X4X4_T]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT32X4X4_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <4 x i32>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <4 x i32>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <4 x i32>] [[B_COERCE]], 2
+// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 48
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <4 x i32>] [[B_COERCE]], 3
+// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[__S1]], ptr noundef nonnull align 16 dereferenceable(64) [[B]], i64 64, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 48
+// CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT: [[VLD4_LANE:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4lane.v4i32.p0(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], i64 3, ptr [[A]])
+// CHECK-NEXT: [[VLD4_LANE_ELT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4_LANE]], 0
+// CHECK-NEXT: store <4 x i32> [[VLD4_LANE_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_LANE_ELT8:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4_LANE]], 1
+// CHECK-NEXT: store <4 x i32> [[VLD4_LANE_ELT8]], ptr [[__RET_REPACK7]], align 16
+// CHECK-NEXT: [[__RET_REPACK9:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD4_LANE_ELT10:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4_LANE]], 2
+// CHECK-NEXT: store <4 x i32> [[VLD4_LANE_ELT10]], ptr [[__RET_REPACK9]], align 16
+// CHECK-NEXT: [[__RET_REPACK11:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 48
+// CHECK-NEXT: [[VLD4_LANE_ELT12:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4_LANE]], 3
+// CHECK-NEXT: store <4 x i32> [[VLD4_LANE_ELT12]], ptr [[__RET_REPACK11]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(64) [[__RET]], i64 64, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <4 x i32>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [4 x <4 x i32>] poison, <4 x i32> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT13:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK14:%.*]] = load <4 x i32>, ptr [[DOTUNPACK_ELT13]], align 16
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [4 x <4 x i32>] [[TMP4]], <4 x i32> [[DOTUNPACK_UNPACK14]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT15:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK16:%.*]] = load <4 x i32>, ptr [[DOTUNPACK_ELT15]], align 16
+// CHECK-NEXT: [[TMP6:%.*]] = insertvalue [4 x <4 x i32>] [[TMP5]], <4 x i32> [[DOTUNPACK_UNPACK16]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT17:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 48
+// CHECK-NEXT: [[DOTUNPACK_UNPACK18:%.*]] = load <4 x i32>, ptr [[DOTUNPACK_ELT17]], align 16
+// CHECK-NEXT: [[DOTUNPACK19:%.*]] = insertvalue [4 x <4 x i32>] [[TMP6]], <4 x i32> [[DOTUNPACK_UNPACK18]], 3
+// CHECK-NEXT: [[TMP7:%.*]] = insertvalue [[STRUCT_UINT32X4X4_T]] poison, [4 x <4 x i32>] [[DOTUNPACK19]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT32X4X4_T]] [[TMP7]]
+//
uint32x4x4_t test_vld4q_lane_u32(uint32_t *a, uint32x4x4_t b) {
return vld4q_lane_u32(a, b, 3);
}
-// CHECK-LABEL: define{{.*}} %struct.uint64x2x4_t @test_vld4q_lane_u64(ptr noundef %a, [4 x <2 x i64>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x2x4_t, align 16
-// CHECK: [[B:%.*]] = alloca %struct.uint64x2x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.uint64x2x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x2x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <2 x i64> [[TMP8]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP10:%.*]] = load <2 x i64>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP11:%.*]] = bitcast <2 x i64> [[TMP10]] to <16 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x i64>
-// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <2 x i64>
-// CHECK: [[VLD4_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4lane.v2i64.p0(<2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], <2 x i64> [[TMP15]], i64 1, ptr %a)
-// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
-// CHECK: [[TMP19:%.*]] = load %struct.uint64x2x4_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.uint64x2x4_t [[TMP19]]
+// CHECK-LABEL: define dso_local %struct.uint64x2x4_t @test_vld4q_lane_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT64X2X4_T:%.*]], align 16
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT64X2X4_T]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT64X2X4_T]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT64X2X4_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 2
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 48
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 3
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[__S1]], ptr noundef nonnull align 16 dereferenceable(64) [[B]], i64 64, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 48
+// CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT: [[VLD4_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4lane.v2i64.p0(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], i64 1, ptr [[A]])
+// CHECK-NEXT: [[VLD4_LANE_ELT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4_LANE]], 0
+// CHECK-NEXT: store <2 x i64> [[VLD4_LANE_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_LANE_ELT8:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4_LANE]], 1
+// CHECK-NEXT: store <2 x i64> [[VLD4_LANE_ELT8]], ptr [[__RET_REPACK7]], align 16
+// CHECK-NEXT: [[__RET_REPACK9:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD4_LANE_ELT10:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4_LANE]], 2
+// CHECK-NEXT: store <2 x i64> [[VLD4_LANE_ELT10]], ptr [[__RET_REPACK9]], align 16
+// CHECK-NEXT: [[__RET_REPACK11:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 48
+// CHECK-NEXT: [[VLD4_LANE_ELT12:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4_LANE]], 3
+// CHECK-NEXT: store <2 x i64> [[VLD4_LANE_ELT12]], ptr [[__RET_REPACK11]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(64) [[__RET]], i64 64, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [4 x <2 x i64>] poison, <2 x i64> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT13:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK14:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT13]], align 16
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [4 x <2 x i64>] [[TMP4]], <2 x i64> [[DOTUNPACK_UNPACK14]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT15:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK16:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT15]], align 16
+// CHECK-NEXT: [[TMP6:%.*]] = insertvalue [4 x <2 x i64>] [[TMP5]], <2 x i64> [[DOTUNPACK_UNPACK16]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT17:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 48
+// CHECK-NEXT: [[DOTUNPACK_UNPACK18:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT17]], align 16
+// CHECK-NEXT: [[DOTUNPACK19:%.*]] = insertvalue [4 x <2 x i64>] [[TMP6]], <2 x i64> [[DOTUNPACK_UNPACK18]], 3
+// CHECK-NEXT: [[TMP7:%.*]] = insertvalue [[STRUCT_UINT64X2X4_T]] poison, [4 x <2 x i64>] [[DOTUNPACK19]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT64X2X4_T]] [[TMP7]]
+//
uint64x2x4_t test_vld4q_lane_u64(uint64_t *a, uint64x2x4_t b) {
return vld4q_lane_u64(a, b, 1);
}
-// CHECK-LABEL: define{{.*}} %struct.int8x16x4_t @test_vld4q_lane_s8(ptr noundef %a, [4 x <16 x i8>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x4_t, align 16
-// CHECK: [[B:%.*]] = alloca %struct.int8x16x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP5:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP6:%.*]] = load <16 x i8>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[VLD4_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4lane.v16i8.p0(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], <16 x i8> [[TMP6]], i64 15, ptr %a)
-// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
-// CHECK: [[TMP10:%.*]] = load %struct.int8x16x4_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.int8x16x4_t [[TMP10]]
+// CHECK-LABEL: define dso_local %struct.int8x16x4_t @test_vld4q_lane_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT8X16X4_T:%.*]], align 16
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT8X16X4_T]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT8X16X4_T]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT8X16X4_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 2
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 48
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 3
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[__S1]], ptr noundef nonnull align 16 dereferenceable(64) [[B]], i64 64, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 48
+// CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT: [[VLD4_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4lane.v16i8.p0(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i64 15, ptr [[A]])
+// CHECK-NEXT: [[VLD4_LANE_ELT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4_LANE]], 0
+// CHECK-NEXT: store <16 x i8> [[VLD4_LANE_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_LANE_ELT8:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4_LANE]], 1
+// CHECK-NEXT: store <16 x i8> [[VLD4_LANE_ELT8]], ptr [[__RET_REPACK7]], align 16
+// CHECK-NEXT: [[__RET_REPACK9:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD4_LANE_ELT10:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4_LANE]], 2
+// CHECK-NEXT: store <16 x i8> [[VLD4_LANE_ELT10]], ptr [[__RET_REPACK9]], align 16
+// CHECK-NEXT: [[__RET_REPACK11:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 48
+// CHECK-NEXT: [[VLD4_LANE_ELT12:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4_LANE]], 3
+// CHECK-NEXT: store <16 x i8> [[VLD4_LANE_ELT12]], ptr [[__RET_REPACK11]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(64) [[__RET]], i64 64, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <16 x i8>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [4 x <16 x i8>] poison, <16 x i8> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT13:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK14:%.*]] = load <16 x i8>, ptr [[DOTUNPACK_ELT13]], align 16
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [4 x <16 x i8>] [[TMP4]], <16 x i8> [[DOTUNPACK_UNPACK14]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT15:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK16:%.*]] = load <16 x i8>, ptr [[DOTUNPACK_ELT15]], align 16
+// CHECK-NEXT: [[TMP6:%.*]] = insertvalue [4 x <16 x i8>] [[TMP5]], <16 x i8> [[DOTUNPACK_UNPACK16]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT17:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 48
+// CHECK-NEXT: [[DOTUNPACK_UNPACK18:%.*]] = load <16 x i8>, ptr [[DOTUNPACK_ELT17]], align 16
+// CHECK-NEXT: [[DOTUNPACK19:%.*]] = insertvalue [4 x <16 x i8>] [[TMP6]], <16 x i8> [[DOTUNPACK_UNPACK18]], 3
+// CHECK-NEXT: [[TMP7:%.*]] = insertvalue [[STRUCT_INT8X16X4_T]] poison, [4 x <16 x i8>] [[DOTUNPACK19]], 0
+// CHECK-NEXT: ret [[STRUCT_INT8X16X4_T]] [[TMP7]]
+//
int8x16x4_t test_vld4q_lane_s8(int8_t *a, int8x16x4_t b) {
return vld4q_lane_s8(a, b, 15);
}
-// CHECK-LABEL: define{{.*}} %struct.int16x8x4_t @test_vld4q_lane_s16(ptr noundef %a, [4 x <8 x i16>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x4_t, align 16
-// CHECK: [[B:%.*]] = alloca %struct.int16x8x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP10:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
-// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
-// CHECK: [[VLD4_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4lane.v8i16.p0(<8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i64 7, ptr %a)
-// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
-// CHECK: [[TMP19:%.*]] = load %struct.int16x8x4_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.int16x8x4_t [[TMP19]]
+// CHECK-LABEL: define dso_local %struct.int16x8x4_t @test_vld4q_lane_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT16X8X4_T:%.*]], align 16
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT16X8X4_T]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT16X8X4_T]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT16X8X4_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 2
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 48
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 3
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[__S1]], ptr noundef nonnull align 16 dereferenceable(64) [[B]], i64 64, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 48
+// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT: [[VLD4_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4lane.v8i16.p0(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], i64 7, ptr [[A]])
+// CHECK-NEXT: [[VLD4_LANE_ELT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], 0
+// CHECK-NEXT: store <8 x i16> [[VLD4_LANE_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_LANE_ELT8:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], 1
+// CHECK-NEXT: store <8 x i16> [[VLD4_LANE_ELT8]], ptr [[__RET_REPACK7]], align 16
+// CHECK-NEXT: [[__RET_REPACK9:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD4_LANE_ELT10:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], 2
+// CHECK-NEXT: store <8 x i16> [[VLD4_LANE_ELT10]], ptr [[__RET_REPACK9]], align 16
+// CHECK-NEXT: [[__RET_REPACK11:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 48
+// CHECK-NEXT: [[VLD4_LANE_ELT12:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], 3
+// CHECK-NEXT: store <8 x i16> [[VLD4_LANE_ELT12]], ptr [[__RET_REPACK11]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(64) [[__RET]], i64 64, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <8 x i16>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [4 x <8 x i16>] poison, <8 x i16> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT13:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK14:%.*]] = load <8 x i16>, ptr [[DOTUNPACK_ELT13]], align 16
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [4 x <8 x i16>] [[TMP4]], <8 x i16> [[DOTUNPACK_UNPACK14]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT15:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK16:%.*]] = load <8 x i16>, ptr [[DOTUNPACK_ELT15]], align 16
+// CHECK-NEXT: [[TMP6:%.*]] = insertvalue [4 x <8 x i16>] [[TMP5]], <8 x i16> [[DOTUNPACK_UNPACK16]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT17:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 48
+// CHECK-NEXT: [[DOTUNPACK_UNPACK18:%.*]] = load <8 x i16>, ptr [[DOTUNPACK_ELT17]], align 16
+// CHECK-NEXT: [[DOTUNPACK19:%.*]] = insertvalue [4 x <8 x i16>] [[TMP6]], <8 x i16> [[DOTUNPACK_UNPACK18]], 3
+// CHECK-NEXT: [[TMP7:%.*]] = insertvalue [[STRUCT_INT16X8X4_T]] poison, [4 x <8 x i16>] [[DOTUNPACK19]], 0
+// CHECK-NEXT: ret [[STRUCT_INT16X8X4_T]] [[TMP7]]
+//
int16x8x4_t test_vld4q_lane_s16(int16_t *a, int16x8x4_t b) {
return vld4q_lane_s16(a, b, 7);
}
-// CHECK-LABEL: define{{.*}} %struct.int32x4x4_t @test_vld4q_lane_s32(ptr noundef %a, [4 x <4 x i32>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x4_t, align 16
-// CHECK: [[B:%.*]] = alloca %struct.int32x4x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP10:%.*]] = load <4 x i32>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
-// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x i32>
-// CHECK: [[VLD4_LANE:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4lane.v4i32.p0(<4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], i64 3, ptr %a)
-// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
-// CHECK: [[TMP19:%.*]] = load %struct.int32x4x4_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.int32x4x4_t [[TMP19]]
+// CHECK-LABEL: define dso_local %struct.int32x4x4_t @test_vld4q_lane_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <4 x i32>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT32X4X4_T:%.*]], align 16
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT32X4X4_T]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT32X4X4_T]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT32X4X4_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <4 x i32>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <4 x i32>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <4 x i32>] [[B_COERCE]], 2
+// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 48
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <4 x i32>] [[B_COERCE]], 3
+// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[__S1]], ptr noundef nonnull align 16 dereferenceable(64) [[B]], i64 64, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 48
+// CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT: [[VLD4_LANE:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4lane.v4i32.p0(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], i64 3, ptr [[A]])
+// CHECK-NEXT: [[VLD4_LANE_ELT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4_LANE]], 0
+// CHECK-NEXT: store <4 x i32> [[VLD4_LANE_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_LANE_ELT8:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4_LANE]], 1
+// CHECK-NEXT: store <4 x i32> [[VLD4_LANE_ELT8]], ptr [[__RET_REPACK7]], align 16
+// CHECK-NEXT: [[__RET_REPACK9:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD4_LANE_ELT10:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4_LANE]], 2
+// CHECK-NEXT: store <4 x i32> [[VLD4_LANE_ELT10]], ptr [[__RET_REPACK9]], align 16
+// CHECK-NEXT: [[__RET_REPACK11:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 48
+// CHECK-NEXT: [[VLD4_LANE_ELT12:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4_LANE]], 3
+// CHECK-NEXT: store <4 x i32> [[VLD4_LANE_ELT12]], ptr [[__RET_REPACK11]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(64) [[__RET]], i64 64, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <4 x i32>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [4 x <4 x i32>] poison, <4 x i32> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT13:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK14:%.*]] = load <4 x i32>, ptr [[DOTUNPACK_ELT13]], align 16
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [4 x <4 x i32>] [[TMP4]], <4 x i32> [[DOTUNPACK_UNPACK14]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT15:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK16:%.*]] = load <4 x i32>, ptr [[DOTUNPACK_ELT15]], align 16
+// CHECK-NEXT: [[TMP6:%.*]] = insertvalue [4 x <4 x i32>] [[TMP5]], <4 x i32> [[DOTUNPACK_UNPACK16]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT17:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 48
+// CHECK-NEXT: [[DOTUNPACK_UNPACK18:%.*]] = load <4 x i32>, ptr [[DOTUNPACK_ELT17]], align 16
+// CHECK-NEXT: [[DOTUNPACK19:%.*]] = insertvalue [4 x <4 x i32>] [[TMP6]], <4 x i32> [[DOTUNPACK_UNPACK18]], 3
+// CHECK-NEXT: [[TMP7:%.*]] = insertvalue [[STRUCT_INT32X4X4_T]] poison, [4 x <4 x i32>] [[DOTUNPACK19]], 0
+// CHECK-NEXT: ret [[STRUCT_INT32X4X4_T]] [[TMP7]]
+//
int32x4x4_t test_vld4q_lane_s32(int32_t *a, int32x4x4_t b) {
return vld4q_lane_s32(a, b, 3);
}
-// CHECK-LABEL: define{{.*}} %struct.int64x2x4_t @test_vld4q_lane_s64(ptr noundef %a, [4 x <2 x i64>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x2x4_t, align 16
-// CHECK: [[B:%.*]] = alloca %struct.int64x2x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int64x2x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int64x2x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x2x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <2 x i64> [[TMP8]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP10:%.*]] = load <2 x i64>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP11:%.*]] = bitcast <2 x i64> [[TMP10]] to <16 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x i64>
-// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <2 x i64>
-// CHECK: [[VLD4_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4lane.v2i64.p0(<2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], <2 x i64> [[TMP15]], i64 1, ptr %a)
-// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
-// CHECK: [[TMP19:%.*]] = load %struct.int64x2x4_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.int64x2x4_t [[TMP19]]
+// CHECK-LABEL: define dso_local %struct.int64x2x4_t @test_vld4q_lane_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT64X2X4_T:%.*]], align 16
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT64X2X4_T]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT64X2X4_T]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT64X2X4_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 2
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 48
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 3
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[__S1]], ptr noundef nonnull align 16 dereferenceable(64) [[B]], i64 64, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 48
+// CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT: [[VLD4_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4lane.v2i64.p0(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], i64 1, ptr [[A]])
+// CHECK-NEXT: [[VLD4_LANE_ELT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4_LANE]], 0
+// CHECK-NEXT: store <2 x i64> [[VLD4_LANE_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_LANE_ELT8:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4_LANE]], 1
+// CHECK-NEXT: store <2 x i64> [[VLD4_LANE_ELT8]], ptr [[__RET_REPACK7]], align 16
+// CHECK-NEXT: [[__RET_REPACK9:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD4_LANE_ELT10:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4_LANE]], 2
+// CHECK-NEXT: store <2 x i64> [[VLD4_LANE_ELT10]], ptr [[__RET_REPACK9]], align 16
+// CHECK-NEXT: [[__RET_REPACK11:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 48
+// CHECK-NEXT: [[VLD4_LANE_ELT12:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4_LANE]], 3
+// CHECK-NEXT: store <2 x i64> [[VLD4_LANE_ELT12]], ptr [[__RET_REPACK11]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(64) [[__RET]], i64 64, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [4 x <2 x i64>] poison, <2 x i64> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT13:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK14:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT13]], align 16
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [4 x <2 x i64>] [[TMP4]], <2 x i64> [[DOTUNPACK_UNPACK14]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT15:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK16:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT15]], align 16
+// CHECK-NEXT: [[TMP6:%.*]] = insertvalue [4 x <2 x i64>] [[TMP5]], <2 x i64> [[DOTUNPACK_UNPACK16]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT17:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 48
+// CHECK-NEXT: [[DOTUNPACK_UNPACK18:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT17]], align 16
+// CHECK-NEXT: [[DOTUNPACK19:%.*]] = insertvalue [4 x <2 x i64>] [[TMP6]], <2 x i64> [[DOTUNPACK_UNPACK18]], 3
+// CHECK-NEXT: [[TMP7:%.*]] = insertvalue [[STRUCT_INT64X2X4_T]] poison, [4 x <2 x i64>] [[DOTUNPACK19]], 0
+// CHECK-NEXT: ret [[STRUCT_INT64X2X4_T]] [[TMP7]]
+//
int64x2x4_t test_vld4q_lane_s64(int64_t *a, int64x2x4_t b) {
return vld4q_lane_s64(a, b, 1);
}
-// CHECK-LABEL: define{{.*}} %struct.float16x8x4_t @test_vld4q_lane_f16(ptr noundef %a, [4 x <8 x half>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x8x4_t, align 16
-// CHECK: [[B:%.*]] = alloca %struct.float16x8x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <8 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <8 x half>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP10:%.*]] = load <8 x half>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP11:%.*]] = bitcast <8 x half> [[TMP10]] to <16 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x half>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x half>
-// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x half>
-// CHECK: [[VLD4_LANE:%.*]] = call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4lane.v8f16.p0(<8 x half> [[TMP12]], <8 x half> [[TMP13]], <8 x half> [[TMP14]], <8 x half> [[TMP15]], i64 7, ptr %a)
-// CHECK: store { <8 x half>, <8 x half>, <8 x half>, <8 x half> } [[VLD4_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
-// CHECK: [[TMP19:%.*]] = load %struct.float16x8x4_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.float16x8x4_t [[TMP19]]
+// CHECK-LABEL: define dso_local %struct.float16x8x4_t @test_vld4q_lane_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <8 x half>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT16X8X4_T:%.*]], align 16
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT16X8X4_T]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT16X8X4_T]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X8X4_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <8 x half>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x half> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <8 x half>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x half> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <8 x half>] [[B_COERCE]], 2
+// CHECK-NEXT: store <8 x half> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 48
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <8 x half>] [[B_COERCE]], 3
+// CHECK-NEXT: store <8 x half> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[__S1]], ptr noundef nonnull align 16 dereferenceable(64) [[B]], i64 64, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x half>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x half>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 48
+// CHECK-NEXT: [[TMP3:%.*]] = load <8 x half>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT: [[VLD4_LANE:%.*]] = call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4lane.v8f16.p0(<8 x half> [[TMP0]], <8 x half> [[TMP1]], <8 x half> [[TMP2]], <8 x half> [[TMP3]], i64 7, ptr [[A]])
+// CHECK-NEXT: [[VLD4_LANE_ELT:%.*]] = extractvalue { <8 x half>, <8 x half>, <8 x half>, <8 x half> } [[VLD4_LANE]], 0
+// CHECK-NEXT: store <8 x half> [[VLD4_LANE_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_LANE_ELT8:%.*]] = extractvalue { <8 x half>, <8 x half>, <8 x half>, <8 x half> } [[VLD4_LANE]], 1
+// CHECK-NEXT: store <8 x half> [[VLD4_LANE_ELT8]], ptr [[__RET_REPACK7]], align 16
+// CHECK-NEXT: [[__RET_REPACK9:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD4_LANE_ELT10:%.*]] = extractvalue { <8 x half>, <8 x half>, <8 x half>, <8 x half> } [[VLD4_LANE]], 2
+// CHECK-NEXT: store <8 x half> [[VLD4_LANE_ELT10]], ptr [[__RET_REPACK9]], align 16
+// CHECK-NEXT: [[__RET_REPACK11:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 48
+// CHECK-NEXT: [[VLD4_LANE_ELT12:%.*]] = extractvalue { <8 x half>, <8 x half>, <8 x half>, <8 x half> } [[VLD4_LANE]], 3
+// CHECK-NEXT: store <8 x half> [[VLD4_LANE_ELT12]], ptr [[__RET_REPACK11]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(64) [[__RET]], i64 64, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <8 x half>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [4 x <8 x half>] poison, <8 x half> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT13:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK14:%.*]] = load <8 x half>, ptr [[DOTUNPACK_ELT13]], align 16
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [4 x <8 x half>] [[TMP4]], <8 x half> [[DOTUNPACK_UNPACK14]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT15:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK16:%.*]] = load <8 x half>, ptr [[DOTUNPACK_ELT15]], align 16
+// CHECK-NEXT: [[TMP6:%.*]] = insertvalue [4 x <8 x half>] [[TMP5]], <8 x half> [[DOTUNPACK_UNPACK16]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT17:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 48
+// CHECK-NEXT: [[DOTUNPACK_UNPACK18:%.*]] = load <8 x half>, ptr [[DOTUNPACK_ELT17]], align 16
+// CHECK-NEXT: [[DOTUNPACK19:%.*]] = insertvalue [4 x <8 x half>] [[TMP6]], <8 x half> [[DOTUNPACK_UNPACK18]], 3
+// CHECK-NEXT: [[TMP7:%.*]] = insertvalue [[STRUCT_FLOAT16X8X4_T]] poison, [4 x <8 x half>] [[DOTUNPACK19]], 0
+// CHECK-NEXT: ret [[STRUCT_FLOAT16X8X4_T]] [[TMP7]]
+//
float16x8x4_t test_vld4q_lane_f16(float16_t *a, float16x8x4_t b) {
return vld4q_lane_f16(a, b, 7);
}
-// CHECK-LABEL: define{{.*}} %struct.float32x4x4_t @test_vld4q_lane_f32(ptr noundef %a, [4 x <4 x float>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x4_t, align 16
-// CHECK: [[B:%.*]] = alloca %struct.float32x4x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <4 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <4 x float>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <4 x float> [[TMP8]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP10:%.*]] = load <4 x float>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP11:%.*]] = bitcast <4 x float> [[TMP10]] to <16 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x float>
-// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x float>
-// CHECK: [[VLD4_LANE:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld4lane.v4f32.p0(<4 x float> [[TMP12]], <4 x float> [[TMP13]], <4 x float> [[TMP14]], <4 x float> [[TMP15]], i64 3, ptr %a)
-// CHECK: store { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
-// CHECK: [[TMP19:%.*]] = load %struct.float32x4x4_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.float32x4x4_t [[TMP19]]
+// CHECK-LABEL: define dso_local %struct.float32x4x4_t @test_vld4q_lane_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <4 x float>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT32X4X4_T:%.*]], align 16
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT32X4X4_T]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT32X4X4_T]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X4X4_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <4 x float>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x float> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <4 x float>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x float> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <4 x float>] [[B_COERCE]], 2
+// CHECK-NEXT: store <4 x float> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 48
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <4 x float>] [[B_COERCE]], 3
+// CHECK-NEXT: store <4 x float> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[__S1]], ptr noundef nonnull align 16 dereferenceable(64) [[B]], i64 64, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 48
+// CHECK-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT: [[VLD4_LANE:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld4lane.v4f32.p0(<4 x float> [[TMP0]], <4 x float> [[TMP1]], <4 x float> [[TMP2]], <4 x float> [[TMP3]], i64 3, ptr [[A]])
+// CHECK-NEXT: [[VLD4_LANE_ELT:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4_LANE]], 0
+// CHECK-NEXT: store <4 x float> [[VLD4_LANE_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_LANE_ELT8:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4_LANE]], 1
+// CHECK-NEXT: store <4 x float> [[VLD4_LANE_ELT8]], ptr [[__RET_REPACK7]], align 16
+// CHECK-NEXT: [[__RET_REPACK9:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD4_LANE_ELT10:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4_LANE]], 2
+// CHECK-NEXT: store <4 x float> [[VLD4_LANE_ELT10]], ptr [[__RET_REPACK9]], align 16
+// CHECK-NEXT: [[__RET_REPACK11:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 48
+// CHECK-NEXT: [[VLD4_LANE_ELT12:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4_LANE]], 3
+// CHECK-NEXT: store <4 x float> [[VLD4_LANE_ELT12]], ptr [[__RET_REPACK11]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(64) [[__RET]], i64 64, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <4 x float>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [4 x <4 x float>] poison, <4 x float> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT13:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK14:%.*]] = load <4 x float>, ptr [[DOTUNPACK_ELT13]], align 16
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [4 x <4 x float>] [[TMP4]], <4 x float> [[DOTUNPACK_UNPACK14]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT15:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK16:%.*]] = load <4 x float>, ptr [[DOTUNPACK_ELT15]], align 16
+// CHECK-NEXT: [[TMP6:%.*]] = insertvalue [4 x <4 x float>] [[TMP5]], <4 x float> [[DOTUNPACK_UNPACK16]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT17:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 48
+// CHECK-NEXT: [[DOTUNPACK_UNPACK18:%.*]] = load <4 x float>, ptr [[DOTUNPACK_ELT17]], align 16
+// CHECK-NEXT: [[DOTUNPACK19:%.*]] = insertvalue [4 x <4 x float>] [[TMP6]], <4 x float> [[DOTUNPACK_UNPACK18]], 3
+// CHECK-NEXT: [[TMP7:%.*]] = insertvalue [[STRUCT_FLOAT32X4X4_T]] poison, [4 x <4 x float>] [[DOTUNPACK19]], 0
+// CHECK-NEXT: ret [[STRUCT_FLOAT32X4X4_T]] [[TMP7]]
+//
float32x4x4_t test_vld4q_lane_f32(float32_t *a, float32x4x4_t b) {
return vld4q_lane_f32(a, b, 3);
}
-// CHECK-LABEL: define{{.*}} %struct.float64x2x4_t @test_vld4q_lane_f64(ptr noundef %a, [4 x <2 x double>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x2x4_t, align 16
-// CHECK: [[B:%.*]] = alloca %struct.float64x2x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.float64x2x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float64x2x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x2x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <2 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x double>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <2 x double>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <2 x double> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x double>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <2 x double> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x double>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <2 x double>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <2 x double> [[TMP8]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x double>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP10:%.*]] = load <2 x double>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP11:%.*]] = bitcast <2 x double> [[TMP10]] to <16 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x double>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x double>
-// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <2 x double>
-// CHECK: [[VLD4_LANE:%.*]] = call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld4lane.v2f64.p0(<2 x double> [[TMP12]], <2 x double> [[TMP13]], <2 x double> [[TMP14]], <2 x double> [[TMP15]], i64 1, ptr %a)
-// CHECK: store { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD4_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
-// CHECK: [[TMP19:%.*]] = load %struct.float64x2x4_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.float64x2x4_t [[TMP19]]
+// CHECK-LABEL: define dso_local %struct.float64x2x4_t @test_vld4q_lane_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <2 x double>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT64X2X4_T:%.*]], align 16
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT64X2X4_T]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT64X2X4_T]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT64X2X4_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <2 x double>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x double> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <2 x double>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x double> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <2 x double>] [[B_COERCE]], 2
+// CHECK-NEXT: store <2 x double> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 48
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <2 x double>] [[B_COERCE]], 3
+// CHECK-NEXT: store <2 x double> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[__S1]], ptr noundef nonnull align 16 dereferenceable(64) [[B]], i64 64, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 48
+// CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT: [[VLD4_LANE:%.*]] = call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld4lane.v2f64.p0(<2 x double> [[TMP0]], <2 x double> [[TMP1]], <2 x double> [[TMP2]], <2 x double> [[TMP3]], i64 1, ptr [[A]])
+// CHECK-NEXT: [[VLD4_LANE_ELT:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD4_LANE]], 0
+// CHECK-NEXT: store <2 x double> [[VLD4_LANE_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_LANE_ELT8:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD4_LANE]], 1
+// CHECK-NEXT: store <2 x double> [[VLD4_LANE_ELT8]], ptr [[__RET_REPACK7]], align 16
+// CHECK-NEXT: [[__RET_REPACK9:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD4_LANE_ELT10:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD4_LANE]], 2
+// CHECK-NEXT: store <2 x double> [[VLD4_LANE_ELT10]], ptr [[__RET_REPACK9]], align 16
+// CHECK-NEXT: [[__RET_REPACK11:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 48
+// CHECK-NEXT: [[VLD4_LANE_ELT12:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD4_LANE]], 3
+// CHECK-NEXT: store <2 x double> [[VLD4_LANE_ELT12]], ptr [[__RET_REPACK11]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(64) [[__RET]], i64 64, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x double>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [4 x <2 x double>] poison, <2 x double> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT13:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK14:%.*]] = load <2 x double>, ptr [[DOTUNPACK_ELT13]], align 16
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [4 x <2 x double>] [[TMP4]], <2 x double> [[DOTUNPACK_UNPACK14]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT15:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK16:%.*]] = load <2 x double>, ptr [[DOTUNPACK_ELT15]], align 16
+// CHECK-NEXT: [[TMP6:%.*]] = insertvalue [4 x <2 x double>] [[TMP5]], <2 x double> [[DOTUNPACK_UNPACK16]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT17:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 48
+// CHECK-NEXT: [[DOTUNPACK_UNPACK18:%.*]] = load <2 x double>, ptr [[DOTUNPACK_ELT17]], align 16
+// CHECK-NEXT: [[DOTUNPACK19:%.*]] = insertvalue [4 x <2 x double>] [[TMP6]], <2 x double> [[DOTUNPACK_UNPACK18]], 3
+// CHECK-NEXT: [[TMP7:%.*]] = insertvalue [[STRUCT_FLOAT64X2X4_T]] poison, [4 x <2 x double>] [[DOTUNPACK19]], 0
+// CHECK-NEXT: ret [[STRUCT_FLOAT64X2X4_T]] [[TMP7]]
+//
float64x2x4_t test_vld4q_lane_f64(float64_t *a, float64x2x4_t b) {
return vld4q_lane_f64(a, b, 1);
}
-// CHECK-LABEL: define{{.*}} %struct.poly8x16x4_t @test_vld4q_lane_p8(ptr noundef %a, [4 x <16 x i8>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x4_t, align 16
-// CHECK: [[B:%.*]] = alloca %struct.poly8x16x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP5:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP6:%.*]] = load <16 x i8>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[VLD4_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4lane.v16i8.p0(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], <16 x i8> [[TMP6]], i64 15, ptr %a)
-// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
-// CHECK: [[TMP10:%.*]] = load %struct.poly8x16x4_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.poly8x16x4_t [[TMP10]]
+// CHECK-LABEL: define dso_local %struct.poly8x16x4_t @test_vld4q_lane_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY8X16X4_T:%.*]], align 16
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY8X16X4_T]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY8X16X4_T]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY8X16X4_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 2
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 48
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 3
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[__S1]], ptr noundef nonnull align 16 dereferenceable(64) [[B]], i64 64, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 48
+// CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT: [[VLD4_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4lane.v16i8.p0(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i64 15, ptr [[A]])
+// CHECK-NEXT: [[VLD4_LANE_ELT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4_LANE]], 0
+// CHECK-NEXT: store <16 x i8> [[VLD4_LANE_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_LANE_ELT8:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4_LANE]], 1
+// CHECK-NEXT: store <16 x i8> [[VLD4_LANE_ELT8]], ptr [[__RET_REPACK7]], align 16
+// CHECK-NEXT: [[__RET_REPACK9:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD4_LANE_ELT10:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4_LANE]], 2
+// CHECK-NEXT: store <16 x i8> [[VLD4_LANE_ELT10]], ptr [[__RET_REPACK9]], align 16
+// CHECK-NEXT: [[__RET_REPACK11:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 48
+// CHECK-NEXT: [[VLD4_LANE_ELT12:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4_LANE]], 3
+// CHECK-NEXT: store <16 x i8> [[VLD4_LANE_ELT12]], ptr [[__RET_REPACK11]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(64) [[__RET]], i64 64, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <16 x i8>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [4 x <16 x i8>] poison, <16 x i8> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT13:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK14:%.*]] = load <16 x i8>, ptr [[DOTUNPACK_ELT13]], align 16
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [4 x <16 x i8>] [[TMP4]], <16 x i8> [[DOTUNPACK_UNPACK14]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT15:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK16:%.*]] = load <16 x i8>, ptr [[DOTUNPACK_ELT15]], align 16
+// CHECK-NEXT: [[TMP6:%.*]] = insertvalue [4 x <16 x i8>] [[TMP5]], <16 x i8> [[DOTUNPACK_UNPACK16]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT17:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 48
+// CHECK-NEXT: [[DOTUNPACK_UNPACK18:%.*]] = load <16 x i8>, ptr [[DOTUNPACK_ELT17]], align 16
+// CHECK-NEXT: [[DOTUNPACK19:%.*]] = insertvalue [4 x <16 x i8>] [[TMP6]], <16 x i8> [[DOTUNPACK_UNPACK18]], 3
+// CHECK-NEXT: [[TMP7:%.*]] = insertvalue [[STRUCT_POLY8X16X4_T]] poison, [4 x <16 x i8>] [[DOTUNPACK19]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY8X16X4_T]] [[TMP7]]
+//
poly8x16x4_t test_vld4q_lane_p8(poly8_t *a, poly8x16x4_t b) {
return vld4q_lane_p8(a, b, 15);
}
-// CHECK-LABEL: define{{.*}} %struct.poly16x8x4_t @test_vld4q_lane_p16(ptr noundef %a, [4 x <8 x i16>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x4_t, align 16
-// CHECK: [[B:%.*]] = alloca %struct.poly16x8x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP10:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
-// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
-// CHECK: [[VLD4_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4lane.v8i16.p0(<8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i64 7, ptr %a)
-// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
-// CHECK: [[TMP19:%.*]] = load %struct.poly16x8x4_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.poly16x8x4_t [[TMP19]]
+// CHECK-LABEL: define dso_local %struct.poly16x8x4_t @test_vld4q_lane_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY16X8X4_T:%.*]], align 16
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY16X8X4_T]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY16X8X4_T]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY16X8X4_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 2
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 48
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 3
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[__S1]], ptr noundef nonnull align 16 dereferenceable(64) [[B]], i64 64, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 48
+// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT: [[VLD4_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4lane.v8i16.p0(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], i64 7, ptr [[A]])
+// CHECK-NEXT: [[VLD4_LANE_ELT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], 0
+// CHECK-NEXT: store <8 x i16> [[VLD4_LANE_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_LANE_ELT8:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], 1
+// CHECK-NEXT: store <8 x i16> [[VLD4_LANE_ELT8]], ptr [[__RET_REPACK7]], align 16
+// CHECK-NEXT: [[__RET_REPACK9:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD4_LANE_ELT10:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], 2
+// CHECK-NEXT: store <8 x i16> [[VLD4_LANE_ELT10]], ptr [[__RET_REPACK9]], align 16
+// CHECK-NEXT: [[__RET_REPACK11:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 48
+// CHECK-NEXT: [[VLD4_LANE_ELT12:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], 3
+// CHECK-NEXT: store <8 x i16> [[VLD4_LANE_ELT12]], ptr [[__RET_REPACK11]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(64) [[__RET]], i64 64, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <8 x i16>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [4 x <8 x i16>] poison, <8 x i16> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT13:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK14:%.*]] = load <8 x i16>, ptr [[DOTUNPACK_ELT13]], align 16
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [4 x <8 x i16>] [[TMP4]], <8 x i16> [[DOTUNPACK_UNPACK14]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT15:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK16:%.*]] = load <8 x i16>, ptr [[DOTUNPACK_ELT15]], align 16
+// CHECK-NEXT: [[TMP6:%.*]] = insertvalue [4 x <8 x i16>] [[TMP5]], <8 x i16> [[DOTUNPACK_UNPACK16]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT17:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 48
+// CHECK-NEXT: [[DOTUNPACK_UNPACK18:%.*]] = load <8 x i16>, ptr [[DOTUNPACK_ELT17]], align 16
+// CHECK-NEXT: [[DOTUNPACK19:%.*]] = insertvalue [4 x <8 x i16>] [[TMP6]], <8 x i16> [[DOTUNPACK_UNPACK18]], 3
+// CHECK-NEXT: [[TMP7:%.*]] = insertvalue [[STRUCT_POLY16X8X4_T]] poison, [4 x <8 x i16>] [[DOTUNPACK19]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY16X8X4_T]] [[TMP7]]
+//
poly16x8x4_t test_vld4q_lane_p16(poly16_t *a, poly16x8x4_t b) {
return vld4q_lane_p16(a, b, 7);
}
-// CHECK-LABEL: define{{.*}} %struct.poly64x2x4_t @test_vld4q_lane_p64(ptr noundef %a, [4 x <2 x i64>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x2x4_t, align 16
-// CHECK: [[B:%.*]] = alloca %struct.poly64x2x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.poly64x2x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly64x2x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x2x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <2 x i64> [[TMP8]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP10:%.*]] = load <2 x i64>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP11:%.*]] = bitcast <2 x i64> [[TMP10]] to <16 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x i64>
-// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <2 x i64>
-// CHECK: [[VLD4_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4lane.v2i64.p0(<2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], <2 x i64> [[TMP15]], i64 1, ptr %a)
-// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
-// CHECK: [[TMP19:%.*]] = load %struct.poly64x2x4_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.poly64x2x4_t [[TMP19]]
+// CHECK-LABEL: define dso_local %struct.poly64x2x4_t @test_vld4q_lane_p64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY64X2X4_T:%.*]], align 16
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY64X2X4_T]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY64X2X4_T]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY64X2X4_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 2
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 48
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 3
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[__S1]], ptr noundef nonnull align 16 dereferenceable(64) [[B]], i64 64, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 48
+// CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT: [[VLD4_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4lane.v2i64.p0(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], i64 1, ptr [[A]])
+// CHECK-NEXT: [[VLD4_LANE_ELT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4_LANE]], 0
+// CHECK-NEXT: store <2 x i64> [[VLD4_LANE_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_LANE_ELT8:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4_LANE]], 1
+// CHECK-NEXT: store <2 x i64> [[VLD4_LANE_ELT8]], ptr [[__RET_REPACK7]], align 16
+// CHECK-NEXT: [[__RET_REPACK9:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD4_LANE_ELT10:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4_LANE]], 2
+// CHECK-NEXT: store <2 x i64> [[VLD4_LANE_ELT10]], ptr [[__RET_REPACK9]], align 16
+// CHECK-NEXT: [[__RET_REPACK11:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 48
+// CHECK-NEXT: [[VLD4_LANE_ELT12:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4_LANE]], 3
+// CHECK-NEXT: store <2 x i64> [[VLD4_LANE_ELT12]], ptr [[__RET_REPACK11]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(64) [[__RET]], i64 64, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [4 x <2 x i64>] poison, <2 x i64> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT13:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK14:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT13]], align 16
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [4 x <2 x i64>] [[TMP4]], <2 x i64> [[DOTUNPACK_UNPACK14]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT15:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK16:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT15]], align 16
+// CHECK-NEXT: [[TMP6:%.*]] = insertvalue [4 x <2 x i64>] [[TMP5]], <2 x i64> [[DOTUNPACK_UNPACK16]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT17:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 48
+// CHECK-NEXT: [[DOTUNPACK_UNPACK18:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT17]], align 16
+// CHECK-NEXT: [[DOTUNPACK19:%.*]] = insertvalue [4 x <2 x i64>] [[TMP6]], <2 x i64> [[DOTUNPACK_UNPACK18]], 3
+// CHECK-NEXT: [[TMP7:%.*]] = insertvalue [[STRUCT_POLY64X2X4_T]] poison, [4 x <2 x i64>] [[DOTUNPACK19]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY64X2X4_T]] [[TMP7]]
+//
poly64x2x4_t test_vld4q_lane_p64(poly64_t *a, poly64x2x4_t b) {
return vld4q_lane_p64(a, b, 1);
}
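The q-form tests above all use 128-bit vectors, so the [4 x <...>] coercions carry alignstack(16) and the aggregate copies are 64 bytes; the 64-bit d-register forms that follow drop to alignstack(8) and 32-byte copies. A sketch of the d-form, analogous to the q-form caller above (again illustrative; the helper name is a placeholder):

#include <arm_neon.h>

// 64-bit (d-register) variant: the uint8x8x4_t aggregate is 32 bytes
// with 8-byte alignment, matching the alignstack(8) checked below.
uint8x8x4_t update_lane7_u8(const uint8_t *p, uint8x8x4_t acc) {
  return vld4_lane_u8(p, acc, 7);
}
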
-// CHECK-LABEL: define{{.*}} %struct.uint8x8x4_t @test_vld4_lane_u8(ptr noundef %a, [4 x <8 x i8>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x4_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[VLD4_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4lane.v8i8.p0(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i64 7, ptr %a)
-// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP10:%.*]] = load %struct.uint8x8x4_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.uint8x8x4_t [[TMP10]]
+// CHECK-LABEL: define dso_local %struct.uint8x8x4_t @test_vld4_lane_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT8X8X4_T:%.*]], align 8
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT8X8X4_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT8X8X4_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT8X8X4_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 2
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 24
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 3
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[__S1]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 24
+// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: [[VLD4_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4lane.v8i8.p0(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i64 7, ptr [[A]])
+// CHECK-NEXT: [[VLD4_LANE_ELT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], 0
+// CHECK-NEXT: store <8 x i8> [[VLD4_LANE_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD4_LANE_ELT8:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], 1
+// CHECK-NEXT: store <8 x i8> [[VLD4_LANE_ELT8]], ptr [[__RET_REPACK7]], align 8
+// CHECK-NEXT: [[__RET_REPACK9:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_LANE_ELT10:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], 2
+// CHECK-NEXT: store <8 x i8> [[VLD4_LANE_ELT10]], ptr [[__RET_REPACK9]], align 8
+// CHECK-NEXT: [[__RET_REPACK11:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 24
+// CHECK-NEXT: [[VLD4_LANE_ELT12:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], 3
+// CHECK-NEXT: store <8 x i8> [[VLD4_LANE_ELT12]], ptr [[__RET_REPACK11]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <8 x i8>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [4 x <8 x i8>] poison, <8 x i8> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT13:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK14:%.*]] = load <8 x i8>, ptr [[DOTUNPACK_ELT13]], align 8
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [4 x <8 x i8>] [[TMP4]], <8 x i8> [[DOTUNPACK_UNPACK14]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT15:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK16:%.*]] = load <8 x i8>, ptr [[DOTUNPACK_ELT15]], align 8
+// CHECK-NEXT: [[TMP6:%.*]] = insertvalue [4 x <8 x i8>] [[TMP5]], <8 x i8> [[DOTUNPACK_UNPACK16]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT17:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 24
+// CHECK-NEXT: [[DOTUNPACK_UNPACK18:%.*]] = load <8 x i8>, ptr [[DOTUNPACK_ELT17]], align 8
+// CHECK-NEXT: [[DOTUNPACK19:%.*]] = insertvalue [4 x <8 x i8>] [[TMP6]], <8 x i8> [[DOTUNPACK_UNPACK18]], 3
+// CHECK-NEXT: [[TMP7:%.*]] = insertvalue [[STRUCT_UINT8X8X4_T]] poison, [4 x <8 x i8>] [[DOTUNPACK19]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT8X8X4_T]] [[TMP7]]
+//
uint8x8x4_t test_vld4_lane_u8(uint8_t *a, uint8x8x4_t b) {
return vld4_lane_u8(a, b, 7);
}
-// CHECK-LABEL: define{{.*}} %struct.uint16x4x4_t @test_vld4_lane_u16(ptr noundef %a, [4 x <4 x i16>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x4_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.uint16x4x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP10:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
-// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
-// CHECK: [[VLD4_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4lane.v4i16.p0(<4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i64 3, ptr %a)
-// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP19:%.*]] = load %struct.uint16x4x4_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.uint16x4x4_t [[TMP19]]
+// CHECK-LABEL: define dso_local %struct.uint16x4x4_t @test_vld4_lane_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT16X4X4_T:%.*]], align 8
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT16X4X4_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT16X4X4_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT16X4X4_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 2
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 24
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 3
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[__S1]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 24
+// CHECK-NEXT: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: [[VLD4_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4lane.v4i16.p0(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], i64 3, ptr [[A]])
+// CHECK-NEXT: [[VLD4_LANE_ELT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], 0
+// CHECK-NEXT: store <4 x i16> [[VLD4_LANE_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD4_LANE_ELT8:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], 1
+// CHECK-NEXT: store <4 x i16> [[VLD4_LANE_ELT8]], ptr [[__RET_REPACK7]], align 8
+// CHECK-NEXT: [[__RET_REPACK9:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_LANE_ELT10:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], 2
+// CHECK-NEXT: store <4 x i16> [[VLD4_LANE_ELT10]], ptr [[__RET_REPACK9]], align 8
+// CHECK-NEXT: [[__RET_REPACK11:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 24
+// CHECK-NEXT: [[VLD4_LANE_ELT12:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], 3
+// CHECK-NEXT: store <4 x i16> [[VLD4_LANE_ELT12]], ptr [[__RET_REPACK11]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <4 x i16>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [4 x <4 x i16>] poison, <4 x i16> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT13:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK14:%.*]] = load <4 x i16>, ptr [[DOTUNPACK_ELT13]], align 8
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [4 x <4 x i16>] [[TMP4]], <4 x i16> [[DOTUNPACK_UNPACK14]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT15:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK16:%.*]] = load <4 x i16>, ptr [[DOTUNPACK_ELT15]], align 8
+// CHECK-NEXT: [[TMP6:%.*]] = insertvalue [4 x <4 x i16>] [[TMP5]], <4 x i16> [[DOTUNPACK_UNPACK16]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT17:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 24
+// CHECK-NEXT: [[DOTUNPACK_UNPACK18:%.*]] = load <4 x i16>, ptr [[DOTUNPACK_ELT17]], align 8
+// CHECK-NEXT: [[DOTUNPACK19:%.*]] = insertvalue [4 x <4 x i16>] [[TMP6]], <4 x i16> [[DOTUNPACK_UNPACK18]], 3
+// CHECK-NEXT: [[TMP7:%.*]] = insertvalue [[STRUCT_UINT16X4X4_T]] poison, [4 x <4 x i16>] [[DOTUNPACK19]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT16X4X4_T]] [[TMP7]]
+//
uint16x4x4_t test_vld4_lane_u16(uint16_t *a, uint16x4x4_t b) {
return vld4_lane_u16(a, b, 3);
}
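For context on what the regenerated checks above exercise, here is a minimal usage sketch of the interleaved lane load, assuming an AArch64 target with <arm_neon.h> available; the helper name refresh_u16_lane is hypothetical and not part of this patch:

#include <arm_neon.h>

/* Hypothetical helper (illustration only): reload lane 3 of each of the
   four uint16x4_t vectors in `acc` from four consecutive uint16_t values
   at `p`. The lane index must be a compile-time constant in [0, 3]. */
static uint16x4x4_t refresh_u16_lane(uint16_t *p, uint16x4x4_t acc) {
  return vld4_lane_u16(p, acc, 3);
}

As the CHECK lines show, this lowers to a single @llvm.aarch64.neon.ld4lane.v4i16.p0 call, with the surrounding loads/stores implementing the ABI-mandated [4 x <4 x i16>] repacking of the struct argument and return value.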
-// CHECK-LABEL: define{{.*}} %struct.uint32x2x4_t @test_vld4_lane_u32(ptr noundef %a, [4 x <2 x i32>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x4_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.uint32x2x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP10:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP11:%.*]] = bitcast <2 x i32> [[TMP10]] to <8 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
-// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x i32>
-// CHECK: [[VLD4_LANE:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4lane.v2i32.p0(<2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], i64 1, ptr %a)
-// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP19:%.*]] = load %struct.uint32x2x4_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.uint32x2x4_t [[TMP19]]
+// CHECK-LABEL: define dso_local %struct.uint32x2x4_t @test_vld4_lane_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <2 x i32>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT32X2X4_T:%.*]], align 8
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT32X2X4_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT32X2X4_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT32X2X4_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <2 x i32>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <2 x i32>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <2 x i32>] [[B_COERCE]], 2
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 24
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <2 x i32>] [[B_COERCE]], 3
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[__S1]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 24
+// CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: [[VLD4_LANE:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4lane.v2i32.p0(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], i64 1, ptr [[A]])
+// CHECK-NEXT: [[VLD4_LANE_ELT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE]], 0
+// CHECK-NEXT: store <2 x i32> [[VLD4_LANE_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD4_LANE_ELT8:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE]], 1
+// CHECK-NEXT: store <2 x i32> [[VLD4_LANE_ELT8]], ptr [[__RET_REPACK7]], align 8
+// CHECK-NEXT: [[__RET_REPACK9:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_LANE_ELT10:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE]], 2
+// CHECK-NEXT: store <2 x i32> [[VLD4_LANE_ELT10]], ptr [[__RET_REPACK9]], align 8
+// CHECK-NEXT: [[__RET_REPACK11:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 24
+// CHECK-NEXT: [[VLD4_LANE_ELT12:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE]], 3
+// CHECK-NEXT: store <2 x i32> [[VLD4_LANE_ELT12]], ptr [[__RET_REPACK11]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x i32>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [4 x <2 x i32>] poison, <2 x i32> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT13:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK14:%.*]] = load <2 x i32>, ptr [[DOTUNPACK_ELT13]], align 8
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [4 x <2 x i32>] [[TMP4]], <2 x i32> [[DOTUNPACK_UNPACK14]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT15:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK16:%.*]] = load <2 x i32>, ptr [[DOTUNPACK_ELT15]], align 8
+// CHECK-NEXT: [[TMP6:%.*]] = insertvalue [4 x <2 x i32>] [[TMP5]], <2 x i32> [[DOTUNPACK_UNPACK16]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT17:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 24
+// CHECK-NEXT: [[DOTUNPACK_UNPACK18:%.*]] = load <2 x i32>, ptr [[DOTUNPACK_ELT17]], align 8
+// CHECK-NEXT: [[DOTUNPACK19:%.*]] = insertvalue [4 x <2 x i32>] [[TMP6]], <2 x i32> [[DOTUNPACK_UNPACK18]], 3
+// CHECK-NEXT: [[TMP7:%.*]] = insertvalue [[STRUCT_UINT32X2X4_T]] poison, [4 x <2 x i32>] [[DOTUNPACK19]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT32X2X4_T]] [[TMP7]]
+//
uint32x2x4_t test_vld4_lane_u32(uint32_t *a, uint32x2x4_t b) {
return vld4_lane_u32(a, b, 1);
}
-// CHECK-LABEL: define{{.*}} %struct.uint64x1x4_t @test_vld4_lane_u64(ptr noundef %a, [4 x <1 x i64>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x1x4_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.uint64x1x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP10:%.*]] = load <1 x i64>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
-// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
-// CHECK: [[VLD4_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4lane.v1i64.p0(<1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], <1 x i64> [[TMP15]], i64 0, ptr %a)
-// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP19:%.*]] = load %struct.uint64x1x4_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.uint64x1x4_t [[TMP19]]
+// CHECK-LABEL: define dso_local %struct.uint64x1x4_t @test_vld4_lane_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT64X1X4_T:%.*]], align 8
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT64X1X4_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT64X1X4_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT64X1X4_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 2
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 24
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 3
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[__S1]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 24
+// CHECK-NEXT: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: [[VLD4_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4lane.v1i64.p0(<1 x i64> [[TMP0]], <1 x i64> [[TMP1]], <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], i64 0, ptr [[A]])
+// CHECK-NEXT: [[VLD4_LANE_ELT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_LANE]], 0
+// CHECK-NEXT: store <1 x i64> [[VLD4_LANE_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD4_LANE_ELT8:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_LANE]], 1
+// CHECK-NEXT: store <1 x i64> [[VLD4_LANE_ELT8]], ptr [[__RET_REPACK7]], align 8
+// CHECK-NEXT: [[__RET_REPACK9:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_LANE_ELT10:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_LANE]], 2
+// CHECK-NEXT: store <1 x i64> [[VLD4_LANE_ELT10]], ptr [[__RET_REPACK9]], align 8
+// CHECK-NEXT: [[__RET_REPACK11:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 24
+// CHECK-NEXT: [[VLD4_LANE_ELT12:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_LANE]], 3
+// CHECK-NEXT: store <1 x i64> [[VLD4_LANE_ELT12]], ptr [[__RET_REPACK11]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <1 x i64>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [4 x <1 x i64>] poison, <1 x i64> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT13:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK14:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT13]], align 8
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [4 x <1 x i64>] [[TMP4]], <1 x i64> [[DOTUNPACK_UNPACK14]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT15:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK16:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT15]], align 8
+// CHECK-NEXT: [[TMP6:%.*]] = insertvalue [4 x <1 x i64>] [[TMP5]], <1 x i64> [[DOTUNPACK_UNPACK16]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT17:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 24
+// CHECK-NEXT: [[DOTUNPACK_UNPACK18:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT17]], align 8
+// CHECK-NEXT: [[DOTUNPACK19:%.*]] = insertvalue [4 x <1 x i64>] [[TMP6]], <1 x i64> [[DOTUNPACK_UNPACK18]], 3
+// CHECK-NEXT: [[TMP7:%.*]] = insertvalue [[STRUCT_UINT64X1X4_T]] poison, [4 x <1 x i64>] [[DOTUNPACK19]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT64X1X4_T]] [[TMP7]]
+//
uint64x1x4_t test_vld4_lane_u64(uint64_t *a, uint64x1x4_t b) {
return vld4_lane_u64(a, b, 0);
}
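The 64-bit variants differ only in that uint64x1_t/int64x1_t have a single lane, so 0 is the only legal lane index; that is why these tests pass `i64 0` to @llvm.aarch64.neon.ld4lane.v1i64.p0. A minimal sketch under the same AArch64 assumptions (helper name is illustrative):

#include <arm_neon.h>

/* For one-lane vectors the lane argument can only be 0; the intrinsic's
   constant-range check rejects any other value at compile time. */
static uint64x1x4_t load_u64_lane0(uint64_t *p, uint64x1x4_t acc) {
  return vld4_lane_u64(p, acc, 0);
}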
-// CHECK-LABEL: define{{.*}} %struct.int8x8x4_t @test_vld4_lane_s8(ptr noundef %a, [4 x <8 x i8>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x4_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.int8x8x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[VLD4_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4lane.v8i8.p0(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i64 7, ptr %a)
-// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP10:%.*]] = load %struct.int8x8x4_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.int8x8x4_t [[TMP10]]
+// CHECK-LABEL: define dso_local %struct.int8x8x4_t @test_vld4_lane_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT8X8X4_T:%.*]], align 8
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT8X8X4_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT8X8X4_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT8X8X4_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 2
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 24
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 3
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[__S1]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 24
+// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: [[VLD4_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4lane.v8i8.p0(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i64 7, ptr [[A]])
+// CHECK-NEXT: [[VLD4_LANE_ELT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], 0
+// CHECK-NEXT: store <8 x i8> [[VLD4_LANE_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD4_LANE_ELT8:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], 1
+// CHECK-NEXT: store <8 x i8> [[VLD4_LANE_ELT8]], ptr [[__RET_REPACK7]], align 8
+// CHECK-NEXT: [[__RET_REPACK9:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_LANE_ELT10:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], 2
+// CHECK-NEXT: store <8 x i8> [[VLD4_LANE_ELT10]], ptr [[__RET_REPACK9]], align 8
+// CHECK-NEXT: [[__RET_REPACK11:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 24
+// CHECK-NEXT: [[VLD4_LANE_ELT12:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], 3
+// CHECK-NEXT: store <8 x i8> [[VLD4_LANE_ELT12]], ptr [[__RET_REPACK11]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <8 x i8>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [4 x <8 x i8>] poison, <8 x i8> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT13:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK14:%.*]] = load <8 x i8>, ptr [[DOTUNPACK_ELT13]], align 8
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [4 x <8 x i8>] [[TMP4]], <8 x i8> [[DOTUNPACK_UNPACK14]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT15:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK16:%.*]] = load <8 x i8>, ptr [[DOTUNPACK_ELT15]], align 8
+// CHECK-NEXT: [[TMP6:%.*]] = insertvalue [4 x <8 x i8>] [[TMP5]], <8 x i8> [[DOTUNPACK_UNPACK16]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT17:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 24
+// CHECK-NEXT: [[DOTUNPACK_UNPACK18:%.*]] = load <8 x i8>, ptr [[DOTUNPACK_ELT17]], align 8
+// CHECK-NEXT: [[DOTUNPACK19:%.*]] = insertvalue [4 x <8 x i8>] [[TMP6]], <8 x i8> [[DOTUNPACK_UNPACK18]], 3
+// CHECK-NEXT: [[TMP7:%.*]] = insertvalue [[STRUCT_INT8X8X4_T]] poison, [4 x <8 x i8>] [[DOTUNPACK19]], 0
+// CHECK-NEXT: ret [[STRUCT_INT8X8X4_T]] [[TMP7]]
+//
int8x8x4_t test_vld4_lane_s8(int8_t *a, int8x8x4_t b) {
return vld4_lane_s8(a, b, 7);
}
-// CHECK-LABEL: define{{.*}} %struct.int16x4x4_t @test_vld4_lane_s16(ptr noundef %a, [4 x <4 x i16>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x4_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.int16x4x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP10:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
-// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
-// CHECK: [[VLD4_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4lane.v4i16.p0(<4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i64 3, ptr %a)
-// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP19:%.*]] = load %struct.int16x4x4_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.int16x4x4_t [[TMP19]]
+// CHECK-LABEL: define dso_local %struct.int16x4x4_t @test_vld4_lane_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT16X4X4_T:%.*]], align 8
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT16X4X4_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT16X4X4_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT16X4X4_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 2
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 24
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 3
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[__S1]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 24
+// CHECK-NEXT: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: [[VLD4_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4lane.v4i16.p0(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], i64 3, ptr [[A]])
+// CHECK-NEXT: [[VLD4_LANE_ELT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], 0
+// CHECK-NEXT: store <4 x i16> [[VLD4_LANE_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD4_LANE_ELT8:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], 1
+// CHECK-NEXT: store <4 x i16> [[VLD4_LANE_ELT8]], ptr [[__RET_REPACK7]], align 8
+// CHECK-NEXT: [[__RET_REPACK9:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_LANE_ELT10:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], 2
+// CHECK-NEXT: store <4 x i16> [[VLD4_LANE_ELT10]], ptr [[__RET_REPACK9]], align 8
+// CHECK-NEXT: [[__RET_REPACK11:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 24
+// CHECK-NEXT: [[VLD4_LANE_ELT12:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], 3
+// CHECK-NEXT: store <4 x i16> [[VLD4_LANE_ELT12]], ptr [[__RET_REPACK11]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <4 x i16>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [4 x <4 x i16>] poison, <4 x i16> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT13:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK14:%.*]] = load <4 x i16>, ptr [[DOTUNPACK_ELT13]], align 8
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [4 x <4 x i16>] [[TMP4]], <4 x i16> [[DOTUNPACK_UNPACK14]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT15:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK16:%.*]] = load <4 x i16>, ptr [[DOTUNPACK_ELT15]], align 8
+// CHECK-NEXT: [[TMP6:%.*]] = insertvalue [4 x <4 x i16>] [[TMP5]], <4 x i16> [[DOTUNPACK_UNPACK16]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT17:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 24
+// CHECK-NEXT: [[DOTUNPACK_UNPACK18:%.*]] = load <4 x i16>, ptr [[DOTUNPACK_ELT17]], align 8
+// CHECK-NEXT: [[DOTUNPACK19:%.*]] = insertvalue [4 x <4 x i16>] [[TMP6]], <4 x i16> [[DOTUNPACK_UNPACK18]], 3
+// CHECK-NEXT: [[TMP7:%.*]] = insertvalue [[STRUCT_INT16X4X4_T]] poison, [4 x <4 x i16>] [[DOTUNPACK19]], 0
+// CHECK-NEXT: ret [[STRUCT_INT16X4X4_T]] [[TMP7]]
+//
int16x4x4_t test_vld4_lane_s16(int16_t *a, int16x4x4_t b) {
return vld4_lane_s16(a, b, 3);
}
-// CHECK-LABEL: define{{.*}} %struct.int32x2x4_t @test_vld4_lane_s32(ptr noundef %a, [4 x <2 x i32>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x4_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.int32x2x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP10:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP11:%.*]] = bitcast <2 x i32> [[TMP10]] to <8 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
-// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x i32>
-// CHECK: [[VLD4_LANE:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4lane.v2i32.p0(<2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], i64 1, ptr %a)
-// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP19:%.*]] = load %struct.int32x2x4_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.int32x2x4_t [[TMP19]]
+// CHECK-LABEL: define dso_local %struct.int32x2x4_t @test_vld4_lane_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <2 x i32>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT32X2X4_T:%.*]], align 8
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT32X2X4_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT32X2X4_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT32X2X4_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <2 x i32>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <2 x i32>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <2 x i32>] [[B_COERCE]], 2
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 24
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <2 x i32>] [[B_COERCE]], 3
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[__S1]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 24
+// CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: [[VLD4_LANE:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4lane.v2i32.p0(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], i64 1, ptr [[A]])
+// CHECK-NEXT: [[VLD4_LANE_ELT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE]], 0
+// CHECK-NEXT: store <2 x i32> [[VLD4_LANE_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD4_LANE_ELT8:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE]], 1
+// CHECK-NEXT: store <2 x i32> [[VLD4_LANE_ELT8]], ptr [[__RET_REPACK7]], align 8
+// CHECK-NEXT: [[__RET_REPACK9:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_LANE_ELT10:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE]], 2
+// CHECK-NEXT: store <2 x i32> [[VLD4_LANE_ELT10]], ptr [[__RET_REPACK9]], align 8
+// CHECK-NEXT: [[__RET_REPACK11:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 24
+// CHECK-NEXT: [[VLD4_LANE_ELT12:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE]], 3
+// CHECK-NEXT: store <2 x i32> [[VLD4_LANE_ELT12]], ptr [[__RET_REPACK11]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x i32>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [4 x <2 x i32>] poison, <2 x i32> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT13:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK14:%.*]] = load <2 x i32>, ptr [[DOTUNPACK_ELT13]], align 8
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [4 x <2 x i32>] [[TMP4]], <2 x i32> [[DOTUNPACK_UNPACK14]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT15:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK16:%.*]] = load <2 x i32>, ptr [[DOTUNPACK_ELT15]], align 8
+// CHECK-NEXT: [[TMP6:%.*]] = insertvalue [4 x <2 x i32>] [[TMP5]], <2 x i32> [[DOTUNPACK_UNPACK16]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT17:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 24
+// CHECK-NEXT: [[DOTUNPACK_UNPACK18:%.*]] = load <2 x i32>, ptr [[DOTUNPACK_ELT17]], align 8
+// CHECK-NEXT: [[DOTUNPACK19:%.*]] = insertvalue [4 x <2 x i32>] [[TMP6]], <2 x i32> [[DOTUNPACK_UNPACK18]], 3
+// CHECK-NEXT: [[TMP7:%.*]] = insertvalue [[STRUCT_INT32X2X4_T]] poison, [4 x <2 x i32>] [[DOTUNPACK19]], 0
+// CHECK-NEXT: ret [[STRUCT_INT32X2X4_T]] [[TMP7]]
+//
int32x2x4_t test_vld4_lane_s32(int32_t *a, int32x2x4_t b) {
return vld4_lane_s32(a, b, 1);
}
-// CHECK-LABEL: define{{.*}} %struct.int64x1x4_t @test_vld4_lane_s64(ptr noundef %a, [4 x <1 x i64>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x1x4_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.int64x1x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP10:%.*]] = load <1 x i64>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
-// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
-// CHECK: [[VLD4_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4lane.v1i64.p0(<1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], <1 x i64> [[TMP15]], i64 0, ptr %a)
-// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP19:%.*]] = load %struct.int64x1x4_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.int64x1x4_t [[TMP19]]
+// CHECK-LABEL: define dso_local %struct.int64x1x4_t @test_vld4_lane_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT64X1X4_T:%.*]], align 8
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT64X1X4_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT64X1X4_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT64X1X4_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 2
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 24
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 3
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[__S1]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 24
+// CHECK-NEXT: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: [[VLD4_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4lane.v1i64.p0(<1 x i64> [[TMP0]], <1 x i64> [[TMP1]], <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], i64 0, ptr [[A]])
+// CHECK-NEXT: [[VLD4_LANE_ELT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_LANE]], 0
+// CHECK-NEXT: store <1 x i64> [[VLD4_LANE_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD4_LANE_ELT8:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_LANE]], 1
+// CHECK-NEXT: store <1 x i64> [[VLD4_LANE_ELT8]], ptr [[__RET_REPACK7]], align 8
+// CHECK-NEXT: [[__RET_REPACK9:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_LANE_ELT10:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_LANE]], 2
+// CHECK-NEXT: store <1 x i64> [[VLD4_LANE_ELT10]], ptr [[__RET_REPACK9]], align 8
+// CHECK-NEXT: [[__RET_REPACK11:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 24
+// CHECK-NEXT: [[VLD4_LANE_ELT12:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_LANE]], 3
+// CHECK-NEXT: store <1 x i64> [[VLD4_LANE_ELT12]], ptr [[__RET_REPACK11]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <1 x i64>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [4 x <1 x i64>] poison, <1 x i64> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT13:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK14:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT13]], align 8
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [4 x <1 x i64>] [[TMP4]], <1 x i64> [[DOTUNPACK_UNPACK14]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT15:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK16:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT15]], align 8
+// CHECK-NEXT: [[TMP6:%.*]] = insertvalue [4 x <1 x i64>] [[TMP5]], <1 x i64> [[DOTUNPACK_UNPACK16]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT17:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 24
+// CHECK-NEXT: [[DOTUNPACK_UNPACK18:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT17]], align 8
+// CHECK-NEXT: [[DOTUNPACK19:%.*]] = insertvalue [4 x <1 x i64>] [[TMP6]], <1 x i64> [[DOTUNPACK_UNPACK18]], 3
+// CHECK-NEXT: [[TMP7:%.*]] = insertvalue [[STRUCT_INT64X1X4_T]] poison, [4 x <1 x i64>] [[DOTUNPACK19]], 0
+// CHECK-NEXT: ret [[STRUCT_INT64X1X4_T]] [[TMP7]]
+//
int64x1x4_t test_vld4_lane_s64(int64_t *a, int64x1x4_t b) {
return vld4_lane_s64(a, b, 0);
}
-// CHECK-LABEL: define{{.*}} %struct.float16x4x4_t @test_vld4_lane_f16(ptr noundef %a, [4 x <4 x half>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x4x4_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.float16x4x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <4 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <4 x half>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <4 x half> [[TMP8]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP10:%.*]] = load <4 x half>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP11:%.*]] = bitcast <4 x half> [[TMP10]] to <8 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x half>
-// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x half>
-// CHECK: [[VLD4_LANE:%.*]] = call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4lane.v4f16.p0(<4 x half> [[TMP12]], <4 x half> [[TMP13]], <4 x half> [[TMP14]], <4 x half> [[TMP15]], i64 3, ptr %a)
-// CHECK: store { <4 x half>, <4 x half>, <4 x half>, <4 x half> } [[VLD4_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP19:%.*]] = load %struct.float16x4x4_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.float16x4x4_t [[TMP19]]
+// CHECK-LABEL: define dso_local %struct.float16x4x4_t @test_vld4_lane_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <4 x half>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT16X4X4_T:%.*]], align 8
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT16X4X4_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT16X4X4_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X4X4_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <4 x half>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x half> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <4 x half>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x half> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <4 x half>] [[B_COERCE]], 2
+// CHECK-NEXT: store <4 x half> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 24
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <4 x half>] [[B_COERCE]], 3
+// CHECK-NEXT: store <4 x half> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[__S1]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x half>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x half>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 24
+// CHECK-NEXT: [[TMP3:%.*]] = load <4 x half>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: [[VLD4_LANE:%.*]] = call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4lane.v4f16.p0(<4 x half> [[TMP0]], <4 x half> [[TMP1]], <4 x half> [[TMP2]], <4 x half> [[TMP3]], i64 3, ptr [[A]])
+// CHECK-NEXT: [[VLD4_LANE_ELT:%.*]] = extractvalue { <4 x half>, <4 x half>, <4 x half>, <4 x half> } [[VLD4_LANE]], 0
+// CHECK-NEXT: store <4 x half> [[VLD4_LANE_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD4_LANE_ELT8:%.*]] = extractvalue { <4 x half>, <4 x half>, <4 x half>, <4 x half> } [[VLD4_LANE]], 1
+// CHECK-NEXT: store <4 x half> [[VLD4_LANE_ELT8]], ptr [[__RET_REPACK7]], align 8
+// CHECK-NEXT: [[__RET_REPACK9:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_LANE_ELT10:%.*]] = extractvalue { <4 x half>, <4 x half>, <4 x half>, <4 x half> } [[VLD4_LANE]], 2
+// CHECK-NEXT: store <4 x half> [[VLD4_LANE_ELT10]], ptr [[__RET_REPACK9]], align 8
+// CHECK-NEXT: [[__RET_REPACK11:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 24
+// CHECK-NEXT: [[VLD4_LANE_ELT12:%.*]] = extractvalue { <4 x half>, <4 x half>, <4 x half>, <4 x half> } [[VLD4_LANE]], 3
+// CHECK-NEXT: store <4 x half> [[VLD4_LANE_ELT12]], ptr [[__RET_REPACK11]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <4 x half>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [4 x <4 x half>] poison, <4 x half> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT13:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK14:%.*]] = load <4 x half>, ptr [[DOTUNPACK_ELT13]], align 8
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [4 x <4 x half>] [[TMP4]], <4 x half> [[DOTUNPACK_UNPACK14]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT15:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK16:%.*]] = load <4 x half>, ptr [[DOTUNPACK_ELT15]], align 8
+// CHECK-NEXT: [[TMP6:%.*]] = insertvalue [4 x <4 x half>] [[TMP5]], <4 x half> [[DOTUNPACK_UNPACK16]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT17:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 24
+// CHECK-NEXT: [[DOTUNPACK_UNPACK18:%.*]] = load <4 x half>, ptr [[DOTUNPACK_ELT17]], align 8
+// CHECK-NEXT: [[DOTUNPACK19:%.*]] = insertvalue [4 x <4 x half>] [[TMP6]], <4 x half> [[DOTUNPACK_UNPACK18]], 3
+// CHECK-NEXT: [[TMP7:%.*]] = insertvalue [[STRUCT_FLOAT16X4X4_T]] poison, [4 x <4 x half>] [[DOTUNPACK19]], 0
+// CHECK-NEXT: ret [[STRUCT_FLOAT16X4X4_T]] [[TMP7]]
+//
float16x4x4_t test_vld4_lane_f16(float16_t *a, float16x4x4_t b) {
return vld4_lane_f16(a, b, 3);
}
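The half-precision variant follows the same repacking pattern with <4 x half> elements, lowering to @llvm.aarch64.neon.ld4lane.v4f16.p0 as the checks above verify. A minimal sketch, assuming the target provides float16_t storage support (helper name is illustrative):

#include <arm_neon.h>

/* Identical shape to the integer variants; only the element type differs. */
static float16x4x4_t load_f16_lane3(float16_t *p, float16x4x4_t acc) {
  return vld4_lane_f16(p, acc, 3);
}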
-// CHECK-LABEL: define{{.*}} %struct.float32x2x4_t @test_vld4_lane_f32(ptr noundef %a, [4 x <2 x float>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x4_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.float32x2x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <2 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <2 x float> [[TMP8]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP10:%.*]] = load <2 x float>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP11:%.*]] = bitcast <2 x float> [[TMP10]] to <8 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x float>
-// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x float>
-// CHECK: [[VLD4_LANE:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld4lane.v2f32.p0(<2 x float> [[TMP12]], <2 x float> [[TMP13]], <2 x float> [[TMP14]], <2 x float> [[TMP15]], i64 1, ptr %a)
-// CHECK: store { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP19:%.*]] = load %struct.float32x2x4_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.float32x2x4_t [[TMP19]]
+// CHECK-LABEL: define dso_local %struct.float32x2x4_t @test_vld4_lane_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <2 x float>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT32X2X4_T:%.*]], align 8
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT32X2X4_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT32X2X4_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X2X4_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <2 x float>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x float> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <2 x float>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x float> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <2 x float>] [[B_COERCE]], 2
+// CHECK-NEXT: store <2 x float> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 24
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <2 x float>] [[B_COERCE]], 3
+// CHECK-NEXT: store <2 x float> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[__S1]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 24
+// CHECK-NEXT: [[TMP3:%.*]] = load <2 x float>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: [[VLD4_LANE:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld4lane.v2f32.p0(<2 x float> [[TMP0]], <2 x float> [[TMP1]], <2 x float> [[TMP2]], <2 x float> [[TMP3]], i64 1, ptr [[A]])
+// CHECK-NEXT: [[VLD4_LANE_ELT:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4_LANE]], 0
+// CHECK-NEXT: store <2 x float> [[VLD4_LANE_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD4_LANE_ELT8:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4_LANE]], 1
+// CHECK-NEXT: store <2 x float> [[VLD4_LANE_ELT8]], ptr [[__RET_REPACK7]], align 8
+// CHECK-NEXT: [[__RET_REPACK9:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_LANE_ELT10:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4_LANE]], 2
+// CHECK-NEXT: store <2 x float> [[VLD4_LANE_ELT10]], ptr [[__RET_REPACK9]], align 8
+// CHECK-NEXT: [[__RET_REPACK11:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 24
+// CHECK-NEXT: [[VLD4_LANE_ELT12:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4_LANE]], 3
+// CHECK-NEXT: store <2 x float> [[VLD4_LANE_ELT12]], ptr [[__RET_REPACK11]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x float>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [4 x <2 x float>] poison, <2 x float> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT13:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK14:%.*]] = load <2 x float>, ptr [[DOTUNPACK_ELT13]], align 8
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [4 x <2 x float>] [[TMP4]], <2 x float> [[DOTUNPACK_UNPACK14]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT15:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK16:%.*]] = load <2 x float>, ptr [[DOTUNPACK_ELT15]], align 8
+// CHECK-NEXT: [[TMP6:%.*]] = insertvalue [4 x <2 x float>] [[TMP5]], <2 x float> [[DOTUNPACK_UNPACK16]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT17:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 24
+// CHECK-NEXT: [[DOTUNPACK_UNPACK18:%.*]] = load <2 x float>, ptr [[DOTUNPACK_ELT17]], align 8
+// CHECK-NEXT: [[DOTUNPACK19:%.*]] = insertvalue [4 x <2 x float>] [[TMP6]], <2 x float> [[DOTUNPACK_UNPACK18]], 3
+// CHECK-NEXT: [[TMP7:%.*]] = insertvalue [[STRUCT_FLOAT32X2X4_T]] poison, [4 x <2 x float>] [[DOTUNPACK19]], 0
+// CHECK-NEXT: ret [[STRUCT_FLOAT32X2X4_T]] [[TMP7]]
+//
float32x2x4_t test_vld4_lane_f32(float32_t *a, float32x2x4_t b) {
return vld4_lane_f32(a, b, 1);
}
-// CHECK-LABEL: define{{.*}} %struct.float64x1x4_t @test_vld4_lane_f64(ptr noundef %a, [4 x <1 x double>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x1x4_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.float64x1x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.float64x1x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float64x1x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x1x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <1 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x double>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <1 x double>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <1 x double> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x double>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <1 x double>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <1 x double> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x double>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <1 x double>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <1 x double> [[TMP8]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x double>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP10:%.*]] = load <1 x double>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP11:%.*]] = bitcast <1 x double> [[TMP10]] to <8 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x double>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x double>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x double>
-// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x double>
-// CHECK: [[VLD4_LANE:%.*]] = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld4lane.v1f64.p0(<1 x double> [[TMP12]], <1 x double> [[TMP13]], <1 x double> [[TMP14]], <1 x double> [[TMP15]], i64 0, ptr %a)
-// CHECK: store { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD4_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP19:%.*]] = load %struct.float64x1x4_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.float64x1x4_t [[TMP19]]
+// CHECK-LABEL: define dso_local %struct.float64x1x4_t @test_vld4_lane_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <1 x double>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT64X1X4_T:%.*]], align 8
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT64X1X4_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT64X1X4_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT64X1X4_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <1 x double>] [[B_COERCE]], 0
+// CHECK-NEXT: store <1 x double> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <1 x double>] [[B_COERCE]], 1
+// CHECK-NEXT: store <1 x double> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <1 x double>] [[B_COERCE]], 2
+// CHECK-NEXT: store <1 x double> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 24
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <1 x double>] [[B_COERCE]], 3
+// CHECK-NEXT: store <1 x double> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[__S1]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x double>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <1 x double>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <1 x double>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 24
+// CHECK-NEXT: [[TMP3:%.*]] = load <1 x double>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: [[VLD4_LANE:%.*]] = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld4lane.v1f64.p0(<1 x double> [[TMP0]], <1 x double> [[TMP1]], <1 x double> [[TMP2]], <1 x double> [[TMP3]], i64 0, ptr [[A]])
+// CHECK-NEXT: [[VLD4_LANE_ELT:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD4_LANE]], 0
+// CHECK-NEXT: store <1 x double> [[VLD4_LANE_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD4_LANE_ELT8:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD4_LANE]], 1
+// CHECK-NEXT: store <1 x double> [[VLD4_LANE_ELT8]], ptr [[__RET_REPACK7]], align 8
+// CHECK-NEXT: [[__RET_REPACK9:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_LANE_ELT10:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD4_LANE]], 2
+// CHECK-NEXT: store <1 x double> [[VLD4_LANE_ELT10]], ptr [[__RET_REPACK9]], align 8
+// CHECK-NEXT: [[__RET_REPACK11:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 24
+// CHECK-NEXT: [[VLD4_LANE_ELT12:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD4_LANE]], 3
+// CHECK-NEXT: store <1 x double> [[VLD4_LANE_ELT12]], ptr [[__RET_REPACK11]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <1 x double>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [4 x <1 x double>] poison, <1 x double> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT13:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK14:%.*]] = load <1 x double>, ptr [[DOTUNPACK_ELT13]], align 8
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [4 x <1 x double>] [[TMP4]], <1 x double> [[DOTUNPACK_UNPACK14]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT15:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK16:%.*]] = load <1 x double>, ptr [[DOTUNPACK_ELT15]], align 8
+// CHECK-NEXT: [[TMP6:%.*]] = insertvalue [4 x <1 x double>] [[TMP5]], <1 x double> [[DOTUNPACK_UNPACK16]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT17:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 24
+// CHECK-NEXT: [[DOTUNPACK_UNPACK18:%.*]] = load <1 x double>, ptr [[DOTUNPACK_ELT17]], align 8
+// CHECK-NEXT: [[DOTUNPACK19:%.*]] = insertvalue [4 x <1 x double>] [[TMP6]], <1 x double> [[DOTUNPACK_UNPACK18]], 3
+// CHECK-NEXT: [[TMP7:%.*]] = insertvalue [[STRUCT_FLOAT64X1X4_T]] poison, [4 x <1 x double>] [[DOTUNPACK19]], 0
+// CHECK-NEXT: ret [[STRUCT_FLOAT64X1X4_T]] [[TMP7]]
+//
float64x1x4_t test_vld4_lane_f64(float64_t *a, float64x1x4_t b) {
return vld4_lane_f64(a, b, 0);
}
-// CHECK-LABEL: define{{.*}} %struct.poly8x8x4_t @test_vld4_lane_p8(ptr noundef %a, [4 x <8 x i8>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x4_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[VLD4_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4lane.v8i8.p0(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i64 7, ptr %a)
-// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP10:%.*]] = load %struct.poly8x8x4_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.poly8x8x4_t [[TMP10]]
+// CHECK-LABEL: define dso_local %struct.poly8x8x4_t @test_vld4_lane_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY8X8X4_T:%.*]], align 8
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY8X8X4_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY8X8X4_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY8X8X4_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 2
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 24
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 3
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[__S1]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 24
+// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: [[VLD4_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4lane.v8i8.p0(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i64 7, ptr [[A]])
+// CHECK-NEXT: [[VLD4_LANE_ELT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], 0
+// CHECK-NEXT: store <8 x i8> [[VLD4_LANE_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD4_LANE_ELT8:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], 1
+// CHECK-NEXT: store <8 x i8> [[VLD4_LANE_ELT8]], ptr [[__RET_REPACK7]], align 8
+// CHECK-NEXT: [[__RET_REPACK9:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_LANE_ELT10:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], 2
+// CHECK-NEXT: store <8 x i8> [[VLD4_LANE_ELT10]], ptr [[__RET_REPACK9]], align 8
+// CHECK-NEXT: [[__RET_REPACK11:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 24
+// CHECK-NEXT: [[VLD4_LANE_ELT12:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], 3
+// CHECK-NEXT: store <8 x i8> [[VLD4_LANE_ELT12]], ptr [[__RET_REPACK11]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <8 x i8>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [4 x <8 x i8>] poison, <8 x i8> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT13:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK14:%.*]] = load <8 x i8>, ptr [[DOTUNPACK_ELT13]], align 8
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [4 x <8 x i8>] [[TMP4]], <8 x i8> [[DOTUNPACK_UNPACK14]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT15:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK16:%.*]] = load <8 x i8>, ptr [[DOTUNPACK_ELT15]], align 8
+// CHECK-NEXT: [[TMP6:%.*]] = insertvalue [4 x <8 x i8>] [[TMP5]], <8 x i8> [[DOTUNPACK_UNPACK16]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT17:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 24
+// CHECK-NEXT: [[DOTUNPACK_UNPACK18:%.*]] = load <8 x i8>, ptr [[DOTUNPACK_ELT17]], align 8
+// CHECK-NEXT: [[DOTUNPACK19:%.*]] = insertvalue [4 x <8 x i8>] [[TMP6]], <8 x i8> [[DOTUNPACK_UNPACK18]], 3
+// CHECK-NEXT: [[TMP7:%.*]] = insertvalue [[STRUCT_POLY8X8X4_T]] poison, [4 x <8 x i8>] [[DOTUNPACK19]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY8X8X4_T]] [[TMP7]]
+//
poly8x8x4_t test_vld4_lane_p8(poly8_t *a, poly8x8x4_t b) {
return vld4_lane_p8(a, b, 7);
}
-// CHECK-LABEL: define{{.*}} %struct.poly16x4x4_t @test_vld4_lane_p16(ptr noundef %a, [4 x <4 x i16>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x4_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.poly16x4x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP10:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
-// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
-// CHECK: [[VLD4_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4lane.v4i16.p0(<4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i64 3, ptr %a)
-// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP19:%.*]] = load %struct.poly16x4x4_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.poly16x4x4_t [[TMP19]]
+// CHECK-LABEL: define dso_local %struct.poly16x4x4_t @test_vld4_lane_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY16X4X4_T:%.*]], align 8
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY16X4X4_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY16X4X4_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY16X4X4_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 2
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 24
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 3
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[__S1]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 24
+// CHECK-NEXT: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: [[VLD4_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4lane.v4i16.p0(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], i64 3, ptr [[A]])
+// CHECK-NEXT: [[VLD4_LANE_ELT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], 0
+// CHECK-NEXT: store <4 x i16> [[VLD4_LANE_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD4_LANE_ELT8:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], 1
+// CHECK-NEXT: store <4 x i16> [[VLD4_LANE_ELT8]], ptr [[__RET_REPACK7]], align 8
+// CHECK-NEXT: [[__RET_REPACK9:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_LANE_ELT10:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], 2
+// CHECK-NEXT: store <4 x i16> [[VLD4_LANE_ELT10]], ptr [[__RET_REPACK9]], align 8
+// CHECK-NEXT: [[__RET_REPACK11:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 24
+// CHECK-NEXT: [[VLD4_LANE_ELT12:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], 3
+// CHECK-NEXT: store <4 x i16> [[VLD4_LANE_ELT12]], ptr [[__RET_REPACK11]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <4 x i16>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [4 x <4 x i16>] poison, <4 x i16> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT13:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK14:%.*]] = load <4 x i16>, ptr [[DOTUNPACK_ELT13]], align 8
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [4 x <4 x i16>] [[TMP4]], <4 x i16> [[DOTUNPACK_UNPACK14]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT15:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK16:%.*]] = load <4 x i16>, ptr [[DOTUNPACK_ELT15]], align 8
+// CHECK-NEXT: [[TMP6:%.*]] = insertvalue [4 x <4 x i16>] [[TMP5]], <4 x i16> [[DOTUNPACK_UNPACK16]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT17:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 24
+// CHECK-NEXT: [[DOTUNPACK_UNPACK18:%.*]] = load <4 x i16>, ptr [[DOTUNPACK_ELT17]], align 8
+// CHECK-NEXT: [[DOTUNPACK19:%.*]] = insertvalue [4 x <4 x i16>] [[TMP6]], <4 x i16> [[DOTUNPACK_UNPACK18]], 3
+// CHECK-NEXT: [[TMP7:%.*]] = insertvalue [[STRUCT_POLY16X4X4_T]] poison, [4 x <4 x i16>] [[DOTUNPACK19]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY16X4X4_T]] [[TMP7]]
+//
poly16x4x4_t test_vld4_lane_p16(poly16_t *a, poly16x4x4_t b) {
return vld4_lane_p16(a, b, 3);
}
-// CHECK-LABEL: define{{.*}} %struct.poly64x1x4_t @test_vld4_lane_p64(ptr noundef %a, [4 x <1 x i64>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x1x4_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.poly64x1x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.poly64x1x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly64x1x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x1x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP10:%.*]] = load <1 x i64>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
-// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
-// CHECK: [[VLD4_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4lane.v1i64.p0(<1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], <1 x i64> [[TMP15]], i64 0, ptr %a)
-// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP19:%.*]] = load %struct.poly64x1x4_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.poly64x1x4_t [[TMP19]]
+// CHECK-LABEL: define dso_local %struct.poly64x1x4_t @test_vld4_lane_p64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY64X1X4_T:%.*]], align 8
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY64X1X4_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY64X1X4_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY64X1X4_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 2
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 24
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 3
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[__S1]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 24
+// CHECK-NEXT: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: [[VLD4_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4lane.v1i64.p0(<1 x i64> [[TMP0]], <1 x i64> [[TMP1]], <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], i64 0, ptr [[A]])
+// CHECK-NEXT: [[VLD4_LANE_ELT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_LANE]], 0
+// CHECK-NEXT: store <1 x i64> [[VLD4_LANE_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD4_LANE_ELT8:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_LANE]], 1
+// CHECK-NEXT: store <1 x i64> [[VLD4_LANE_ELT8]], ptr [[__RET_REPACK7]], align 8
+// CHECK-NEXT: [[__RET_REPACK9:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_LANE_ELT10:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_LANE]], 2
+// CHECK-NEXT: store <1 x i64> [[VLD4_LANE_ELT10]], ptr [[__RET_REPACK9]], align 8
+// CHECK-NEXT: [[__RET_REPACK11:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 24
+// CHECK-NEXT: [[VLD4_LANE_ELT12:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_LANE]], 3
+// CHECK-NEXT: store <1 x i64> [[VLD4_LANE_ELT12]], ptr [[__RET_REPACK11]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <1 x i64>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [4 x <1 x i64>] poison, <1 x i64> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT13:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK14:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT13]], align 8
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [4 x <1 x i64>] [[TMP4]], <1 x i64> [[DOTUNPACK_UNPACK14]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT15:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK16:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT15]], align 8
+// CHECK-NEXT: [[TMP6:%.*]] = insertvalue [4 x <1 x i64>] [[TMP5]], <1 x i64> [[DOTUNPACK_UNPACK16]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT17:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 24
+// CHECK-NEXT: [[DOTUNPACK_UNPACK18:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT17]], align 8
+// CHECK-NEXT: [[DOTUNPACK19:%.*]] = insertvalue [4 x <1 x i64>] [[TMP6]], <1 x i64> [[DOTUNPACK_UNPACK18]], 3
+// CHECK-NEXT: [[TMP7:%.*]] = insertvalue [[STRUCT_POLY64X1X4_T]] poison, [4 x <1 x i64>] [[DOTUNPACK19]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY64X1X4_T]] [[TMP7]]
+//
poly64x1x4_t test_vld4_lane_p64(poly64_t *a, poly64x1x4_t b) {
return vld4_lane_p64(a, b, 0);
}
-// CHECK-LABEL: define{{.*}} void @test_vst1q_lane_u8(ptr noundef %a, <16 x i8> noundef %b) #0 {
-// CHECK: [[TMP0:%.*]] = extractelement <16 x i8> %b, i32 15
-// CHECK: store i8 [[TMP0]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1q_lane_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <16 x i8> [[B]], i64 15
+// CHECK-NEXT: store i8 [[TMP0]], ptr [[A]], align 1
+// CHECK-NEXT: ret void
+//
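+// The vst1q_lane tests below store a single lane of the 128-bit vector to the
+// pointer argument; the generated IR is a plain extractelement plus a store at
+// the element type's natural alignment.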
void test_vst1q_lane_u8(uint8_t *a, uint8x16_t b) {
vst1q_lane_u8(a, b, 15);
}
-// CHECK-LABEL: define{{.*}} void @test_vst1q_lane_u16(ptr noundef %a, <8 x i16> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[TMP3:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7
-// CHECK: store i16 [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1q_lane_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <8 x i16> [[B]], i64 7
+// CHECK-NEXT: store i16 [[TMP0]], ptr [[A]], align 2
+// CHECK-NEXT: ret void
+//
void test_vst1q_lane_u16(uint16_t *a, uint16x8_t b) {
vst1q_lane_u16(a, b, 7);
}
-// CHECK-LABEL: define{{.*}} void @test_vst1q_lane_u32(ptr noundef %a, <4 x i32> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
-// CHECK: store i32 [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1q_lane_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x i32> [[B]], i64 3
+// CHECK-NEXT: store i32 [[TMP0]], ptr [[A]], align 4
+// CHECK-NEXT: ret void
+//
void test_vst1q_lane_u32(uint32_t *a, uint32x4_t b) {
vst1q_lane_u32(a, b, 3);
}
-// CHECK-LABEL: define{{.*}} void @test_vst1q_lane_u64(ptr noundef %a, <2 x i64> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1
-// CHECK: store i64 [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1q_lane_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i64> [[B]], i64 1
+// CHECK-NEXT: store i64 [[TMP0]], ptr [[A]], align 8
+// CHECK-NEXT: ret void
+//
void test_vst1q_lane_u64(uint64_t *a, uint64x2_t b) {
vst1q_lane_u64(a, b, 1);
}
-// CHECK-LABEL: define{{.*}} void @test_vst1q_lane_s8(ptr noundef %a, <16 x i8> noundef %b) #0 {
-// CHECK: [[TMP0:%.*]] = extractelement <16 x i8> %b, i32 15
-// CHECK: store i8 [[TMP0]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1q_lane_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <16 x i8> [[B]], i64 15
+// CHECK-NEXT: store i8 [[TMP0]], ptr [[A]], align 1
+// CHECK-NEXT: ret void
+//
void test_vst1q_lane_s8(int8_t *a, int8x16_t b) {
vst1q_lane_s8(a, b, 15);
}
-// CHECK-LABEL: define{{.*}} void @test_vst1q_lane_s16(ptr noundef %a, <8 x i16> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[TMP3:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7
-// CHECK: store i16 [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1q_lane_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <8 x i16> [[B]], i64 7
+// CHECK-NEXT: store i16 [[TMP0]], ptr [[A]], align 2
+// CHECK-NEXT: ret void
+//
void test_vst1q_lane_s16(int16_t *a, int16x8_t b) {
vst1q_lane_s16(a, b, 7);
}
-// CHECK-LABEL: define{{.*}} void @test_vst1q_lane_s32(ptr noundef %a, <4 x i32> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
-// CHECK: store i32 [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1q_lane_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x i32> [[B]], i64 3
+// CHECK-NEXT: store i32 [[TMP0]], ptr [[A]], align 4
+// CHECK-NEXT: ret void
+//
void test_vst1q_lane_s32(int32_t *a, int32x4_t b) {
vst1q_lane_s32(a, b, 3);
}
-// CHECK-LABEL: define{{.*}} void @test_vst1q_lane_s64(ptr noundef %a, <2 x i64> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1
-// CHECK: store i64 [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1q_lane_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i64> [[B]], i64 1
+// CHECK-NEXT: store i64 [[TMP0]], ptr [[A]], align 8
+// CHECK-NEXT: ret void
+//
void test_vst1q_lane_s64(int64_t *a, int64x2_t b) {
vst1q_lane_s64(a, b, 1);
}
-// CHECK-LABEL: define{{.*}} void @test_vst1q_lane_f16(ptr noundef %a, <8 x half> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
-// CHECK: [[TMP3:%.*]] = extractelement <8 x half> [[TMP2]], i32 7
-// CHECK: store half [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1q_lane_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <8 x half> [[B]], i64 7
+// CHECK-NEXT: store half [[TMP0]], ptr [[A]], align 2
+// CHECK-NEXT: ret void
+//
void test_vst1q_lane_f16(float16_t *a, float16x8_t b) {
vst1q_lane_f16(a, b, 7);
}
-// CHECK-LABEL: define{{.*}} void @test_vst1q_lane_f32(ptr noundef %a, <4 x float> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
-// CHECK: [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
-// CHECK: store float [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1q_lane_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x float> [[B]], i64 3
+// CHECK-NEXT: store float [[TMP0]], ptr [[A]], align 4
+// CHECK-NEXT: ret void
+//
void test_vst1q_lane_f32(float32_t *a, float32x4_t b) {
vst1q_lane_f32(a, b, 3);
}
-// CHECK-LABEL: define{{.*}} void @test_vst1q_lane_f64(ptr noundef %a, <2 x double> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
-// CHECK: [[TMP3:%.*]] = extractelement <2 x double> [[TMP2]], i32 1
-// CHECK: store double [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1q_lane_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x double> [[B]], i64 1
+// CHECK-NEXT: store double [[TMP0]], ptr [[A]], align 8
+// CHECK-NEXT: ret void
+//
void test_vst1q_lane_f64(float64_t *a, float64x2_t b) {
vst1q_lane_f64(a, b, 1);
}
-// CHECK-LABEL: define{{.*}} void @test_vst1q_lane_p8(ptr noundef %a, <16 x i8> noundef %b) #0 {
-// CHECK: [[TMP0:%.*]] = extractelement <16 x i8> %b, i32 15
-// CHECK: store i8 [[TMP0]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1q_lane_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <16 x i8> [[B]], i64 15
+// CHECK-NEXT: store i8 [[TMP0]], ptr [[A]], align 1
+// CHECK-NEXT: ret void
+//
void test_vst1q_lane_p8(poly8_t *a, poly8x16_t b) {
vst1q_lane_p8(a, b, 15);
}
-// CHECK-LABEL: define{{.*}} void @test_vst1q_lane_p16(ptr noundef %a, <8 x i16> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[TMP3:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7
-// CHECK: store i16 [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1q_lane_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <8 x i16> [[B]], i64 7
+// CHECK-NEXT: store i16 [[TMP0]], ptr [[A]], align 2
+// CHECK-NEXT: ret void
+//
void test_vst1q_lane_p16(poly16_t *a, poly16x8_t b) {
vst1q_lane_p16(a, b, 7);
}
-// CHECK-LABEL: define{{.*}} void @test_vst1q_lane_p64(ptr noundef %a, <2 x i64> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1
-// CHECK: store i64 [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1q_lane_p64(
+// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i64> [[B]], i64 1
+// CHECK-NEXT: store i64 [[TMP0]], ptr [[A]], align 8
+// CHECK-NEXT: ret void
+//
void test_vst1q_lane_p64(poly64_t *a, poly64x2_t b) {
vst1q_lane_p64(a, b, 1);
}
-// CHECK-LABEL: define{{.*}} void @test_vst1_lane_u8(ptr noundef %a, <8 x i8> noundef %b) #0 {
-// CHECK: [[TMP0:%.*]] = extractelement <8 x i8> %b, i32 7
-// CHECK: store i8 [[TMP0]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1_lane_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <8 x i8> [[B]], i64 7
+// CHECK-NEXT: store i8 [[TMP0]], ptr [[A]], align 1
+// CHECK-NEXT: ret void
+//
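+// The vst1_lane tests below follow the same extract-and-store pattern for the
+// 64-bit (D-register) vector types.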
void test_vst1_lane_u8(uint8_t *a, uint8x8_t b) {
vst1_lane_u8(a, b, 7);
}
-// CHECK-LABEL: define{{.*}} void @test_vst1_lane_u16(ptr noundef %a, <4 x i16> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
-// CHECK: store i16 [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1_lane_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x i16> [[B]], i64 3
+// CHECK-NEXT: store i16 [[TMP0]], ptr [[A]], align 2
+// CHECK-NEXT: ret void
+//
void test_vst1_lane_u16(uint16_t *a, uint16x4_t b) {
vst1_lane_u16(a, b, 3);
}
-// CHECK-LABEL: define{{.*}} void @test_vst1_lane_u32(ptr noundef %a, <2 x i32> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
-// CHECK: store i32 [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1_lane_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i32> [[B]], i64 1
+// CHECK-NEXT: store i32 [[TMP0]], ptr [[A]], align 4
+// CHECK-NEXT: ret void
+//
void test_vst1_lane_u32(uint32_t *a, uint32x2_t b) {
vst1_lane_u32(a, b, 1);
}
-// CHECK-LABEL: define{{.*}} void @test_vst1_lane_u64(ptr noundef %a, <1 x i64> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[TMP3:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0
-// CHECK: store i64 [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1_lane_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i64> [[B]], i64 0
+// CHECK-NEXT: store i64 [[TMP0]], ptr [[A]], align 8
+// CHECK-NEXT: ret void
+//
void test_vst1_lane_u64(uint64_t *a, uint64x1_t b) {
vst1_lane_u64(a, b, 0);
}
-// CHECK-LABEL: define{{.*}} void @test_vst1_lane_s8(ptr noundef %a, <8 x i8> noundef %b) #0 {
-// CHECK: [[TMP0:%.*]] = extractelement <8 x i8> %b, i32 7
-// CHECK: store i8 [[TMP0]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1_lane_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <8 x i8> [[B]], i64 7
+// CHECK-NEXT: store i8 [[TMP0]], ptr [[A]], align 1
+// CHECK-NEXT: ret void
+//
void test_vst1_lane_s8(int8_t *a, int8x8_t b) {
vst1_lane_s8(a, b, 7);
}
-// CHECK-LABEL: define{{.*}} void @test_vst1_lane_s16(ptr noundef %a, <4 x i16> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
-// CHECK: store i16 [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1_lane_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x i16> [[B]], i64 3
+// CHECK-NEXT: store i16 [[TMP0]], ptr [[A]], align 2
+// CHECK-NEXT: ret void
+//
void test_vst1_lane_s16(int16_t *a, int16x4_t b) {
vst1_lane_s16(a, b, 3);
}
-// CHECK-LABEL: define{{.*}} void @test_vst1_lane_s32(ptr noundef %a, <2 x i32> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
-// CHECK: store i32 [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1_lane_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i32> [[B]], i64 1
+// CHECK-NEXT: store i32 [[TMP0]], ptr [[A]], align 4
+// CHECK-NEXT: ret void
+//
void test_vst1_lane_s32(int32_t *a, int32x2_t b) {
vst1_lane_s32(a, b, 1);
}
-// CHECK-LABEL: define{{.*}} void @test_vst1_lane_s64(ptr noundef %a, <1 x i64> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[TMP3:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0
-// CHECK: store i64 [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1_lane_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i64> [[B]], i64 0
+// CHECK-NEXT: store i64 [[TMP0]], ptr [[A]], align 8
+// CHECK-NEXT: ret void
+//
void test_vst1_lane_s64(int64_t *a, int64x1_t b) {
vst1_lane_s64(a, b, 0);
}
-// CHECK-LABEL: define{{.*}} void @test_vst1_lane_f16(ptr noundef %a, <4 x half> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
-// CHECK: [[TMP3:%.*]] = extractelement <4 x half> [[TMP2]], i32 3
-// CHECK: store half [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1_lane_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x half> [[B]], i64 3
+// CHECK-NEXT: store half [[TMP0]], ptr [[A]], align 2
+// CHECK-NEXT: ret void
+//
void test_vst1_lane_f16(float16_t *a, float16x4_t b) {
vst1_lane_f16(a, b, 3);
}
-// CHECK-LABEL: define{{.*}} void @test_vst1_lane_f32(ptr noundef %a, <2 x float> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
-// CHECK: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
-// CHECK: store float [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1_lane_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x float> [[B]], i64 1
+// CHECK-NEXT: store float [[TMP0]], ptr [[A]], align 4
+// CHECK-NEXT: ret void
+//
void test_vst1_lane_f32(float32_t *a, float32x2_t b) {
vst1_lane_f32(a, b, 1);
}
-// CHECK-LABEL: define{{.*}} void @test_vst1_lane_f64(ptr noundef %a, <1 x double> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
-// CHECK: [[TMP3:%.*]] = extractelement <1 x double> [[TMP2]], i32 0
-// CHECK: store double [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1_lane_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x double> [[B]], i64 0
+// CHECK-NEXT: store double [[TMP0]], ptr [[A]], align 8
+// CHECK-NEXT: ret void
+//
void test_vst1_lane_f64(float64_t *a, float64x1_t b) {
vst1_lane_f64(a, b, 0);
}
-// CHECK-LABEL: define{{.*}} void @test_vst1_lane_p8(ptr noundef %a, <8 x i8> noundef %b) #0 {
-// CHECK: [[TMP0:%.*]] = extractelement <8 x i8> %b, i32 7
-// CHECK: store i8 [[TMP0]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1_lane_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <8 x i8> [[B]], i64 7
+// CHECK-NEXT: store i8 [[TMP0]], ptr [[A]], align 1
+// CHECK-NEXT: ret void
+//
void test_vst1_lane_p8(poly8_t *a, poly8x8_t b) {
vst1_lane_p8(a, b, 7);
}
-// CHECK-LABEL: define{{.*}} void @test_vst1_lane_p16(ptr noundef %a, <4 x i16> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
-// CHECK: store i16 [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1_lane_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x i16> [[B]], i64 3
+// CHECK-NEXT: store i16 [[TMP0]], ptr [[A]], align 2
+// CHECK-NEXT: ret void
+//
void test_vst1_lane_p16(poly16_t *a, poly16x4_t b) {
vst1_lane_p16(a, b, 3);
}
-// CHECK-LABEL: define{{.*}} void @test_vst1_lane_p64(ptr noundef %a, <1 x i64> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[TMP3:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0
-// CHECK: store i64 [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1_lane_p64(
+// CHECK-SAME: ptr noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i64> [[B]], i64 0
+// CHECK-NEXT: store i64 [[TMP0]], ptr [[A]], align 8
+// CHECK-NEXT: ret void
+//
void test_vst1_lane_p64(poly64_t *a, poly64x1_t b) {
vst1_lane_p64(a, b, 0);
}
-// CHECK-LABEL: define{{.*}} void @test_vst2q_lane_u8(ptr noundef %a, [2 x <16 x i8>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.uint8x16x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x16x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x16x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x16x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK: call void @llvm.aarch64.neon.st2lane.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i64 15, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2q_lane_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT8X16X2_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT8X16X2_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <16 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <16 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[__S1]], ptr noundef nonnull align 16 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v16i8.p0(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i64 15, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst2q_lane_u8(uint8_t *a, uint8x16x2_t b) {
vst2q_lane_u8(a, b, 15);
}
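
The vst2q_lane forms exercised from here on store the selected lane of each of the two registers as one interleaved pair; the [2 x <16 x i8>] parameter in the label line is clang's AAPCS64 coercion of the uint8x16x2_t homogeneous vector aggregate. A small sketch of the observable semantics (hypothetical values; assumes an AArch64 target):

#include <arm_neon.h>
#include <assert.h>

int main(void) {
  uint8_t dst[2] = {0, 0};
  uint8x16x2_t v;
  v.val[0] = vdupq_n_u8(0xAA);
  v.val[1] = vdupq_n_u8(0xBB);
  vst2q_lane_u8(dst, v, 15);   /* writes val[0][15], val[1][15] to dst[0], dst[1] */
  assert(dst[0] == 0xAA && dst[1] == 0xBB);
  return 0;
}
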
-// CHECK-LABEL: define{{.*}} void @test_vst2q_lane_u16(ptr noundef %a, [2 x <8 x i16>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK: call void @llvm.aarch64.neon.st2lane.v8i16.p0(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i64 7, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2q_lane_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT16X8X2_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT16X8X2_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <8 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <8 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[__S1]], ptr noundef nonnull align 16 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v8i16.p0(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], i64 7, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst2q_lane_u16(uint16_t *a, uint16x8x2_t b) {
vst2q_lane_u16(a, b, 7);
}
-// CHECK-LABEL: define{{.*}} void @test_vst2q_lane_u32(ptr noundef %a, [2 x <4 x i32>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x4x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
-// CHECK: call void @llvm.aarch64.neon.st2lane.v4i32.p0(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i64 3, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2q_lane_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <4 x i32>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT32X4X2_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT32X4X2_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <4 x i32>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <4 x i32>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[__S1]], ptr noundef nonnull align 16 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v4i32.p0(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], i64 3, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst2q_lane_u32(uint32_t *a, uint32x4x2_t b) {
vst2q_lane_u32(a, b, 3);
}
-// CHECK-LABEL: define{{.*}} void @test_vst2q_lane_u64(ptr noundef %a, [2 x <2 x i64>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.uint64x2x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x2x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
-// CHECK: call void @llvm.aarch64.neon.st2lane.v2i64.p0(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i64 1, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2q_lane_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT64X2X2_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT64X2X2_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <2 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <2 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[__S1]], ptr noundef nonnull align 16 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v2i64.p0(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], i64 1, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst2q_lane_u64(uint64_t *a, uint64x2x2_t b) {
vst2q_lane_u64(a, b, 1);
}
-// CHECK-LABEL: define{{.*}} void @test_vst2q_lane_s8(ptr noundef %a, [2 x <16 x i8>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.int8x16x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x16x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x16x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x16x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK: call void @llvm.aarch64.neon.st2lane.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i64 15, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2q_lane_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT8X16X2_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT8X16X2_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <16 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <16 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[__S1]], ptr noundef nonnull align 16 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v16i8.p0(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i64 15, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst2q_lane_s8(int8_t *a, int8x16x2_t b) {
vst2q_lane_s8(a, b, 15);
}
-// CHECK-LABEL: define{{.*}} void @test_vst2q_lane_s16(ptr noundef %a, [2 x <8 x i16>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.int16x8x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK: call void @llvm.aarch64.neon.st2lane.v8i16.p0(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i64 7, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2q_lane_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT16X8X2_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT16X8X2_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <8 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <8 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[__S1]], ptr noundef nonnull align 16 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v8i16.p0(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], i64 7, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst2q_lane_s16(int16_t *a, int16x8x2_t b) {
vst2q_lane_s16(a, b, 7);
}
-// CHECK-LABEL: define{{.*}} void @test_vst2q_lane_s32(ptr noundef %a, [2 x <4 x i32>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.int32x4x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x4x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
-// CHECK: call void @llvm.aarch64.neon.st2lane.v4i32.p0(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i64 3, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2q_lane_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <4 x i32>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT32X4X2_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT32X4X2_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <4 x i32>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <4 x i32>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[__S1]], ptr noundef nonnull align 16 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v4i32.p0(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], i64 3, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst2q_lane_s32(int32_t *a, int32x4x2_t b) {
vst2q_lane_s32(a, b, 3);
}
-// CHECK-LABEL: define{{.*}} void @test_vst2q_lane_s64(ptr noundef %a, [2 x <2 x i64>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.int64x2x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int64x2x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x2x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
-// CHECK: call void @llvm.aarch64.neon.st2lane.v2i64.p0(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i64 1, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2q_lane_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT64X2X2_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT64X2X2_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <2 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <2 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[__S1]], ptr noundef nonnull align 16 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v2i64.p0(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], i64 1, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst2q_lane_s64(int64_t *a, int64x2x2_t b) {
vst2q_lane_s64(a, b, 1);
}
-// CHECK-LABEL: define{{.*}} void @test_vst2q_lane_f16(ptr noundef %a, [2 x <8 x half>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.float16x8x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <8 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x half>
-// CHECK: call void @llvm.aarch64.neon.st2lane.v8f16.p0(<8 x half> [[TMP7]], <8 x half> [[TMP8]], i64 7, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2q_lane_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <8 x half>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT16X8X2_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X8X2_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <8 x half>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x half> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <8 x half>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x half> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[__S1]], ptr noundef nonnull align 16 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x half>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v8f16.p0(<8 x half> [[TMP0]], <8 x half> [[TMP1]], i64 7, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst2q_lane_f16(float16_t *a, float16x8x2_t b) {
vst2q_lane_f16(a, b, 7);
}
-// CHECK-LABEL: define{{.*}} void @test_vst2q_lane_f32(ptr noundef %a, [2 x <4 x float>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.float32x4x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x4x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <4 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
-// CHECK: call void @llvm.aarch64.neon.st2lane.v4f32.p0(<4 x float> [[TMP7]], <4 x float> [[TMP8]], i64 3, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2q_lane_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <4 x float>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT32X4X2_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X4X2_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <4 x float>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x float> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <4 x float>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x float> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[__S1]], ptr noundef nonnull align 16 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v4f32.p0(<4 x float> [[TMP0]], <4 x float> [[TMP1]], i64 3, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst2q_lane_f32(float32_t *a, float32x4x2_t b) {
vst2q_lane_f32(a, b, 3);
}
-// CHECK-LABEL: define{{.*}} void @test_vst2q_lane_f64(ptr noundef %a, [2 x <2 x double>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.float64x2x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float64x2x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x2x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <2 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x double>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x double>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x double>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <2 x double> [[TMP5]] to <16 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double>
-// CHECK: call void @llvm.aarch64.neon.st2lane.v2f64.p0(<2 x double> [[TMP7]], <2 x double> [[TMP8]], i64 1, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2q_lane_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <2 x double>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT64X2X2_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT64X2X2_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <2 x double>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x double> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <2 x double>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x double> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[__S1]], ptr noundef nonnull align 16 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v2f64.p0(<2 x double> [[TMP0]], <2 x double> [[TMP1]], i64 1, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst2q_lane_f64(float64_t *a, float64x2x2_t b) {
vst2q_lane_f64(a, b, 1);
}
-// CHECK-LABEL: define{{.*}} void @test_vst2q_lane_p8(ptr noundef %a, [2 x <16 x i8>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.poly8x16x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x16x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x16x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x16x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK: call void @llvm.aarch64.neon.st2lane.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i64 15, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2q_lane_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY8X16X2_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY8X16X2_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <16 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <16 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[__S1]], ptr noundef nonnull align 16 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v16i8.p0(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i64 15, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst2q_lane_p8(poly8_t *a, poly8x16x2_t b) {
vst2q_lane_p8(a, b, 15);
}
-// CHECK-LABEL: define{{.*}} void @test_vst2q_lane_p16(ptr noundef %a, [2 x <8 x i16>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.poly16x8x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK: call void @llvm.aarch64.neon.st2lane.v8i16.p0(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i64 7, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2q_lane_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY16X8X2_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY16X8X2_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <8 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <8 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[__S1]], ptr noundef nonnull align 16 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v8i16.p0(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], i64 7, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst2q_lane_p16(poly16_t *a, poly16x8x2_t b) {
vst2q_lane_p16(a, b, 7);
}
-// CHECK-LABEL: define{{.*}} void @test_vst2q_lane_p64(ptr noundef %a, [2 x <2 x i64>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.poly64x2x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly64x2x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x2x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly64x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
-// CHECK: call void @llvm.aarch64.neon.st2lane.v2i64.p0(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i64 1, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2q_lane_p64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY64X2X2_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY64X2X2_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <2 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <2 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[__S1]], ptr noundef nonnull align 16 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v2i64.p0(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], i64 1, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst2q_lane_p64(poly64_t *a, poly64x2x2_t b) {
vst2q_lane_p64(a, b, 1);
}
-// CHECK-LABEL: define{{.*}} void @test_vst2_lane_u8(ptr noundef %a, [2 x <8 x i8>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: call void @llvm.aarch64.neon.st2lane.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i64 7, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2_lane_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT8X8X2_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT8X8X2_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <8 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <8 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[__S1]], ptr noundef nonnull align 8 dereferenceable(16) [[B]], i64 16, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v8i8.p0(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], i64 7, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst2_lane_u8(uint8_t *a, uint8x8x2_t b) {
vst2_lane_u8(a, b, 7);
}
-// CHECK-LABEL: define{{.*}} void @test_vst2_lane_u16(ptr noundef %a, [2 x <4 x i16>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.uint16x4x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x4x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
-// CHECK: call void @llvm.aarch64.neon.st2lane.v4i16.p0(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i64 3, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2_lane_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT16X4X2_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT16X4X2_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <4 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <4 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[__S1]], ptr noundef nonnull align 8 dereferenceable(16) [[B]], i64 16, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v4i16.p0(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], i64 3, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst2_lane_u16(uint16_t *a, uint16x4x2_t b) {
vst2_lane_u16(a, b, 3);
}
-// CHECK-LABEL: define{{.*}} void @test_vst2_lane_u32(ptr noundef %a, [2 x <2 x i32>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.uint32x2x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x2x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
-// CHECK: call void @llvm.aarch64.neon.st2lane.v2i32.p0(<2 x i32> [[TMP7]], <2 x i32> [[TMP8]], i64 1, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2_lane_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <2 x i32>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT32X2X2_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT32X2X2_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <2 x i32>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <2 x i32>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[__S1]], ptr noundef nonnull align 8 dereferenceable(16) [[B]], i64 16, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v2i32.p0(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]], i64 1, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst2_lane_u32(uint32_t *a, uint32x2x2_t b) {
vst2_lane_u32(a, b, 1);
}
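
The non-q (64-bit vector) forms follow the same pattern on 16-byte structs, passed as e.g. [2 x <2 x i32>] with alignstack(8). A sketch with distinguishable lane values so the interleaving is visible (hypothetical data; assumes arm_neon.h on AArch64):

#include <arm_neon.h>
#include <assert.h>

int main(void) {
  uint32_t src0[2] = {10u, 11u};
  uint32_t src1[2] = {20u, 21u};
  uint32_t dst[2]  = {0u, 0u};
  uint32x2x2_t v;
  v.val[0] = vld1_u32(src0);   /* {10, 11} */
  v.val[1] = vld1_u32(src1);   /* {20, 21} */
  vst2_lane_u32(dst, v, 1);    /* interleaves lane 1 of each: {11, 21} */
  assert(dst[0] == 11u && dst[1] == 21u);
  return 0;
}
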
-// CHECK-LABEL: define{{.*}} void @test_vst2_lane_u64(ptr noundef %a, [2 x <1 x i64>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.uint64x1x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x1x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x1x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x1x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
-// CHECK: call void @llvm.aarch64.neon.st2lane.v1i64.p0(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], i64 0, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2_lane_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT64X1X2_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT64X1X2_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <1 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <1 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[__S1]], ptr noundef nonnull align 8 dereferenceable(16) [[B]], i64 16, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v1i64.p0(<1 x i64> [[TMP0]], <1 x i64> [[TMP1]], i64 0, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst2_lane_u64(uint64_t *a, uint64x1x2_t b) {
vst2_lane_u64(a, b, 0);
}
-// CHECK-LABEL: define{{.*}} void @test_vst2_lane_s8(ptr noundef %a, [2 x <8 x i8>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: call void @llvm.aarch64.neon.st2lane.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i64 7, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2_lane_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT8X8X2_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT8X8X2_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <8 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <8 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[__S1]], ptr noundef nonnull align 8 dereferenceable(16) [[B]], i64 16, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v8i8.p0(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], i64 7, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst2_lane_s8(int8_t *a, int8x8x2_t b) {
vst2_lane_s8(a, b, 7);
}
-// CHECK-LABEL: define{{.*}} void @test_vst2_lane_s16(ptr noundef %a, [2 x <4 x i16>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.int16x4x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x4x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
-// CHECK: call void @llvm.aarch64.neon.st2lane.v4i16.p0(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i64 3, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2_lane_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT16X4X2_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT16X4X2_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <4 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <4 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[__S1]], ptr noundef nonnull align 8 dereferenceable(16) [[B]], i64 16, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v4i16.p0(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], i64 3, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst2_lane_s16(int16_t *a, int16x4x2_t b) {
vst2_lane_s16(a, b, 3);
}
-// CHECK-LABEL: define{{.*}} void @test_vst2_lane_s32(ptr noundef %a, [2 x <2 x i32>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.int32x2x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x2x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
-// CHECK: call void @llvm.aarch64.neon.st2lane.v2i32.p0(<2 x i32> [[TMP7]], <2 x i32> [[TMP8]], i64 1, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2_lane_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <2 x i32>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT32X2X2_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT32X2X2_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <2 x i32>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <2 x i32>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[__S1]], ptr noundef nonnull align 8 dereferenceable(16) [[B]], i64 16, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v2i32.p0(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]], i64 1, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst2_lane_s32(int32_t *a, int32x2x2_t b) {
vst2_lane_s32(a, b, 1);
}
-// CHECK-LABEL: define{{.*}} void @test_vst2_lane_s64(ptr noundef %a, [2 x <1 x i64>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.int64x1x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x1x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x1x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x1x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
-// CHECK: call void @llvm.aarch64.neon.st2lane.v1i64.p0(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], i64 0, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2_lane_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT64X1X2_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT64X1X2_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <1 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <1 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[__S1]], ptr noundef nonnull align 8 dereferenceable(16) [[B]], i64 16, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v1i64.p0(<1 x i64> [[TMP0]], <1 x i64> [[TMP1]], i64 0, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst2_lane_s64(int64_t *a, int64x1x2_t b) {
vst2_lane_s64(a, b, 0);
}
-// CHECK-LABEL: define{{.*}} void @test_vst2_lane_f16(ptr noundef %a, [2 x <4 x half>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.float16x4x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x4x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <4 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x half>
-// CHECK: call void @llvm.aarch64.neon.st2lane.v4f16.p0(<4 x half> [[TMP7]], <4 x half> [[TMP8]], i64 3, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2_lane_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <4 x half>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT16X4X2_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X4X2_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <4 x half>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x half> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <4 x half>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x half> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[__S1]], ptr noundef nonnull align 8 dereferenceable(16) [[B]], i64 16, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x half>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v4f16.p0(<4 x half> [[TMP0]], <4 x half> [[TMP1]], i64 3, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst2_lane_f16(float16_t *a, float16x4x2_t b) {
vst2_lane_f16(a, b, 3);
}
-// CHECK-LABEL: define{{.*}} void @test_vst2_lane_f32(ptr noundef %a, [2 x <2 x float>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.float32x2x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x2x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <2 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x float>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
-// CHECK: call void @llvm.aarch64.neon.st2lane.v2f32.p0(<2 x float> [[TMP7]], <2 x float> [[TMP8]], i64 1, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2_lane_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <2 x float>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT32X2X2_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X2X2_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <2 x float>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x float> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <2 x float>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x float> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[__S1]], ptr noundef nonnull align 8 dereferenceable(16) [[B]], i64 16, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v2f32.p0(<2 x float> [[TMP0]], <2 x float> [[TMP1]], i64 1, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst2_lane_f32(float32_t *a, float32x2x2_t b) {
vst2_lane_f32(a, b, 1);
}
-// CHECK-LABEL: define{{.*}} void @test_vst2_lane_f64(ptr noundef %a, [2 x <1 x double>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.float64x1x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float64x1x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x1x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <1 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x1x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x double>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <1 x double>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <1 x double> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x1x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x double>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <1 x double>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <1 x double> [[TMP5]] to <8 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double>
-// CHECK: call void @llvm.aarch64.neon.st2lane.v1f64.p0(<1 x double> [[TMP7]], <1 x double> [[TMP8]], i64 0, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2_lane_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <1 x double>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT64X1X2_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT64X1X2_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <1 x double>] [[B_COERCE]], 0
+// CHECK-NEXT: store <1 x double> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <1 x double>] [[B_COERCE]], 1
+// CHECK-NEXT: store <1 x double> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[__S1]], ptr noundef nonnull align 8 dereferenceable(16) [[B]], i64 16, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x double>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <1 x double>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v1f64.p0(<1 x double> [[TMP0]], <1 x double> [[TMP1]], i64 0, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst2_lane_f64(float64_t *a, float64x1x2_t b) {
vst2_lane_f64(a, b, 0);
}
-// CHECK-LABEL: define{{.*}} void @test_vst2_lane_p8(ptr noundef %a, [2 x <8 x i8>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: call void @llvm.aarch64.neon.st2lane.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i64 7, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2_lane_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY8X8X2_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY8X8X2_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <8 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <8 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[__S1]], ptr noundef nonnull align 8 dereferenceable(16) [[B]], i64 16, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v8i8.p0(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], i64 7, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst2_lane_p8(poly8_t *a, poly8x8x2_t b) {
vst2_lane_p8(a, b, 7);
}
-// CHECK-LABEL: define{{.*}} void @test_vst2_lane_p16(ptr noundef %a, [2 x <4 x i16>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.poly16x4x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x4x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
-// CHECK: call void @llvm.aarch64.neon.st2lane.v4i16.p0(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i64 3, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2_lane_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY16X4X2_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY16X4X2_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <4 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <4 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[__S1]], ptr noundef nonnull align 8 dereferenceable(16) [[B]], i64 16, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v4i16.p0(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], i64 3, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst2_lane_p16(poly16_t *a, poly16x4x2_t b) {
vst2_lane_p16(a, b, 3);
}
-// CHECK-LABEL: define{{.*}} void @test_vst2_lane_p64(ptr noundef %a, [2 x <1 x i64>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.poly64x1x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly64x1x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x1x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly64x1x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x1x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
-// CHECK: call void @llvm.aarch64.neon.st2lane.v1i64.p0(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], i64 0, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2_lane_p64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY64X1X2_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY64X1X2_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <1 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <1 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[__S1]], ptr noundef nonnull align 8 dereferenceable(16) [[B]], i64 16, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v1i64.p0(<1 x i64> [[TMP0]], <1 x i64> [[TMP1]], i64 0, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst2_lane_p64(poly64_t *a, poly64x1x2_t b) {
vst2_lane_p64(a, b, 0);
}
-// CHECK-LABEL: define{{.*}} void @test_vst3q_lane_u8(ptr noundef %a, [3 x <16 x i8>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.uint8x16x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x16x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
-// CHECK: call void @llvm.aarch64.neon.st3lane.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i64 15, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3q_lane_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT8X16X3_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT8X16X3_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <16 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <16 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <16 x i8>] [[B_COERCE]], 2
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[__S1]], ptr noundef nonnull align 16 dereferenceable(48) [[B]], i64 48, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v16i8.p0(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], i64 15, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst3q_lane_u8(uint8_t *a, uint8x16x3_t b) {
vst3q_lane_u8(a, b, 15);
}
-// CHECK-LABEL: define{{.*}} void @test_vst3q_lane_u16(ptr noundef %a, [3 x <8 x i16>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.uint16x8x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
-// CHECK: call void @llvm.aarch64.neon.st3lane.v8i16.p0(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i64 7, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3q_lane_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT16X8X3_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT16X8X3_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <8 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <8 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <8 x i16>] [[B_COERCE]], 2
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[__S1]], ptr noundef nonnull align 16 dereferenceable(48) [[B]], i64 48, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v8i16.p0(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], i64 7, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst3q_lane_u16(uint16_t *a, uint16x8x3_t b) {
vst3q_lane_u16(a, b, 7);
}
-// CHECK-LABEL: define{{.*}} void @test_vst3q_lane_u32(ptr noundef %a, [3 x <4 x i32>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.uint32x4x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
-// CHECK: call void @llvm.aarch64.neon.st3lane.v4i32.p0(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], i64 3, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3q_lane_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <4 x i32>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT32X4X3_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT32X4X3_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <4 x i32>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <4 x i32>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <4 x i32>] [[B_COERCE]], 2
+// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[__S1]], ptr noundef nonnull align 16 dereferenceable(48) [[B]], i64 48, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v4i32.p0(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], i64 3, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst3q_lane_u32(uint32_t *a, uint32x4x3_t b) {
vst3q_lane_u32(a, b, 3);
}
-// CHECK-LABEL: define{{.*}} void @test_vst3q_lane_u64(ptr noundef %a, [3 x <2 x i64>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.uint64x2x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x2x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
-// CHECK: call void @llvm.aarch64.neon.st3lane.v2i64.p0(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i64 1, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3q_lane_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT64X2X3_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT64X2X3_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <2 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <2 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <2 x i64>] [[B_COERCE]], 2
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[__S1]], ptr noundef nonnull align 16 dereferenceable(48) [[B]], i64 48, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v2i64.p0(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], i64 1, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst3q_lane_u64(uint64_t *a, uint64x2x3_t b) {
vst3q_lane_u64(a, b, 1);
}
-// CHECK-LABEL: define{{.*}} void @test_vst3q_lane_s8(ptr noundef %a, [3 x <16 x i8>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.int8x16x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x16x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
-// CHECK: call void @llvm.aarch64.neon.st3lane.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i64 15, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3q_lane_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT8X16X3_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT8X16X3_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <16 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <16 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <16 x i8>] [[B_COERCE]], 2
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[__S1]], ptr noundef nonnull align 16 dereferenceable(48) [[B]], i64 48, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v16i8.p0(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], i64 15, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst3q_lane_s8(int8_t *a, int8x16x3_t b) {
vst3q_lane_s8(a, b, 15);
}
-// CHECK-LABEL: define{{.*}} void @test_vst3q_lane_s16(ptr noundef %a, [3 x <8 x i16>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.int16x8x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
-// CHECK: call void @llvm.aarch64.neon.st3lane.v8i16.p0(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i64 7, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3q_lane_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT16X8X3_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT16X8X3_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <8 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <8 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <8 x i16>] [[B_COERCE]], 2
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[__S1]], ptr noundef nonnull align 16 dereferenceable(48) [[B]], i64 48, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v8i16.p0(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], i64 7, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst3q_lane_s16(int16_t *a, int16x8x3_t b) {
vst3q_lane_s16(a, b, 7);
}
-// CHECK-LABEL: define{{.*}} void @test_vst3q_lane_s32(ptr noundef %a, [3 x <4 x i32>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.int32x4x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
-// CHECK: call void @llvm.aarch64.neon.st3lane.v4i32.p0(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], i64 3, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3q_lane_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <4 x i32>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT32X4X3_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT32X4X3_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <4 x i32>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <4 x i32>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <4 x i32>] [[B_COERCE]], 2
+// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[__S1]], ptr noundef nonnull align 16 dereferenceable(48) [[B]], i64 48, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v4i32.p0(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], i64 3, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst3q_lane_s32(int32_t *a, int32x4x3_t b) {
vst3q_lane_s32(a, b, 3);
}
-// CHECK-LABEL: define{{.*}} void @test_vst3q_lane_s64(ptr noundef %a, [3 x <2 x i64>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.int64x2x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int64x2x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x2x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
-// CHECK: call void @llvm.aarch64.neon.st3lane.v2i64.p0(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i64 1, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3q_lane_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT64X2X3_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT64X2X3_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <2 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <2 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <2 x i64>] [[B_COERCE]], 2
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[__S1]], ptr noundef nonnull align 16 dereferenceable(48) [[B]], i64 48, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v2i64.p0(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], i64 1, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst3q_lane_s64(int64_t *a, int64x2x3_t b) {
vst3q_lane_s64(a, b, 1);
}
-// CHECK-LABEL: define{{.*}} void @test_vst3q_lane_f16(ptr noundef %a, [3 x <8 x half>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.float16x8x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <8 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <8 x half>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x half>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x half>
-// CHECK: call void @llvm.aarch64.neon.st3lane.v8f16.p0(<8 x half> [[TMP9]], <8 x half> [[TMP10]], <8 x half> [[TMP11]], i64 7, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3q_lane_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <8 x half>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT16X8X3_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X8X3_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <8 x half>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x half> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <8 x half>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x half> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <8 x half>] [[B_COERCE]], 2
+// CHECK-NEXT: store <8 x half> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[__S1]], ptr noundef nonnull align 16 dereferenceable(48) [[B]], i64 48, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x half>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x half>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v8f16.p0(<8 x half> [[TMP0]], <8 x half> [[TMP1]], <8 x half> [[TMP2]], i64 7, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst3q_lane_f16(float16_t *a, float16x8x3_t b) {
vst3q_lane_f16(a, b, 7);
}
-// CHECK-LABEL: define{{.*}} void @test_vst3q_lane_f32(ptr noundef %a, [3 x <4 x float>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.float32x4x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <4 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <4 x float>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
-// CHECK: call void @llvm.aarch64.neon.st3lane.v4f32.p0(<4 x float> [[TMP9]], <4 x float> [[TMP10]], <4 x float> [[TMP11]], i64 3, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3q_lane_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <4 x float>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT32X4X3_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X4X3_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <4 x float>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x float> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <4 x float>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x float> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <4 x float>] [[B_COERCE]], 2
+// CHECK-NEXT: store <4 x float> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[__S1]], ptr noundef nonnull align 16 dereferenceable(48) [[B]], i64 48, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v4f32.p0(<4 x float> [[TMP0]], <4 x float> [[TMP1]], <4 x float> [[TMP2]], i64 3, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst3q_lane_f32(float32_t *a, float32x4x3_t b) {
vst3q_lane_f32(a, b, 3);
}
-// CHECK-LABEL: define{{.*}} void @test_vst3q_lane_f64(ptr noundef %a, [3 x <2 x double>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.float64x2x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float64x2x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x2x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <2 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x double>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x double>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x double>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <2 x double> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x double>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <2 x double>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <2 x double> [[TMP7]] to <16 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x double>
-// CHECK: call void @llvm.aarch64.neon.st3lane.v2f64.p0(<2 x double> [[TMP9]], <2 x double> [[TMP10]], <2 x double> [[TMP11]], i64 1, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3q_lane_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <2 x double>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT64X2X3_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT64X2X3_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <2 x double>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x double> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <2 x double>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x double> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <2 x double>] [[B_COERCE]], 2
+// CHECK-NEXT: store <2 x double> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[__S1]], ptr noundef nonnull align 16 dereferenceable(48) [[B]], i64 48, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v2f64.p0(<2 x double> [[TMP0]], <2 x double> [[TMP1]], <2 x double> [[TMP2]], i64 1, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst3q_lane_f64(float64_t *a, float64x2x3_t b) {
vst3q_lane_f64(a, b, 1);
}
-// CHECK-LABEL: define{{.*}} void @test_vst3q_lane_p8(ptr noundef %a, [3 x <16 x i8>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.poly8x16x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x16x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
-// CHECK: call void @llvm.aarch64.neon.st3lane.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i64 15, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3q_lane_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY8X16X3_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY8X16X3_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <16 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <16 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <16 x i8>] [[B_COERCE]], 2
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[__S1]], ptr noundef nonnull align 16 dereferenceable(48) [[B]], i64 48, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v16i8.p0(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], i64 15, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst3q_lane_p8(poly8_t *a, poly8x16x3_t b) {
vst3q_lane_p8(a, b, 15);
}
-// CHECK-LABEL: define{{.*}} void @test_vst3q_lane_p16(ptr noundef %a, [3 x <8 x i16>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.poly16x8x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
-// CHECK: call void @llvm.aarch64.neon.st3lane.v8i16.p0(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i64 7, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3q_lane_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY16X8X3_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY16X8X3_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <8 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <8 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <8 x i16>] [[B_COERCE]], 2
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[__S1]], ptr noundef nonnull align 16 dereferenceable(48) [[B]], i64 48, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v8i16.p0(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], i64 7, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst3q_lane_p16(poly16_t *a, poly16x8x3_t b) {
vst3q_lane_p16(a, b, 7);
}
-// CHECK-LABEL: define{{.*}} void @test_vst3q_lane_p64(ptr noundef %a, [3 x <2 x i64>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.poly64x2x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly64x2x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x2x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
-// CHECK: call void @llvm.aarch64.neon.st3lane.v2i64.p0(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i64 1, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3q_lane_p64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY64X2X3_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY64X2X3_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <2 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <2 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <2 x i64>] [[B_COERCE]], 2
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[__S1]], ptr noundef nonnull align 16 dereferenceable(48) [[B]], i64 48, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v2i64.p0(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], i64 1, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst3q_lane_p64(poly64_t *a, poly64x2x3_t b) {
vst3q_lane_p64(a, b, 1);
}
-// CHECK-LABEL: define{{.*}} void @test_vst3_lane_u8(ptr noundef %a, [3 x <8 x i8>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK: call void @llvm.aarch64.neon.st3lane.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3_lane_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT8X8X3_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT8X8X3_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 2
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[__S1]], ptr noundef nonnull align 8 dereferenceable(24) [[B]], i64 24, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v8i8.p0(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], i64 7, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst3_lane_u8(uint8_t *a, uint8x8x3_t b) {
vst3_lane_u8(a, b, 7);
}
-// CHECK-LABEL: define{{.*}} void @test_vst3_lane_u16(ptr noundef %a, [3 x <4 x i16>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.uint16x4x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
-// CHECK: call void @llvm.aarch64.neon.st3lane.v4i16.p0(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i64 3, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3_lane_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT16X4X3_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT16X4X3_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <4 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <4 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <4 x i16>] [[B_COERCE]], 2
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[__S1]], ptr noundef nonnull align 8 dereferenceable(24) [[B]], i64 24, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v4i16.p0(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]], i64 3, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst3_lane_u16(uint16_t *a, uint16x4x3_t b) {
vst3_lane_u16(a, b, 3);
}
-// CHECK-LABEL: define{{.*}} void @test_vst3_lane_u32(ptr noundef %a, [3 x <2 x i32>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.uint32x2x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
-// CHECK: call void @llvm.aarch64.neon.st3lane.v2i32.p0(<2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], i64 1, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3_lane_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <2 x i32>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT32X2X3_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT32X2X3_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <2 x i32>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <2 x i32>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <2 x i32>] [[B_COERCE]], 2
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[__S1]], ptr noundef nonnull align 8 dereferenceable(24) [[B]], i64 24, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v2i32.p0(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], i64 1, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst3_lane_u32(uint32_t *a, uint32x2x3_t b) {
vst3_lane_u32(a, b, 1);
}
-// CHECK-LABEL: define{{.*}} void @test_vst3_lane_u64(ptr noundef %a, [3 x <1 x i64>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.uint64x1x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x1x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x1x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x1x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint64x1x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
-// CHECK: call void @llvm.aarch64.neon.st3lane.v1i64.p0(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], i64 0, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3_lane_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT64X1X3_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT64X1X3_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <1 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <1 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <1 x i64>] [[B_COERCE]], 2
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[__S1]], ptr noundef nonnull align 8 dereferenceable(24) [[B]], i64 24, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v1i64.p0(<1 x i64> [[TMP0]], <1 x i64> [[TMP1]], <1 x i64> [[TMP2]], i64 0, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst3_lane_u64(uint64_t *a, uint64x1x3_t b) {
vst3_lane_u64(a, b, 0);
}
-// CHECK-LABEL: define{{.*}} void @test_vst3_lane_s8(ptr noundef %a, [3 x <8 x i8>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.int8x8x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK: call void @llvm.aarch64.neon.st3lane.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3_lane_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT8X8X3_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT8X8X3_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 2
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[__S1]], ptr noundef nonnull align 8 dereferenceable(24) [[B]], i64 24, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v8i8.p0(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], i64 7, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst3_lane_s8(int8_t *a, int8x8x3_t b) {
vst3_lane_s8(a, b, 7);
}
-// CHECK-LABEL: define{{.*}} void @test_vst3_lane_s16(ptr noundef %a, [3 x <4 x i16>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.int16x4x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
-// CHECK: call void @llvm.aarch64.neon.st3lane.v4i16.p0(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i64 3, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3_lane_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT16X4X3_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT16X4X3_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <4 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <4 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <4 x i16>] [[B_COERCE]], 2
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[__S1]], ptr noundef nonnull align 8 dereferenceable(24) [[B]], i64 24, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v4i16.p0(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]], i64 3, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst3_lane_s16(int16_t *a, int16x4x3_t b) {
vst3_lane_s16(a, b, 3);
}
-// CHECK-LABEL: define{{.*}} void @test_vst3_lane_s32(ptr noundef %a, [3 x <2 x i32>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.int32x2x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
-// CHECK: call void @llvm.aarch64.neon.st3lane.v2i32.p0(<2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], i64 1, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3_lane_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <2 x i32>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT32X2X3_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT32X2X3_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <2 x i32>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <2 x i32>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <2 x i32>] [[B_COERCE]], 2
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[__S1]], ptr noundef nonnull align 8 dereferenceable(24) [[B]], i64 24, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v2i32.p0(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], i64 1, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst3_lane_s32(int32_t *a, int32x2x3_t b) {
vst3_lane_s32(a, b, 1);
}
-// CHECK-LABEL: define{{.*}} void @test_vst3_lane_s64(ptr noundef %a, [3 x <1 x i64>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.int64x1x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x1x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x1x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x1x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int64x1x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
-// CHECK: call void @llvm.aarch64.neon.st3lane.v1i64.p0(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], i64 0, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3_lane_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT64X1X3_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT64X1X3_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <1 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <1 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <1 x i64>] [[B_COERCE]], 2
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[__S1]], ptr noundef nonnull align 8 dereferenceable(24) [[B]], i64 24, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v1i64.p0(<1 x i64> [[TMP0]], <1 x i64> [[TMP1]], <1 x i64> [[TMP2]], i64 0, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst3_lane_s64(int64_t *a, int64x1x3_t b) {
vst3_lane_s64(a, b, 0);
}
-// CHECK-LABEL: define{{.*}} void @test_vst3_lane_f16(ptr noundef %a, [3 x <4 x half>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.float16x4x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <4 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <4 x half>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x half>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x half>
-// CHECK: call void @llvm.aarch64.neon.st3lane.v4f16.p0(<4 x half> [[TMP9]], <4 x half> [[TMP10]], <4 x half> [[TMP11]], i64 3, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3_lane_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <4 x half>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT16X4X3_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X4X3_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <4 x half>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x half> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <4 x half>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x half> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <4 x half>] [[B_COERCE]], 2
+// CHECK-NEXT: store <4 x half> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[__S1]], ptr noundef nonnull align 8 dereferenceable(24) [[B]], i64 24, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x half>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x half>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v4f16.p0(<4 x half> [[TMP0]], <4 x half> [[TMP1]], <4 x half> [[TMP2]], i64 3, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst3_lane_f16(float16_t *a, float16x4x3_t b) {
vst3_lane_f16(a, b, 3);
}
-// CHECK-LABEL: define{{.*}} void @test_vst3_lane_f32(ptr noundef %a, [3 x <2 x float>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.float32x2x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <2 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float>
-// CHECK: call void @llvm.aarch64.neon.st3lane.v2f32.p0(<2 x float> [[TMP9]], <2 x float> [[TMP10]], <2 x float> [[TMP11]], i64 1, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3_lane_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <2 x float>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT32X2X3_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X2X3_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <2 x float>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x float> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <2 x float>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x float> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <2 x float>] [[B_COERCE]], 2
+// CHECK-NEXT: store <2 x float> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[__S1]], ptr noundef nonnull align 8 dereferenceable(24) [[B]], i64 24, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v2f32.p0(<2 x float> [[TMP0]], <2 x float> [[TMP1]], <2 x float> [[TMP2]], i64 1, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst3_lane_f32(float32_t *a, float32x2x3_t b) {
vst3_lane_f32(a, b, 1);
}
-// CHECK-LABEL: define{{.*}} void @test_vst3_lane_f64(ptr noundef %a, [3 x <1 x double>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.float64x1x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float64x1x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x1x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <1 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x1x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x double>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <1 x double>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <1 x double> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x1x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x double>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <1 x double>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <1 x double> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float64x1x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x double>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <1 x double>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <1 x double> [[TMP7]] to <8 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x double>
-// CHECK: call void @llvm.aarch64.neon.st3lane.v1f64.p0(<1 x double> [[TMP9]], <1 x double> [[TMP10]], <1 x double> [[TMP11]], i64 0, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3_lane_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <1 x double>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT64X1X3_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT64X1X3_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <1 x double>] [[B_COERCE]], 0
+// CHECK-NEXT: store <1 x double> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <1 x double>] [[B_COERCE]], 1
+// CHECK-NEXT: store <1 x double> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <1 x double>] [[B_COERCE]], 2
+// CHECK-NEXT: store <1 x double> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[__S1]], ptr noundef nonnull align 8 dereferenceable(24) [[B]], i64 24, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x double>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <1 x double>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <1 x double>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v1f64.p0(<1 x double> [[TMP0]], <1 x double> [[TMP1]], <1 x double> [[TMP2]], i64 0, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst3_lane_f64(float64_t *a, float64x1x3_t b) {
vst3_lane_f64(a, b, 0);
}
-// CHECK-LABEL: define{{.*}} void @test_vst3_lane_p8(ptr noundef %a, [3 x <8 x i8>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK: call void @llvm.aarch64.neon.st3lane.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3_lane_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY8X8X3_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY8X8X3_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 2
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[__S1]], ptr noundef nonnull align 8 dereferenceable(24) [[B]], i64 24, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v8i8.p0(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], i64 7, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst3_lane_p8(poly8_t *a, poly8x8x3_t b) {
vst3_lane_p8(a, b, 7);
}
-// CHECK-LABEL: define{{.*}} void @test_vst3_lane_p16(ptr noundef %a, [3 x <4 x i16>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.poly16x4x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
-// CHECK: call void @llvm.aarch64.neon.st3lane.v4i16.p0(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i64 3, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3_lane_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY16X4X3_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY16X4X3_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <4 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <4 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <4 x i16>] [[B_COERCE]], 2
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[__S1]], ptr noundef nonnull align 8 dereferenceable(24) [[B]], i64 24, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v4i16.p0(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]], i64 3, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst3_lane_p16(poly16_t *a, poly16x4x3_t b) {
vst3_lane_p16(a, b, 3);
}
-// CHECK-LABEL: define{{.*}} void @test_vst3_lane_p64(ptr noundef %a, [3 x <1 x i64>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.poly64x1x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly64x1x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x1x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly64x1x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x1x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly64x1x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
-// CHECK: call void @llvm.aarch64.neon.st3lane.v1i64.p0(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], i64 0, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3_lane_p64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY64X1X3_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY64X1X3_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <1 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <1 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <1 x i64>] [[B_COERCE]], 2
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[__S1]], ptr noundef nonnull align 8 dereferenceable(24) [[B]], i64 24, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v1i64.p0(<1 x i64> [[TMP0]], <1 x i64> [[TMP1]], <1 x i64> [[TMP2]], i64 0, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst3_lane_p64(poly64_t *a, poly64x1x3_t b) {
vst3_lane_p64(a, b, 0);
}
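
For readers following along: the regenerated checks for the vst3_lane tests above all capture one pattern — the coerced [3 x <...>] argument is repacked element by element into the %b alloca, memcpy'd into __S1, and the three vectors are reloaded and passed to @llvm.aarch64.neon.st3lane.*. At the source level the intrinsic stores one lane from each of the three vectors to consecutive elements. A minimal usage sketch, with hypothetical values and output buffer (illustration only, not part of the patch):

#include <arm_neon.h>

// Illustration only: store lane 7 of each of the three 8-byte
// vectors to out[0], out[1], out[2].
void demo_vst3_lane(poly8_t *out) {
  poly8x8x3_t v;
  v.val[0] = vdup_n_p8(0x11);
  v.val[1] = vdup_n_p8(0x22);
  v.val[2] = vdup_n_p8(0x33);
  vst3_lane_p8(out, v, 7); // out[0]=0x11, out[1]=0x22, out[2]=0x33
}
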
-// CHECK-LABEL: define{{.*}} void @test_vst4q_lane_u8(ptr noundef %a, [4 x <16 x i8>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.uint8x16x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP5:%.*]] = load <16 x i8>, ptr [[ARRAYIDX6]], align 16
-// CHECK: call void @llvm.aarch64.neon.st4lane.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i64 15, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4q_lane_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT8X16X4_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT8X16X4_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 2
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 48
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 3
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[__S1]], ptr noundef nonnull align 16 dereferenceable(64) [[B]], i64 64, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 48
+// CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v16i8.p0(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i64 15, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst4q_lane_u8(uint8_t *a, uint8x16x4_t b) {
vst4q_lane_u8(a, b, 15);
}
-// CHECK-LABEL: define{{.*}} void @test_vst4q_lane_u16(ptr noundef %a, [4 x <8 x i16>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.uint16x8x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
-// CHECK: call void @llvm.aarch64.neon.st4lane.v8i16.p0(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i64 7, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4q_lane_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT16X8X4_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT16X8X4_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 2
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 48
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 3
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[__S1]], ptr noundef nonnull align 16 dereferenceable(64) [[B]], i64 64, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 48
+// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v8i16.p0(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], i64 7, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst4q_lane_u16(uint16_t *a, uint16x8x4_t b) {
vst4q_lane_u16(a, b, 7);
}
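
The vst4q_lane tests follow the same repack-then-store shape with four 16-byte vectors (a 64-byte aggregate, hence alignstack(16) and dereferenceable(64) in the checks). A minimal usage sketch of the intrinsic under test, with made-up values (illustration only, not part of the patch):

#include <arm_neon.h>

// Illustration only: store lane 7 of each of the four vectors,
// interleaved, to out[0..3].
void demo_vst4q_lane(uint16_t *out) {
  uint16x8x4_t v;
  v.val[0] = vdupq_n_u16(1);
  v.val[1] = vdupq_n_u16(2);
  v.val[2] = vdupq_n_u16(3);
  v.val[3] = vdupq_n_u16(4);
  vst4q_lane_u16(out, v, 7); // out = {1, 2, 3, 4}
}
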
-// CHECK-LABEL: define{{.*}} void @test_vst4q_lane_u32(ptr noundef %a, [4 x <4 x i32>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.uint32x4x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <4 x i32>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
-// CHECK: call void @llvm.aarch64.neon.st4lane.v4i32.p0(<4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], i64 3, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4q_lane_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <4 x i32>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT32X4X4_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT32X4X4_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <4 x i32>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <4 x i32>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <4 x i32>] [[B_COERCE]], 2
+// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 48
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <4 x i32>] [[B_COERCE]], 3
+// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[__S1]], ptr noundef nonnull align 16 dereferenceable(64) [[B]], i64 64, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 48
+// CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v4i32.p0(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], i64 3, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst4q_lane_u32(uint32_t *a, uint32x4x4_t b) {
vst4q_lane_u32(a, b, 3);
}
-// CHECK-LABEL: define{{.*}} void @test_vst4q_lane_u64(ptr noundef %a, [4 x <2 x i64>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.uint64x2x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x2x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <2 x i64>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64>
-// CHECK: call void @llvm.aarch64.neon.st4lane.v2i64.p0(<2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], i64 1, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4q_lane_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT64X2X4_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT64X2X4_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 2
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 48
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 3
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[__S1]], ptr noundef nonnull align 16 dereferenceable(64) [[B]], i64 64, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 48
+// CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v2i64.p0(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], i64 1, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst4q_lane_u64(uint64_t *a, uint64x2x4_t b) {
vst4q_lane_u64(a, b, 1);
}
-// CHECK-LABEL: define{{.*}} void @test_vst4q_lane_s8(ptr noundef %a, [4 x <16 x i8>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.int8x16x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP5:%.*]] = load <16 x i8>, ptr [[ARRAYIDX6]], align 16
-// CHECK: call void @llvm.aarch64.neon.st4lane.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i64 15, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4q_lane_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT8X16X4_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT8X16X4_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 2
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 48
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 3
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[__S1]], ptr noundef nonnull align 16 dereferenceable(64) [[B]], i64 64, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 48
+// CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v16i8.p0(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i64 15, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst4q_lane_s8(int8_t *a, int8x16x4_t b) {
vst4q_lane_s8(a, b, 15);
}
-// CHECK-LABEL: define{{.*}} void @test_vst4q_lane_s16(ptr noundef %a, [4 x <8 x i16>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.int16x8x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
-// CHECK: call void @llvm.aarch64.neon.st4lane.v8i16.p0(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i64 7, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4q_lane_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT16X8X4_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT16X8X4_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 2
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 48
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 3
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[__S1]], ptr noundef nonnull align 16 dereferenceable(64) [[B]], i64 64, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 48
+// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v8i16.p0(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], i64 7, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst4q_lane_s16(int16_t *a, int16x8x4_t b) {
vst4q_lane_s16(a, b, 7);
}
-// CHECK-LABEL: define{{.*}} void @test_vst4q_lane_s32(ptr noundef %a, [4 x <4 x i32>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.int32x4x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <4 x i32>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
-// CHECK: call void @llvm.aarch64.neon.st4lane.v4i32.p0(<4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], i64 3, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4q_lane_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <4 x i32>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT32X4X4_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT32X4X4_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <4 x i32>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <4 x i32>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <4 x i32>] [[B_COERCE]], 2
+// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 48
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <4 x i32>] [[B_COERCE]], 3
+// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[__S1]], ptr noundef nonnull align 16 dereferenceable(64) [[B]], i64 64, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 48
+// CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v4i32.p0(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], i64 3, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst4q_lane_s32(int32_t *a, int32x4x4_t b) {
vst4q_lane_s32(a, b, 3);
}
-// CHECK-LABEL: define{{.*}} void @test_vst4q_lane_s64(ptr noundef %a, [4 x <2 x i64>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.int64x2x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int64x2x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x2x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <2 x i64>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64>
-// CHECK: call void @llvm.aarch64.neon.st4lane.v2i64.p0(<2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], i64 1, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4q_lane_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT64X2X4_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT64X2X4_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 2
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 48
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 3
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[__S1]], ptr noundef nonnull align 16 dereferenceable(64) [[B]], i64 64, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 48
+// CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v2i64.p0(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], i64 1, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst4q_lane_s64(int64_t *a, int64x2x4_t b) {
vst4q_lane_s64(a, b, 1);
}
-// CHECK-LABEL: define{{.*}} void @test_vst4q_lane_f16(ptr noundef %a, [4 x <8 x half>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.float16x8x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <8 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <8 x half>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <8 x half>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP10:%.*]] = bitcast <8 x half> [[TMP9]] to <16 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x half>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x half>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x half>
-// CHECK: call void @llvm.aarch64.neon.st4lane.v8f16.p0(<8 x half> [[TMP11]], <8 x half> [[TMP12]], <8 x half> [[TMP13]], <8 x half> [[TMP14]], i64 7, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4q_lane_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <8 x half>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT16X8X4_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X8X4_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <8 x half>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x half> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <8 x half>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x half> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <8 x half>] [[B_COERCE]], 2
+// CHECK-NEXT: store <8 x half> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 48
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <8 x half>] [[B_COERCE]], 3
+// CHECK-NEXT: store <8 x half> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[__S1]], ptr noundef nonnull align 16 dereferenceable(64) [[B]], i64 64, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x half>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x half>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 48
+// CHECK-NEXT: [[TMP3:%.*]] = load <8 x half>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v8f16.p0(<8 x half> [[TMP0]], <8 x half> [[TMP1]], <8 x half> [[TMP2]], <8 x half> [[TMP3]], i64 7, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst4q_lane_f16(float16_t *a, float16x8x4_t b) {
vst4q_lane_f16(a, b, 7);
}
-// CHECK-LABEL: define{{.*}} void @test_vst4q_lane_f32(ptr noundef %a, [4 x <4 x float>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.float32x4x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <4 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <4 x float>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <4 x float>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP10:%.*]] = bitcast <4 x float> [[TMP9]] to <16 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x float>
-// CHECK: call void @llvm.aarch64.neon.st4lane.v4f32.p0(<4 x float> [[TMP11]], <4 x float> [[TMP12]], <4 x float> [[TMP13]], <4 x float> [[TMP14]], i64 3, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4q_lane_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <4 x float>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT32X4X4_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X4X4_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <4 x float>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x float> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <4 x float>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x float> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <4 x float>] [[B_COERCE]], 2
+// CHECK-NEXT: store <4 x float> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 48
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <4 x float>] [[B_COERCE]], 3
+// CHECK-NEXT: store <4 x float> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[__S1]], ptr noundef nonnull align 16 dereferenceable(64) [[B]], i64 64, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 48
+// CHECK-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v4f32.p0(<4 x float> [[TMP0]], <4 x float> [[TMP1]], <4 x float> [[TMP2]], <4 x float> [[TMP3]], i64 3, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst4q_lane_f32(float32_t *a, float32x4x4_t b) {
vst4q_lane_f32(a, b, 3);
}
-// CHECK-LABEL: define{{.*}} void @test_vst4q_lane_f64(ptr noundef %a, [4 x <2 x double>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.float64x2x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float64x2x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x2x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <2 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x double>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x double>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x double>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <2 x double> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x double>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <2 x double>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <2 x double> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x double>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <2 x double>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP10:%.*]] = bitcast <2 x double> [[TMP9]] to <16 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x double>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x double>
-// CHECK: call void @llvm.aarch64.neon.st4lane.v2f64.p0(<2 x double> [[TMP11]], <2 x double> [[TMP12]], <2 x double> [[TMP13]], <2 x double> [[TMP14]], i64 1, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4q_lane_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <2 x double>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT64X2X4_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT64X2X4_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <2 x double>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x double> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <2 x double>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x double> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <2 x double>] [[B_COERCE]], 2
+// CHECK-NEXT: store <2 x double> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 48
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <2 x double>] [[B_COERCE]], 3
+// CHECK-NEXT: store <2 x double> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[__S1]], ptr noundef nonnull align 16 dereferenceable(64) [[B]], i64 64, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 48
+// CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v2f64.p0(<2 x double> [[TMP0]], <2 x double> [[TMP1]], <2 x double> [[TMP2]], <2 x double> [[TMP3]], i64 1, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst4q_lane_f64(float64_t *a, float64x2x4_t b) {
vst4q_lane_f64(a, b, 1);
}
-// CHECK-LABEL: define{{.*}} void @test_vst4q_lane_p8(ptr noundef %a, [4 x <16 x i8>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.poly8x16x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP5:%.*]] = load <16 x i8>, ptr [[ARRAYIDX6]], align 16
-// CHECK: call void @llvm.aarch64.neon.st4lane.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i64 15, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4q_lane_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY8X16X4_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY8X16X4_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 2
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 48
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 3
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[__S1]], ptr noundef nonnull align 16 dereferenceable(64) [[B]], i64 64, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 48
+// CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v16i8.p0(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i64 15, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst4q_lane_p8(poly8_t *a, poly8x16x4_t b) {
vst4q_lane_p8(a, b, 15);
}
-// CHECK-LABEL: define{{.*}} void @test_vst4q_lane_p16(ptr noundef %a, [4 x <8 x i16>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.poly16x8x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
-// CHECK: call void @llvm.aarch64.neon.st4lane.v8i16.p0(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i64 7, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4q_lane_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY16X8X4_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY16X8X4_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 2
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 48
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 3
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[__S1]], ptr noundef nonnull align 16 dereferenceable(64) [[B]], i64 64, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 48
+// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v8i16.p0(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], i64 7, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst4q_lane_p16(poly16_t *a, poly16x8x4_t b) {
vst4q_lane_p16(a, b, 7);
}
-// CHECK-LABEL: define{{.*}} void @test_vst4q_lane_p64(ptr noundef %a, [4 x <2 x i64>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.poly64x2x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly64x2x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x2x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <2 x i64>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64>
-// CHECK: call void @llvm.aarch64.neon.st4lane.v2i64.p0(<2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], i64 1, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4q_lane_p64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY64X2X4_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY64X2X4_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 2
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 48
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 3
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[__S1]], ptr noundef nonnull align 16 dereferenceable(64) [[B]], i64 64, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 48
+// CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v2i64.p0(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], i64 1, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst4q_lane_p64(poly64_t *a, poly64x2x4_t b) {
vst4q_lane_p64(a, b, 1);
}
-// CHECK-LABEL: define{{.*}} void @test_vst4_lane_u8(ptr noundef %a, [4 x <8 x i8>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
-// CHECK: call void @llvm.aarch64.neon.st4lane.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4_lane_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT8X8X4_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT8X8X4_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 2
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 24
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 3
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[__S1]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 24
+// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v8i8.p0(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i64 7, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst4_lane_u8(uint8_t *a, uint8x8x4_t b) {
vst4_lane_u8(a, b, 7);
}
-// CHECK-LABEL: define{{.*}} void @test_vst4_lane_u16(ptr noundef %a, [4 x <4 x i16>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.uint16x4x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
-// CHECK: call void @llvm.aarch64.neon.st4lane.v4i16.p0(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i64 3, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4_lane_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT16X4X4_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT16X4X4_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 2
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 24
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 3
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[__S1]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 24
+// CHECK-NEXT: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v4i16.p0(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], i64 3, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst4_lane_u16(uint16_t *a, uint16x4x4_t b) {
vst4_lane_u16(a, b, 3);
}
-// CHECK-LABEL: define{{.*}} void @test_vst4_lane_u32(ptr noundef %a, [4 x <2 x i32>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.uint32x2x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
-// CHECK: call void @llvm.aarch64.neon.st4lane.v2i32.p0(<2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], i64 1, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4_lane_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <2 x i32>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT32X2X4_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT32X2X4_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <2 x i32>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <2 x i32>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <2 x i32>] [[B_COERCE]], 2
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 24
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <2 x i32>] [[B_COERCE]], 3
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[__S1]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 24
+// CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v2i32.p0(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], i64 1, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst4_lane_u32(uint32_t *a, uint32x2x4_t b) {
vst4_lane_u32(a, b, 1);
}
-// CHECK-LABEL: define{{.*}} void @test_vst4_lane_u64(ptr noundef %a, [4 x <1 x i64>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.uint64x1x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <1 x i64>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x i64>
-// CHECK: call void @llvm.aarch64.neon.st4lane.v1i64.p0(<1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], i64 0, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4_lane_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT64X1X4_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT64X1X4_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 2
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 24
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 3
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[__S1]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 24
+// CHECK-NEXT: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v1i64.p0(<1 x i64> [[TMP0]], <1 x i64> [[TMP1]], <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], i64 0, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst4_lane_u64(uint64_t *a, uint64x1x4_t b) {
vst4_lane_u64(a, b, 0);
}
-// CHECK-LABEL: define{{.*}} void @test_vst4_lane_s8(ptr noundef %a, [4 x <8 x i8>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.int8x8x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
-// CHECK: call void @llvm.aarch64.neon.st4lane.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4_lane_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT8X8X4_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT8X8X4_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 2
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 24
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 3
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[__S1]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 24
+// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v8i8.p0(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i64 7, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst4_lane_s8(int8_t *a, int8x8x4_t b) {
vst4_lane_s8(a, b, 7);
}
-// CHECK-LABEL: define{{.*}} void @test_vst4_lane_s16(ptr noundef %a, [4 x <4 x i16>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.int16x4x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
-// CHECK: call void @llvm.aarch64.neon.st4lane.v4i16.p0(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i64 3, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4_lane_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT16X4X4_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT16X4X4_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 2
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 24
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 3
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[__S1]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 24
+// CHECK-NEXT: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v4i16.p0(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], i64 3, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst4_lane_s16(int16_t *a, int16x4x4_t b) {
vst4_lane_s16(a, b, 3);
}
-// CHECK-LABEL: define{{.*}} void @test_vst4_lane_s32(ptr noundef %a, [4 x <2 x i32>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.int32x2x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
-// CHECK: call void @llvm.aarch64.neon.st4lane.v2i32.p0(<2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], i64 1, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4_lane_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <2 x i32>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT32X2X4_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT32X2X4_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <2 x i32>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <2 x i32>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <2 x i32>] [[B_COERCE]], 2
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 24
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <2 x i32>] [[B_COERCE]], 3
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[__S1]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 24
+// CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v2i32.p0(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], i64 1, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst4_lane_s32(int32_t *a, int32x2x4_t b) {
vst4_lane_s32(a, b, 1);
}
-// CHECK-LABEL: define{{.*}} void @test_vst4_lane_s64(ptr noundef %a, [4 x <1 x i64>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.int64x1x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <1 x i64>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x i64>
-// CHECK: call void @llvm.aarch64.neon.st4lane.v1i64.p0(<1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], i64 0, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4_lane_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT64X1X4_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT64X1X4_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 2
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 24
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 3
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[__S1]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 24
+// CHECK-NEXT: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v1i64.p0(<1 x i64> [[TMP0]], <1 x i64> [[TMP1]], <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], i64 0, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst4_lane_s64(int64_t *a, int64x1x4_t b) {
vst4_lane_s64(a, b, 0);
}
-// CHECK-LABEL: define{{.*}} void @test_vst4_lane_f16(ptr noundef %a, [4 x <4 x half>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.float16x4x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <4 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <4 x half>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <4 x half>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <4 x half> [[TMP9]] to <8 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x half>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x half>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x half>
-// CHECK: call void @llvm.aarch64.neon.st4lane.v4f16.p0(<4 x half> [[TMP11]], <4 x half> [[TMP12]], <4 x half> [[TMP13]], <4 x half> [[TMP14]], i64 3, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4_lane_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <4 x half>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT16X4X4_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X4X4_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <4 x half>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x half> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <4 x half>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x half> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <4 x half>] [[B_COERCE]], 2
+// CHECK-NEXT: store <4 x half> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 24
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <4 x half>] [[B_COERCE]], 3
+// CHECK-NEXT: store <4 x half> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[__S1]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x half>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x half>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 24
+// CHECK-NEXT: [[TMP3:%.*]] = load <4 x half>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v4f16.p0(<4 x half> [[TMP0]], <4 x half> [[TMP1]], <4 x half> [[TMP2]], <4 x half> [[TMP3]], i64 3, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst4_lane_f16(float16_t *a, float16x4x4_t b) {
vst4_lane_f16(a, b, 3);
}
-// CHECK-LABEL: define{{.*}} void @test_vst4_lane_f32(ptr noundef %a, [4 x <2 x float>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.float32x2x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <2 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <2 x float>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <2 x float> [[TMP9]] to <8 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x float>
-// CHECK: call void @llvm.aarch64.neon.st4lane.v2f32.p0(<2 x float> [[TMP11]], <2 x float> [[TMP12]], <2 x float> [[TMP13]], <2 x float> [[TMP14]], i64 1, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4_lane_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <2 x float>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT32X2X4_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X2X4_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <2 x float>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x float> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <2 x float>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x float> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <2 x float>] [[B_COERCE]], 2
+// CHECK-NEXT: store <2 x float> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 24
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <2 x float>] [[B_COERCE]], 3
+// CHECK-NEXT: store <2 x float> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[__S1]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 24
+// CHECK-NEXT: [[TMP3:%.*]] = load <2 x float>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v2f32.p0(<2 x float> [[TMP0]], <2 x float> [[TMP1]], <2 x float> [[TMP2]], <2 x float> [[TMP3]], i64 1, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst4_lane_f32(float32_t *a, float32x2x4_t b) {
vst4_lane_f32(a, b, 1);
}
-// CHECK-LABEL: define{{.*}} void @test_vst4_lane_f64(ptr noundef %a, [4 x <1 x double>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.float64x1x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float64x1x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x1x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <1 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x double>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <1 x double>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <1 x double> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x double>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <1 x double>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <1 x double> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x double>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <1 x double>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <1 x double> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x double>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <1 x double>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <1 x double> [[TMP9]] to <8 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x double>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x double>
-// CHECK: call void @llvm.aarch64.neon.st4lane.v1f64.p0(<1 x double> [[TMP11]], <1 x double> [[TMP12]], <1 x double> [[TMP13]], <1 x double> [[TMP14]], i64 0, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4_lane_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <1 x double>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT64X1X4_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT64X1X4_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <1 x double>] [[B_COERCE]], 0
+// CHECK-NEXT: store <1 x double> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <1 x double>] [[B_COERCE]], 1
+// CHECK-NEXT: store <1 x double> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <1 x double>] [[B_COERCE]], 2
+// CHECK-NEXT: store <1 x double> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 24
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <1 x double>] [[B_COERCE]], 3
+// CHECK-NEXT: store <1 x double> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[__S1]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x double>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <1 x double>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <1 x double>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 24
+// CHECK-NEXT: [[TMP3:%.*]] = load <1 x double>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v1f64.p0(<1 x double> [[TMP0]], <1 x double> [[TMP1]], <1 x double> [[TMP2]], <1 x double> [[TMP3]], i64 0, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst4_lane_f64(float64_t *a, float64x1x4_t b) {
vst4_lane_f64(a, b, 0);
}
-// CHECK-LABEL: define{{.*}} void @test_vst4_lane_p8(ptr noundef %a, [4 x <8 x i8>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
-// CHECK: call void @llvm.aarch64.neon.st4lane.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4_lane_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY8X8X4_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY8X8X4_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 2
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 24
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 3
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[__S1]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 24
+// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v8i8.p0(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i64 7, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst4_lane_p8(poly8_t *a, poly8x8x4_t b) {
vst4_lane_p8(a, b, 7);
}
-// CHECK-LABEL: define{{.*}} void @test_vst4_lane_p16(ptr noundef %a, [4 x <4 x i16>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.poly16x4x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
-// CHECK: call void @llvm.aarch64.neon.st4lane.v4i16.p0(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i64 3, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4_lane_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY16X4X4_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY16X4X4_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 2
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 24
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 3
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[__S1]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 24
+// CHECK-NEXT: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v4i16.p0(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], i64 3, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst4_lane_p16(poly16_t *a, poly16x4x4_t b) {
vst4_lane_p16(a, b, 3);
}
-// CHECK-LABEL: define{{.*}} void @test_vst4_lane_p64(ptr noundef %a, [4 x <1 x i64>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.poly64x1x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly64x1x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x1x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <1 x i64>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x i64>
-// CHECK: call void @llvm.aarch64.neon.st4lane.v1i64.p0(<1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], i64 0, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4_lane_p64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY64X1X4_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY64X1X4_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 2
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 24
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 3
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[__S1]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 24
+// CHECK-NEXT: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v1i64.p0(<1 x i64> [[TMP0]], <1 x i64> [[TMP1]], <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], i64 0, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst4_lane_p64(poly64_t *a, poly64x1x4_t b) {
vst4_lane_p64(a, b, 0);
}
diff --git a/clang/test/CodeGen/AArch64/neon-misc-constrained.c b/clang/test/CodeGen/AArch64/neon-misc-constrained.c
index e24e129d2bc7d0..be144fcdc31c58 100644
--- a/clang/test/CodeGen/AArch64/neon-misc-constrained.c
+++ b/clang/test/CodeGen/AArch64/neon-misc-constrained.c
@@ -1,17 +1,11 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
// RUN: -disable-O0-optnone -emit-llvm -o - %s \
-// RUN: | opt -S -passes=mem2reg | FileCheck --check-prefix=COMMON --check-prefix=COMMONIR --check-prefix=UNCONSTRAINED %s
+// RUN: | opt -S -passes=mem2reg,instcombine | FileCheck --check-prefix=UNCONSTRAINED %s
// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
// RUN: -ffp-exception-behavior=strict \
// RUN: -disable-O0-optnone -emit-llvm -o - %s \
-// RUN: | opt -S -passes=mem2reg | FileCheck --check-prefix=COMMON --check-prefix=COMMONIR --check-prefix=CONSTRAINED %s
-// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
-// RUN: -disable-O0-optnone -S -o - %s \
-// RUN: | FileCheck --check-prefix=COMMON --check-prefix=CHECK-ASM %s
-// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
-// RUN: -ffp-exception-behavior=strict \
-// RUN: -disable-O0-optnone -S -o - %s \
-// RUN: | FileCheck --check-prefix=COMMON --check-prefix=CHECK-ASM %s
+// RUN: | opt -S -passes=mem2reg,instcombine | FileCheck --check-prefix=CONSTRAINED %s
// REQUIRES: aarch64-registered-target
@@ -19,42 +13,66 @@
#include <arm_neon.h>
-// COMMON-LABEL: test_vrndaq_f64
-// COMMONIR: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// UNCONSTRAINED: [[VRNDA1_I:%.*]] = call <2 x double> @llvm.round.v2f64(<2 x double> %a)
-// CONSTRAINED: [[VRNDA1_I:%.*]] = call <2 x double> @llvm.experimental.constrained.round.v2f64(<2 x double> %a, metadata !"fpexcept.strict")
-// CHECK-ASM: frinta v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
-// COMMONIR: ret <2 x double> [[VRNDA1_I]]
+// UNCONSTRAINED-LABEL: define dso_local <2 x double> @test_vrndaq_f64(
+// UNCONSTRAINED-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[VRNDA1_I:%.*]] = call <2 x double> @llvm.round.v2f64(<2 x double> [[A]])
+// UNCONSTRAINED-NEXT: ret <2 x double> [[VRNDA1_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <2 x double> @test_vrndaq_f64(
+// CONSTRAINED-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[VRNDA1_I:%.*]] = call <2 x double> @llvm.experimental.constrained.round.v2f64(<2 x double> [[A]], metadata !"fpexcept.strict") #[[ATTR2:[0-9]+]]
+// CONSTRAINED-NEXT: ret <2 x double> [[VRNDA1_I]]
+//
float64x2_t test_vrndaq_f64(float64x2_t a) {
return vrndaq_f64(a);
}
-// COMMON-LABEL: test_vrndpq_f64
-// COMMONIR: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// UNCONSTRAINED: [[VRNDP1_I:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> %a)
-// CONSTRAINED: [[VRNDP1_I:%.*]] = call <2 x double> @llvm.experimental.constrained.ceil.v2f64(<2 x double> %a, metadata !"fpexcept.strict")
-// CHECK-ASM: frintp v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
-// COMMONIR: ret <2 x double> [[VRNDP1_I]]
+// UNCONSTRAINED-LABEL: define dso_local <2 x double> @test_vrndpq_f64(
+// UNCONSTRAINED-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[VRNDP1_I:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[A]])
+// UNCONSTRAINED-NEXT: ret <2 x double> [[VRNDP1_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <2 x double> @test_vrndpq_f64(
+// CONSTRAINED-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[VRNDP1_I:%.*]] = call <2 x double> @llvm.experimental.constrained.ceil.v2f64(<2 x double> [[A]], metadata !"fpexcept.strict") #[[ATTR2]]
+// CONSTRAINED-NEXT: ret <2 x double> [[VRNDP1_I]]
+//
float64x2_t test_vrndpq_f64(float64x2_t a) {
return vrndpq_f64(a);
}
-// COMMON-LABEL: test_vsqrtq_f32
-// COMMONIR: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// UNCONSTRAINED: [[VSQRT_I:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a)
-// CONSTRAINED: [[VSQRT_I:%.*]] = call <4 x float> @llvm.experimental.constrained.sqrt.v4f32(<4 x float> %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK-ASM: fsqrt v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
-// COMMONIR: ret <4 x float> [[VSQRT_I]]
+// UNCONSTRAINED-LABEL: define dso_local <4 x float> @test_vsqrtq_f32(
+// UNCONSTRAINED-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[VSQRT_I:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[A]])
+// UNCONSTRAINED-NEXT: ret <4 x float> [[VSQRT_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <4 x float> @test_vsqrtq_f32(
+// CONSTRAINED-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[VSQRT_I:%.*]] = call <4 x float> @llvm.experimental.constrained.sqrt.v4f32(<4 x float> [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR2]]
+// CONSTRAINED-NEXT: ret <4 x float> [[VSQRT_I]]
+//
float32x4_t test_vsqrtq_f32(float32x4_t a) {
return vsqrtq_f32(a);
}
-// COMMON-LABEL: test_vsqrtq_f64
-// COMMONIR: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// UNCONSTRAINED: [[VSQRT_I:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %a)
-// CONSTRAINED: [[VSQRT_I:%.*]] = call <2 x double> @llvm.experimental.constrained.sqrt.v2f64(<2 x double> %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK-ASM: fsqrt v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
-// COMMONIR: ret <2 x double> [[VSQRT_I]]
+// UNCONSTRAINED-LABEL: define dso_local <2 x double> @test_vsqrtq_f64(
+// UNCONSTRAINED-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[VSQRT_I:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[A]])
+// UNCONSTRAINED-NEXT: ret <2 x double> [[VSQRT_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <2 x double> @test_vsqrtq_f64(
+// CONSTRAINED-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[VSQRT_I:%.*]] = call <2 x double> @llvm.experimental.constrained.sqrt.v2f64(<2 x double> [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR2]]
+// CONSTRAINED-NEXT: ret <2 x double> [[VSQRT_I]]
+//
float64x2_t test_vsqrtq_f64(float64x2_t a) {
return vsqrtq_f64(a);
}
diff --git a/clang/test/CodeGen/AArch64/neon-misc.c b/clang/test/CodeGen/AArch64/neon-misc.c
index 165f33a9f399fe..c840e4753ab6bc 100644
--- a/clang/test/CodeGen/AArch64/neon-misc.c
+++ b/clang/test/CodeGen/AArch64/neon-misc.c
@@ -1,2718 +1,3416 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
// RUN: -disable-O0-optnone -emit-llvm -o - %s \
-// RUN: | opt -S -passes=mem2reg | FileCheck %s
+// RUN: | opt -S -passes=mem2reg,instcombine | FileCheck %s
// REQUIRES: aarch64-registered-target || arm-registered-target
#include <arm_neon.h>
-// CHECK-LABEL: @test_vceqz_s8(
-// CHECK: [[TMP0:%.*]] = icmp eq <8 x i8> %a, zeroinitializer
-// CHECK: [[VCEQZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8>
-// CHECK: ret <8 x i8> [[VCEQZ_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vceqz_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = icmp eq <8 x i8> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[VCEQZ_I]]
+//
uint8x8_t test_vceqz_s8(int8x8_t a) {
return vceqz_s8(a);
}
-// CHECK-LABEL: @test_vceqz_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = icmp eq <4 x i16> %a, zeroinitializer
-// CHECK: [[VCEQZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16>
-// CHECK: ret <4 x i16> [[VCEQZ_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vceqz_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = icmp eq <4 x i16> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext <4 x i1> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[VCEQZ_I]]
+//
uint16x4_t test_vceqz_s16(int16x4_t a) {
return vceqz_s16(a);
}
-// CHECK-LABEL: @test_vceqz_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = icmp eq <2 x i32> %a, zeroinitializer
-// CHECK: [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i32>
-// CHECK: ret <2 x i32> [[VCEQZ_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vceqz_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = icmp eq <2 x i32> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[VCEQZ_I]]
+//
uint32x2_t test_vceqz_s32(int32x2_t a) {
return vceqz_s32(a);
}
-// CHECK-LABEL: @test_vceqz_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = icmp eq <1 x i64> %a, zeroinitializer
-// CHECK: [[VCEQZ_I:%.*]] = sext <1 x i1> [[TMP1]] to <1 x i64>
-// CHECK: ret <1 x i64> [[VCEQZ_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vceqz_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = icmp eq <1 x i64> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext <1 x i1> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[VCEQZ_I]]
+//
uint64x1_t test_vceqz_s64(int64x1_t a) {
return vceqz_s64(a);
}
-// CHECK-LABEL: @test_vceqz_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = icmp eq <1 x i64> %a, zeroinitializer
-// CHECK: [[VCEQZ_I:%.*]] = sext <1 x i1> [[TMP1]] to <1 x i64>
-// CHECK: ret <1 x i64> [[VCEQZ_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vceqz_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = icmp eq <1 x i64> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext <1 x i1> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[VCEQZ_I]]
+//
uint64x1_t test_vceqz_u64(uint64x1_t a) {
return vceqz_u64(a);
}
-// CHECK-LABEL: @test_vceqz_p64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = icmp eq <1 x i64> %a, zeroinitializer
-// CHECK: [[VCEQZ_I:%.*]] = sext <1 x i1> [[TMP1]] to <1 x i64>
-// CHECK: ret <1 x i64> [[VCEQZ_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vceqz_p64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = icmp eq <1 x i64> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext <1 x i1> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[VCEQZ_I]]
+//
uint64x1_t test_vceqz_p64(poly64x1_t a) {
return vceqz_p64(a);
}
-// CHECK-LABEL: @test_vceqzq_s8(
-// CHECK: [[TMP0:%.*]] = icmp eq <16 x i8> %a, zeroinitializer
-// CHECK: [[VCEQZ_I:%.*]] = sext <16 x i1> [[TMP0]] to <16 x i8>
-// CHECK: ret <16 x i8> [[VCEQZ_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vceqzq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = icmp eq <16 x i8> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext <16 x i1> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[VCEQZ_I]]
+//
uint8x16_t test_vceqzq_s8(int8x16_t a) {
return vceqzq_s8(a);
}
-// CHECK-LABEL: @test_vceqzq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = icmp eq <8 x i16> %a, zeroinitializer
-// CHECK: [[VCEQZ_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i16>
-// CHECK: ret <8 x i16> [[VCEQZ_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vceqzq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = icmp eq <8 x i16> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[VCEQZ_I]]
+//
uint16x8_t test_vceqzq_s16(int16x8_t a) {
return vceqzq_s16(a);
}
-// CHECK-LABEL: @test_vceqzq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = icmp eq <4 x i32> %a, zeroinitializer
-// CHECK: [[VCEQZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i32>
-// CHECK: ret <4 x i32> [[VCEQZ_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vceqzq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = icmp eq <4 x i32> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext <4 x i1> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[VCEQZ_I]]
+//
uint32x4_t test_vceqzq_s32(int32x4_t a) {
return vceqzq_s32(a);
}
-// CHECK-LABEL: @test_vceqzq_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = icmp eq <2 x i64> %a, zeroinitializer
-// CHECK: [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i64>
-// CHECK: ret <2 x i64> [[VCEQZ_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vceqzq_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = icmp eq <2 x i64> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[VCEQZ_I]]
+//
uint64x2_t test_vceqzq_s64(int64x2_t a) {
return vceqzq_s64(a);
}
-// CHECK-LABEL: @test_vceqz_u8(
-// CHECK: [[TMP0:%.*]] = icmp eq <8 x i8> %a, zeroinitializer
-// CHECK: [[VCEQZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8>
-// CHECK: ret <8 x i8> [[VCEQZ_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vceqz_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = icmp eq <8 x i8> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[VCEQZ_I]]
+//
uint8x8_t test_vceqz_u8(uint8x8_t a) {
return vceqz_u8(a);
}
-// CHECK-LABEL: @test_vceqz_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = icmp eq <4 x i16> %a, zeroinitializer
-// CHECK: [[VCEQZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16>
-// CHECK: ret <4 x i16> [[VCEQZ_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vceqz_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = icmp eq <4 x i16> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext <4 x i1> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[VCEQZ_I]]
+//
uint16x4_t test_vceqz_u16(uint16x4_t a) {
return vceqz_u16(a);
}
-// CHECK-LABEL: @test_vceqz_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = icmp eq <2 x i32> %a, zeroinitializer
-// CHECK: [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i32>
-// CHECK: ret <2 x i32> [[VCEQZ_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vceqz_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = icmp eq <2 x i32> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[VCEQZ_I]]
+//
uint32x2_t test_vceqz_u32(uint32x2_t a) {
return vceqz_u32(a);
}
-// CHECK-LABEL: @test_vceqzq_u8(
-// CHECK: [[TMP0:%.*]] = icmp eq <16 x i8> %a, zeroinitializer
-// CHECK: [[VCEQZ_I:%.*]] = sext <16 x i1> [[TMP0]] to <16 x i8>
-// CHECK: ret <16 x i8> [[VCEQZ_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vceqzq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = icmp eq <16 x i8> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext <16 x i1> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[VCEQZ_I]]
+//
uint8x16_t test_vceqzq_u8(uint8x16_t a) {
return vceqzq_u8(a);
}
-// CHECK-LABEL: @test_vceqzq_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = icmp eq <8 x i16> %a, zeroinitializer
-// CHECK: [[VCEQZ_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i16>
-// CHECK: ret <8 x i16> [[VCEQZ_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vceqzq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = icmp eq <8 x i16> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[VCEQZ_I]]
+//
uint16x8_t test_vceqzq_u16(uint16x8_t a) {
return vceqzq_u16(a);
}
-// CHECK-LABEL: @test_vceqzq_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = icmp eq <4 x i32> %a, zeroinitializer
-// CHECK: [[VCEQZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i32>
-// CHECK: ret <4 x i32> [[VCEQZ_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vceqzq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = icmp eq <4 x i32> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext <4 x i1> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[VCEQZ_I]]
+//
uint32x4_t test_vceqzq_u32(uint32x4_t a) {
return vceqzq_u32(a);
}
-// CHECK-LABEL: @test_vceqzq_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = icmp eq <2 x i64> %a, zeroinitializer
-// CHECK: [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i64>
-// CHECK: ret <2 x i64> [[VCEQZ_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vceqzq_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = icmp eq <2 x i64> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[VCEQZ_I]]
+//
uint64x2_t test_vceqzq_u64(uint64x2_t a) {
return vceqzq_u64(a);
}
-// CHECK-LABEL: @test_vceqz_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = fcmp oeq <2 x float> %a, zeroinitializer
-// CHECK: [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i32>
-// CHECK: ret <2 x i32> [[VCEQZ_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vceqz_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = fcmp oeq <2 x float> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[VCEQZ_I]]
+//
uint32x2_t test_vceqz_f32(float32x2_t a) {
return vceqz_f32(a);
}
-// CHECK-LABEL: @test_vceqz_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = fcmp oeq <1 x double> %a, zeroinitializer
-// CHECK: [[VCEQZ_I:%.*]] = sext <1 x i1> [[TMP1]] to <1 x i64>
-// CHECK: ret <1 x i64> [[VCEQZ_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vceqz_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = fcmp oeq <1 x double> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext <1 x i1> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[VCEQZ_I]]
+//
uint64x1_t test_vceqz_f64(float64x1_t a) {
return vceqz_f64(a);
}
-// CHECK-LABEL: @test_vceqzq_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = fcmp oeq <4 x float> %a, zeroinitializer
-// CHECK: [[VCEQZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i32>
-// CHECK: ret <4 x i32> [[VCEQZ_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vceqzq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = fcmp oeq <4 x float> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext <4 x i1> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[VCEQZ_I]]
+//
uint32x4_t test_vceqzq_f32(float32x4_t a) {
return vceqzq_f32(a);
}
-// CHECK-LABEL: @test_vceqz_p8(
-// CHECK: [[TMP0:%.*]] = icmp eq <8 x i8> %a, zeroinitializer
-// CHECK: [[VCEQZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8>
-// CHECK: ret <8 x i8> [[VCEQZ_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vceqz_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = icmp eq <8 x i8> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[VCEQZ_I]]
+//
uint8x8_t test_vceqz_p8(poly8x8_t a) {
return vceqz_p8(a);
}
-// CHECK-LABEL: @test_vceqzq_p8(
-// CHECK: [[TMP0:%.*]] = icmp eq <16 x i8> %a, zeroinitializer
-// CHECK: [[VCEQZ_I:%.*]] = sext <16 x i1> [[TMP0]] to <16 x i8>
-// CHECK: ret <16 x i8> [[VCEQZ_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vceqzq_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = icmp eq <16 x i8> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext <16 x i1> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[VCEQZ_I]]
+//
uint8x16_t test_vceqzq_p8(poly8x16_t a) {
return vceqzq_p8(a);
}
-// CHECK-LABEL: @test_vceqzq_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = fcmp oeq <2 x double> %a, zeroinitializer
-// CHECK: [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i64>
-// CHECK: ret <2 x i64> [[VCEQZ_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vceqzq_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = fcmp oeq <2 x double> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[VCEQZ_I]]
+//
uint64x2_t test_vceqzq_f64(float64x2_t a) {
return vceqzq_f64(a);
}
-// CHECK-LABEL: @test_vceqzq_p64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = icmp eq <2 x i64> %a, zeroinitializer
-// CHECK: [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i64>
-// CHECK: ret <2 x i64> [[VCEQZ_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vceqzq_p64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = icmp eq <2 x i64> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[VCEQZ_I]]
+//
uint64x2_t test_vceqzq_p64(poly64x2_t a) {
return vceqzq_p64(a);
}
-// CHECK-LABEL: @test_vcgez_s8(
-// CHECK: [[TMP0:%.*]] = icmp sge <8 x i8> %a, zeroinitializer
-// CHECK: [[VCGEZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8>
-// CHECK: ret <8 x i8> [[VCGEZ_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vcgez_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = icmp sgt <8 x i8> [[A]], splat (i8 -1)
+// CHECK-NEXT: [[VCGEZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[VCGEZ_I]]
+//
uint8x8_t test_vcgez_s8(int8x8_t a) {
return vcgez_s8(a);
}
-// CHECK-LABEL: @test_vcgez_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = icmp sge <4 x i16> %a, zeroinitializer
-// CHECK: [[VCGEZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16>
-// CHECK: ret <4 x i16> [[VCGEZ_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vcgez_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = icmp sgt <4 x i16> [[A]], splat (i16 -1)
+// CHECK-NEXT: [[VCGEZ_I:%.*]] = sext <4 x i1> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[VCGEZ_I]]
+//
uint16x4_t test_vcgez_s16(int16x4_t a) {
return vcgez_s16(a);
}
-// CHECK-LABEL: @test_vcgez_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = icmp sge <2 x i32> %a, zeroinitializer
-// CHECK: [[VCGEZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i32>
-// CHECK: ret <2 x i32> [[VCGEZ_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vcgez_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = icmp sgt <2 x i32> [[A]], splat (i32 -1)
+// CHECK-NEXT: [[VCGEZ_I:%.*]] = sext <2 x i1> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[VCGEZ_I]]
+//
uint32x2_t test_vcgez_s32(int32x2_t a) {
return vcgez_s32(a);
}
-// CHECK-LABEL: @test_vcgez_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = icmp sge <1 x i64> %a, zeroinitializer
-// CHECK: [[VCGEZ_I:%.*]] = sext <1 x i1> [[TMP1]] to <1 x i64>
-// CHECK: ret <1 x i64> [[VCGEZ_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vcgez_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = icmp sgt <1 x i64> [[A]], splat (i64 -1)
+// CHECK-NEXT: [[VCGEZ_I:%.*]] = sext <1 x i1> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[VCGEZ_I]]
+//
uint64x1_t test_vcgez_s64(int64x1_t a) {
return vcgez_s64(a);
}
-// CHECK-LABEL: @test_vcgezq_s8(
-// CHECK: [[TMP0:%.*]] = icmp sge <16 x i8> %a, zeroinitializer
-// CHECK: [[VCGEZ_I:%.*]] = sext <16 x i1> [[TMP0]] to <16 x i8>
-// CHECK: ret <16 x i8> [[VCGEZ_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vcgezq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = icmp sgt <16 x i8> [[A]], splat (i8 -1)
+// CHECK-NEXT: [[VCGEZ_I:%.*]] = sext <16 x i1> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[VCGEZ_I]]
+//
uint8x16_t test_vcgezq_s8(int8x16_t a) {
return vcgezq_s8(a);
}
-// CHECK-LABEL: @test_vcgezq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = icmp sge <8 x i16> %a, zeroinitializer
-// CHECK: [[VCGEZ_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i16>
-// CHECK: ret <8 x i16> [[VCGEZ_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vcgezq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = icmp sgt <8 x i16> [[A]], splat (i16 -1)
+// CHECK-NEXT: [[VCGEZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[VCGEZ_I]]
+//
uint16x8_t test_vcgezq_s16(int16x8_t a) {
return vcgezq_s16(a);
}
-// CHECK-LABEL: @test_vcgezq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = icmp sge <4 x i32> %a, zeroinitializer
-// CHECK: [[VCGEZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i32>
-// CHECK: ret <4 x i32> [[VCGEZ_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vcgezq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = icmp sgt <4 x i32> [[A]], splat (i32 -1)
+// CHECK-NEXT: [[VCGEZ_I:%.*]] = sext <4 x i1> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[VCGEZ_I]]
+//
uint32x4_t test_vcgezq_s32(int32x4_t a) {
return vcgezq_s32(a);
}
-// CHECK-LABEL: @test_vcgezq_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = icmp sge <2 x i64> %a, zeroinitializer
-// CHECK: [[VCGEZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i64>
-// CHECK: ret <2 x i64> [[VCGEZ_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vcgezq_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = icmp sgt <2 x i64> [[A]], splat (i64 -1)
+// CHECK-NEXT: [[VCGEZ_I:%.*]] = sext <2 x i1> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[VCGEZ_I]]
+//
uint64x2_t test_vcgezq_s64(int64x2_t a) {
return vcgezq_s64(a);
}
-// CHECK-LABEL: @test_vcgez_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = fcmp oge <2 x float> %a, zeroinitializer
-// CHECK: [[VCGEZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i32>
-// CHECK: ret <2 x i32> [[VCGEZ_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vcgez_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = fcmp oge <2 x float> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCGEZ_I:%.*]] = sext <2 x i1> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[VCGEZ_I]]
+//
uint32x2_t test_vcgez_f32(float32x2_t a) {
return vcgez_f32(a);
}
-// CHECK-LABEL: @test_vcgez_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = fcmp oge <1 x double> %a, zeroinitializer
-// CHECK: [[VCGEZ_I:%.*]] = sext <1 x i1> [[TMP1]] to <1 x i64>
-// CHECK: ret <1 x i64> [[VCGEZ_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vcgez_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = fcmp oge <1 x double> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCGEZ_I:%.*]] = sext <1 x i1> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[VCGEZ_I]]
+//
uint64x1_t test_vcgez_f64(float64x1_t a) {
return vcgez_f64(a);
}
-// CHECK-LABEL: @test_vcgezq_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = fcmp oge <4 x float> %a, zeroinitializer
-// CHECK: [[VCGEZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i32>
-// CHECK: ret <4 x i32> [[VCGEZ_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vcgezq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = fcmp oge <4 x float> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCGEZ_I:%.*]] = sext <4 x i1> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[VCGEZ_I]]
+//
uint32x4_t test_vcgezq_f32(float32x4_t a) {
return vcgezq_f32(a);
}
-// CHECK-LABEL: @test_vcgezq_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = fcmp oge <2 x double> %a, zeroinitializer
-// CHECK: [[VCGEZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i64>
-// CHECK: ret <2 x i64> [[VCGEZ_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vcgezq_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = fcmp oge <2 x double> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCGEZ_I:%.*]] = sext <2 x i1> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[VCGEZ_I]]
+//
uint64x2_t test_vcgezq_f64(float64x2_t a) {
return vcgezq_f64(a);
}
-// CHECK-LABEL: @test_vclez_s8(
-// CHECK: [[TMP0:%.*]] = icmp sle <8 x i8> %a, zeroinitializer
-// CHECK: [[VCLEZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8>
-// CHECK: ret <8 x i8> [[VCLEZ_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vclez_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = icmp slt <8 x i8> [[A]], splat (i8 1)
+// CHECK-NEXT: [[VCLEZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[VCLEZ_I]]
+//
uint8x8_t test_vclez_s8(int8x8_t a) {
return vclez_s8(a);
}
-// CHECK-LABEL: @test_vclez_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = icmp sle <4 x i16> %a, zeroinitializer
-// CHECK: [[VCLEZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16>
-// CHECK: ret <4 x i16> [[VCLEZ_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vclez_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = icmp slt <4 x i16> [[A]], splat (i16 1)
+// CHECK-NEXT: [[VCLEZ_I:%.*]] = sext <4 x i1> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[VCLEZ_I]]
+//
uint16x4_t test_vclez_s16(int16x4_t a) {
return vclez_s16(a);
}
-// CHECK-LABEL: @test_vclez_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = icmp sle <2 x i32> %a, zeroinitializer
-// CHECK: [[VCLEZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i32>
-// CHECK: ret <2 x i32> [[VCLEZ_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vclez_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = icmp slt <2 x i32> [[A]], splat (i32 1)
+// CHECK-NEXT: [[VCLEZ_I:%.*]] = sext <2 x i1> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[VCLEZ_I]]
+//
uint32x2_t test_vclez_s32(int32x2_t a) {
return vclez_s32(a);
}
-// CHECK-LABEL: @test_vclez_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = icmp sle <1 x i64> %a, zeroinitializer
-// CHECK: [[VCLEZ_I:%.*]] = sext <1 x i1> [[TMP1]] to <1 x i64>
-// CHECK: ret <1 x i64> [[VCLEZ_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vclez_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = icmp slt <1 x i64> [[A]], splat (i64 1)
+// CHECK-NEXT: [[VCLEZ_I:%.*]] = sext <1 x i1> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[VCLEZ_I]]
+//
uint64x1_t test_vclez_s64(int64x1_t a) {
return vclez_s64(a);
}
-// CHECK-LABEL: @test_vclezq_s8(
-// CHECK: [[TMP0:%.*]] = icmp sle <16 x i8> %a, zeroinitializer
-// CHECK: [[VCLEZ_I:%.*]] = sext <16 x i1> [[TMP0]] to <16 x i8>
-// CHECK: ret <16 x i8> [[VCLEZ_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vclezq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = icmp slt <16 x i8> [[A]], splat (i8 1)
+// CHECK-NEXT: [[VCLEZ_I:%.*]] = sext <16 x i1> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[VCLEZ_I]]
+//
uint8x16_t test_vclezq_s8(int8x16_t a) {
return vclezq_s8(a);
}
-// CHECK-LABEL: @test_vclezq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = icmp sle <8 x i16> %a, zeroinitializer
-// CHECK: [[VCLEZ_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i16>
-// CHECK: ret <8 x i16> [[VCLEZ_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vclezq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = icmp slt <8 x i16> [[A]], splat (i16 1)
+// CHECK-NEXT: [[VCLEZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[VCLEZ_I]]
+//
uint16x8_t test_vclezq_s16(int16x8_t a) {
return vclezq_s16(a);
}
-// CHECK-LABEL: @test_vclezq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = icmp sle <4 x i32> %a, zeroinitializer
-// CHECK: [[VCLEZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i32>
-// CHECK: ret <4 x i32> [[VCLEZ_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vclezq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = icmp slt <4 x i32> [[A]], splat (i32 1)
+// CHECK-NEXT: [[VCLEZ_I:%.*]] = sext <4 x i1> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[VCLEZ_I]]
+//
uint32x4_t test_vclezq_s32(int32x4_t a) {
return vclezq_s32(a);
}
-// CHECK-LABEL: @test_vclezq_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = icmp sle <2 x i64> %a, zeroinitializer
-// CHECK: [[VCLEZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i64>
-// CHECK: ret <2 x i64> [[VCLEZ_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vclezq_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = icmp slt <2 x i64> [[A]], splat (i64 1)
+// CHECK-NEXT: [[VCLEZ_I:%.*]] = sext <2 x i1> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[VCLEZ_I]]
+//
uint64x2_t test_vclezq_s64(int64x2_t a) {
return vclezq_s64(a);
}
-// CHECK-LABEL: @test_vclez_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = fcmp ole <2 x float> %a, zeroinitializer
-// CHECK: [[VCLEZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i32>
-// CHECK: ret <2 x i32> [[VCLEZ_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vclez_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = fcmp ole <2 x float> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCLEZ_I:%.*]] = sext <2 x i1> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[VCLEZ_I]]
+//
uint32x2_t test_vclez_f32(float32x2_t a) {
return vclez_f32(a);
}
-// CHECK-LABEL: @test_vclez_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = fcmp ole <1 x double> %a, zeroinitializer
-// CHECK: [[VCLEZ_I:%.*]] = sext <1 x i1> [[TMP1]] to <1 x i64>
-// CHECK: ret <1 x i64> [[VCLEZ_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vclez_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = fcmp ole <1 x double> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCLEZ_I:%.*]] = sext <1 x i1> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[VCLEZ_I]]
+//
uint64x1_t test_vclez_f64(float64x1_t a) {
return vclez_f64(a);
}
-// CHECK-LABEL: @test_vclezq_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = fcmp ole <4 x float> %a, zeroinitializer
-// CHECK: [[VCLEZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i32>
-// CHECK: ret <4 x i32> [[VCLEZ_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vclezq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = fcmp ole <4 x float> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCLEZ_I:%.*]] = sext <4 x i1> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[VCLEZ_I]]
+//
uint32x4_t test_vclezq_f32(float32x4_t a) {
return vclezq_f32(a);
}
-// CHECK-LABEL: @test_vclezq_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = fcmp ole <2 x double> %a, zeroinitializer
-// CHECK: [[VCLEZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i64>
-// CHECK: ret <2 x i64> [[VCLEZ_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vclezq_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = fcmp ole <2 x double> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCLEZ_I:%.*]] = sext <2 x i1> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[VCLEZ_I]]
+//
uint64x2_t test_vclezq_f64(float64x2_t a) {
return vclezq_f64(a);
}
-// CHECK-LABEL: @test_vcgtz_s8(
-// CHECK: [[TMP0:%.*]] = icmp sgt <8 x i8> %a, zeroinitializer
-// CHECK: [[VCGTZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8>
-// CHECK: ret <8 x i8> [[VCGTZ_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vcgtz_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = icmp sgt <8 x i8> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCGTZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[VCGTZ_I]]
+//
uint8x8_t test_vcgtz_s8(int8x8_t a) {
return vcgtz_s8(a);
}
-// CHECK-LABEL: @test_vcgtz_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = icmp sgt <4 x i16> %a, zeroinitializer
-// CHECK: [[VCGTZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16>
-// CHECK: ret <4 x i16> [[VCGTZ_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vcgtz_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = icmp sgt <4 x i16> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCGTZ_I:%.*]] = sext <4 x i1> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[VCGTZ_I]]
+//
uint16x4_t test_vcgtz_s16(int16x4_t a) {
return vcgtz_s16(a);
}
-// CHECK-LABEL: @test_vcgtz_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = icmp sgt <2 x i32> %a, zeroinitializer
-// CHECK: [[VCGTZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i32>
-// CHECK: ret <2 x i32> [[VCGTZ_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vcgtz_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = icmp sgt <2 x i32> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCGTZ_I:%.*]] = sext <2 x i1> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[VCGTZ_I]]
+//
uint32x2_t test_vcgtz_s32(int32x2_t a) {
return vcgtz_s32(a);
}
-// CHECK-LABEL: @test_vcgtz_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = icmp sgt <1 x i64> %a, zeroinitializer
-// CHECK: [[VCGTZ_I:%.*]] = sext <1 x i1> [[TMP1]] to <1 x i64>
-// CHECK: ret <1 x i64> [[VCGTZ_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vcgtz_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = icmp sgt <1 x i64> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCGTZ_I:%.*]] = sext <1 x i1> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[VCGTZ_I]]
+//
uint64x1_t test_vcgtz_s64(int64x1_t a) {
return vcgtz_s64(a);
}
-// CHECK-LABEL: @test_vcgtzq_s8(
-// CHECK: [[TMP0:%.*]] = icmp sgt <16 x i8> %a, zeroinitializer
-// CHECK: [[VCGTZ_I:%.*]] = sext <16 x i1> [[TMP0]] to <16 x i8>
-// CHECK: ret <16 x i8> [[VCGTZ_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vcgtzq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = icmp sgt <16 x i8> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCGTZ_I:%.*]] = sext <16 x i1> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[VCGTZ_I]]
+//
uint8x16_t test_vcgtzq_s8(int8x16_t a) {
return vcgtzq_s8(a);
}
-// CHECK-LABEL: @test_vcgtzq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = icmp sgt <8 x i16> %a, zeroinitializer
-// CHECK: [[VCGTZ_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i16>
-// CHECK: ret <8 x i16> [[VCGTZ_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vcgtzq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = icmp sgt <8 x i16> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCGTZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[VCGTZ_I]]
+//
uint16x8_t test_vcgtzq_s16(int16x8_t a) {
return vcgtzq_s16(a);
}
-// CHECK-LABEL: @test_vcgtzq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = icmp sgt <4 x i32> %a, zeroinitializer
-// CHECK: [[VCGTZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i32>
-// CHECK: ret <4 x i32> [[VCGTZ_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vcgtzq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = icmp sgt <4 x i32> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCGTZ_I:%.*]] = sext <4 x i1> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[VCGTZ_I]]
+//
uint32x4_t test_vcgtzq_s32(int32x4_t a) {
return vcgtzq_s32(a);
}
-// CHECK-LABEL: @test_vcgtzq_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = icmp sgt <2 x i64> %a, zeroinitializer
-// CHECK: [[VCGTZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i64>
-// CHECK: ret <2 x i64> [[VCGTZ_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vcgtzq_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = icmp sgt <2 x i64> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCGTZ_I:%.*]] = sext <2 x i1> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[VCGTZ_I]]
+//
uint64x2_t test_vcgtzq_s64(int64x2_t a) {
return vcgtzq_s64(a);
}
-// CHECK-LABEL: @test_vcgtz_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = fcmp ogt <2 x float> %a, zeroinitializer
-// CHECK: [[VCGTZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i32>
-// CHECK: ret <2 x i32> [[VCGTZ_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vcgtz_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = fcmp ogt <2 x float> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCGTZ_I:%.*]] = sext <2 x i1> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[VCGTZ_I]]
+//
uint32x2_t test_vcgtz_f32(float32x2_t a) {
return vcgtz_f32(a);
}
-// CHECK-LABEL: @test_vcgtz_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = fcmp ogt <1 x double> %a, zeroinitializer
-// CHECK: [[VCGTZ_I:%.*]] = sext <1 x i1> [[TMP1]] to <1 x i64>
-// CHECK: ret <1 x i64> [[VCGTZ_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vcgtz_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = fcmp ogt <1 x double> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCGTZ_I:%.*]] = sext <1 x i1> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[VCGTZ_I]]
+//
uint64x1_t test_vcgtz_f64(float64x1_t a) {
return vcgtz_f64(a);
}
-// CHECK-LABEL: @test_vcgtzq_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = fcmp ogt <4 x float> %a, zeroinitializer
-// CHECK: [[VCGTZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i32>
-// CHECK: ret <4 x i32> [[VCGTZ_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vcgtzq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = fcmp ogt <4 x float> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCGTZ_I:%.*]] = sext <4 x i1> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[VCGTZ_I]]
+//
uint32x4_t test_vcgtzq_f32(float32x4_t a) {
return vcgtzq_f32(a);
}
-// CHECK-LABEL: @test_vcgtzq_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = fcmp ogt <2 x double> %a, zeroinitializer
-// CHECK: [[VCGTZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i64>
-// CHECK: ret <2 x i64> [[VCGTZ_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vcgtzq_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = fcmp ogt <2 x double> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCGTZ_I:%.*]] = sext <2 x i1> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[VCGTZ_I]]
+//
uint64x2_t test_vcgtzq_f64(float64x2_t a) {
return vcgtzq_f64(a);
}
-// CHECK-LABEL: @test_vcltz_s8(
-// CHECK: [[TMP0:%.*]] = icmp slt <8 x i8> %a, zeroinitializer
-// CHECK: [[VCLTZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8>
-// CHECK: ret <8 x i8> [[VCLTZ_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vcltz_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[A_LOBIT:%.*]] = ashr <8 x i8> [[A]], splat (i8 7)
+// CHECK-NEXT: ret <8 x i8> [[A_LOBIT]]
+//
uint8x8_t test_vcltz_s8(int8x8_t a) {
return vcltz_s8(a);
}
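// For signed "< 0" the sign-extended compare folds away entirely:
// sext(icmp slt X, 0) is exactly an arithmetic shift right by
// (element bits - 1), which is why the checks match a single
// 'ashr ... splat' (named [[A_LOBIT]]) instead of an icmp + sext pair.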
-// CHECK-LABEL: @test_vcltz_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = icmp slt <4 x i16> %a, zeroinitializer
-// CHECK: [[VCLTZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16>
-// CHECK: ret <4 x i16> [[VCLTZ_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vcltz_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[A_LOBIT:%.*]] = ashr <4 x i16> [[A]], splat (i16 15)
+// CHECK-NEXT: ret <4 x i16> [[A_LOBIT]]
+//
uint16x4_t test_vcltz_s16(int16x4_t a) {
return vcltz_s16(a);
}
-// CHECK-LABEL: @test_vcltz_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = icmp slt <2 x i32> %a, zeroinitializer
-// CHECK: [[VCLTZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i32>
-// CHECK: ret <2 x i32> [[VCLTZ_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vcltz_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[A_LOBIT:%.*]] = ashr <2 x i32> [[A]], splat (i32 31)
+// CHECK-NEXT: ret <2 x i32> [[A_LOBIT]]
+//
uint32x2_t test_vcltz_s32(int32x2_t a) {
return vcltz_s32(a);
}
-// CHECK-LABEL: @test_vcltz_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = icmp slt <1 x i64> %a, zeroinitializer
-// CHECK: [[VCLTZ_I:%.*]] = sext <1 x i1> [[TMP1]] to <1 x i64>
-// CHECK: ret <1 x i64> [[VCLTZ_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vcltz_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[A_LOBIT:%.*]] = ashr <1 x i64> [[A]], splat (i64 63)
+// CHECK-NEXT: ret <1 x i64> [[A_LOBIT]]
+//
uint64x1_t test_vcltz_s64(int64x1_t a) {
return vcltz_s64(a);
}
-// CHECK-LABEL: @test_vcltzq_s8(
-// CHECK: [[TMP0:%.*]] = icmp slt <16 x i8> %a, zeroinitializer
-// CHECK: [[VCLTZ_I:%.*]] = sext <16 x i1> [[TMP0]] to <16 x i8>
-// CHECK: ret <16 x i8> [[VCLTZ_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vcltzq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[A_LOBIT:%.*]] = ashr <16 x i8> [[A]], splat (i8 7)
+// CHECK-NEXT: ret <16 x i8> [[A_LOBIT]]
+//
uint8x16_t test_vcltzq_s8(int8x16_t a) {
return vcltzq_s8(a);
}
-// CHECK-LABEL: @test_vcltzq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = icmp slt <8 x i16> %a, zeroinitializer
-// CHECK: [[VCLTZ_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i16>
-// CHECK: ret <8 x i16> [[VCLTZ_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vcltzq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[A_LOBIT:%.*]] = ashr <8 x i16> [[A]], splat (i16 15)
+// CHECK-NEXT: ret <8 x i16> [[A_LOBIT]]
+//
uint16x8_t test_vcltzq_s16(int16x8_t a) {
return vcltzq_s16(a);
}
-// CHECK-LABEL: @test_vcltzq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = icmp slt <4 x i32> %a, zeroinitializer
-// CHECK: [[VCLTZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i32>
-// CHECK: ret <4 x i32> [[VCLTZ_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vcltzq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[A_LOBIT:%.*]] = ashr <4 x i32> [[A]], splat (i32 31)
+// CHECK-NEXT: ret <4 x i32> [[A_LOBIT]]
+//
uint32x4_t test_vcltzq_s32(int32x4_t a) {
return vcltzq_s32(a);
}
-// CHECK-LABEL: @test_vcltzq_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = icmp slt <2 x i64> %a, zeroinitializer
-// CHECK: [[VCLTZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i64>
-// CHECK: ret <2 x i64> [[VCLTZ_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vcltzq_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[A_LOBIT:%.*]] = ashr <2 x i64> [[A]], splat (i64 63)
+// CHECK-NEXT: ret <2 x i64> [[A_LOBIT]]
+//
uint64x2_t test_vcltzq_s64(int64x2_t a) {
return vcltzq_s64(a);
}
-// CHECK-LABEL: @test_vcltz_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = fcmp olt <2 x float> %a, zeroinitializer
-// CHECK: [[VCLTZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i32>
-// CHECK: ret <2 x i32> [[VCLTZ_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vcltz_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = fcmp olt <2 x float> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCLTZ_I:%.*]] = sext <2 x i1> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[VCLTZ_I]]
+//
uint32x2_t test_vcltz_f32(float32x2_t a) {
return vcltz_f32(a);
}
-// CHECK-LABEL: @test_vcltz_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = fcmp olt <1 x double> %a, zeroinitializer
-// CHECK: [[VCLTZ_I:%.*]] = sext <1 x i1> [[TMP1]] to <1 x i64>
-// CHECK: ret <1 x i64> [[VCLTZ_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vcltz_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = fcmp olt <1 x double> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCLTZ_I:%.*]] = sext <1 x i1> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[VCLTZ_I]]
+//
uint64x1_t test_vcltz_f64(float64x1_t a) {
return vcltz_f64(a);
}
-// CHECK-LABEL: @test_vcltzq_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = fcmp olt <4 x float> %a, zeroinitializer
-// CHECK: [[VCLTZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i32>
-// CHECK: ret <4 x i32> [[VCLTZ_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vcltzq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = fcmp olt <4 x float> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCLTZ_I:%.*]] = sext <4 x i1> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[VCLTZ_I]]
+//
uint32x4_t test_vcltzq_f32(float32x4_t a) {
return vcltzq_f32(a);
}
-// CHECK-LABEL: @test_vcltzq_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = fcmp olt <2 x double> %a, zeroinitializer
-// CHECK: [[VCLTZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i64>
-// CHECK: ret <2 x i64> [[VCLTZ_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vcltzq_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = fcmp olt <2 x double> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCLTZ_I:%.*]] = sext <2 x i1> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[VCLTZ_I]]
+//
uint64x2_t test_vcltzq_f64(float64x2_t a) {
return vcltzq_f64(a);
}
-// CHECK-LABEL: @test_vrev16_s8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vrev16_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> poison, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
int8x8_t test_vrev16_s8(int8x8_t a) {
return vrev16_s8(a);
}
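// vrev16 reverses the bytes within each 16-bit half, hence the
// <1,0,3,2,...> shuffle mask. The second shufflevector operand is
// poison because the mask only selects lanes from the first vector.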
-// CHECK-LABEL: @test_vrev16_u8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vrev16_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> poison, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
uint8x8_t test_vrev16_u8(uint8x8_t a) {
return vrev16_u8(a);
}
-// CHECK-LABEL: @test_vrev16_p8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vrev16_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> poison, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
poly8x8_t test_vrev16_p8(poly8x8_t a) {
return vrev16_p8(a);
}
-// CHECK-LABEL: @test_vrev16q_s8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vrev16q_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> poison, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
int8x16_t test_vrev16q_s8(int8x16_t a) {
return vrev16q_s8(a);
}
-// CHECK-LABEL: @test_vrev16q_u8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vrev16q_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> poison, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
uint8x16_t test_vrev16q_u8(uint8x16_t a) {
return vrev16q_u8(a);
}
-// CHECK-LABEL: @test_vrev16q_p8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vrev16q_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> poison, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
poly8x16_t test_vrev16q_p8(poly8x16_t a) {
return vrev16q_p8(a);
}
-// CHECK-LABEL: @test_vrev32_s8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vrev32_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> poison, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
int8x8_t test_vrev32_s8(int8x8_t a) {
return vrev32_s8(a);
}
-// CHECK-LABEL: @test_vrev32_s16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-// CHECK: ret <4 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vrev32_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
+//
int16x4_t test_vrev32_s16(int16x4_t a) {
return vrev32_s16(a);
}
-// CHECK-LABEL: @test_vrev32_u8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vrev32_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> poison, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
uint8x8_t test_vrev32_u8(uint8x8_t a) {
return vrev32_u8(a);
}
-// CHECK-LABEL: @test_vrev32_u16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-// CHECK: ret <4 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vrev32_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
+//
uint16x4_t test_vrev32_u16(uint16x4_t a) {
return vrev32_u16(a);
}
-// CHECK-LABEL: @test_vrev32_p8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vrev32_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> poison, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
poly8x8_t test_vrev32_p8(poly8x8_t a) {
return vrev32_p8(a);
}
-// CHECK-LABEL: @test_vrev32_p16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-// CHECK: ret <4 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vrev32_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
+//
poly16x4_t test_vrev32_p16(poly16x4_t a) {
return vrev32_p16(a);
}
-// CHECK-LABEL: @test_vrev32q_s8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vrev32q_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> poison, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
int8x16_t test_vrev32q_s8(int8x16_t a) {
return vrev32q_s8(a);
}
-// CHECK-LABEL: @test_vrev32q_s16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vrev32q_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
int16x8_t test_vrev32q_s16(int16x8_t a) {
return vrev32q_s16(a);
}
-// CHECK-LABEL: @test_vrev32q_u8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vrev32q_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> poison, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
uint8x16_t test_vrev32q_u8(uint8x16_t a) {
return vrev32q_u8(a);
}
-// CHECK-LABEL: @test_vrev32q_u16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vrev32q_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
uint16x8_t test_vrev32q_u16(uint16x8_t a) {
return vrev32q_u16(a);
}
-// CHECK-LABEL: @test_vrev32q_p8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vrev32q_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> poison, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
poly8x16_t test_vrev32q_p8(poly8x16_t a) {
return vrev32q_p8(a);
}
-// CHECK-LABEL: @test_vrev32q_p16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vrev32q_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
poly16x8_t test_vrev32q_p16(poly16x8_t a) {
return vrev32q_p16(a);
}
-// CHECK-LABEL: @test_vrev64_s8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vrev64_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
int8x8_t test_vrev64_s8(int8x8_t a) {
return vrev64_s8(a);
}
-// CHECK-LABEL: @test_vrev64_s16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-// CHECK: ret <4 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vrev64_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
+//
int16x4_t test_vrev64_s16(int16x4_t a) {
return vrev64_s16(a);
}
-// CHECK-LABEL: @test_vrev64_s32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <2 x i32> <i32 1, i32 0>
-// CHECK: ret <2 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vrev64_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
+// CHECK-NEXT: ret <2 x i32> [[SHUFFLE_I]]
+//
int32x2_t test_vrev64_s32(int32x2_t a) {
return vrev64_s32(a);
}
-// CHECK-LABEL: @test_vrev64_u8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vrev64_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
uint8x8_t test_vrev64_u8(uint8x8_t a) {
return vrev64_u8(a);
}
-// CHECK-LABEL: @test_vrev64_u16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-// CHECK: ret <4 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vrev64_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
+//
uint16x4_t test_vrev64_u16(uint16x4_t a) {
return vrev64_u16(a);
}
-// CHECK-LABEL: @test_vrev64_u32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <2 x i32> <i32 1, i32 0>
-// CHECK: ret <2 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vrev64_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
+// CHECK-NEXT: ret <2 x i32> [[SHUFFLE_I]]
+//
uint32x2_t test_vrev64_u32(uint32x2_t a) {
return vrev64_u32(a);
}
-// CHECK-LABEL: @test_vrev64_p8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vrev64_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
poly8x8_t test_vrev64_p8(poly8x8_t a) {
return vrev64_p8(a);
}
-// CHECK-LABEL: @test_vrev64_p16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-// CHECK: ret <4 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vrev64_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
+//
poly16x4_t test_vrev64_p16(poly16x4_t a) {
return vrev64_p16(a);
}
-// CHECK-LABEL: @test_vrev64_f32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %a, <2 x i32> <i32 1, i32 0>
-// CHECK: ret <2 x float> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vrev64_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> [[A]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
+// CHECK-NEXT: ret <2 x float> [[SHUFFLE_I]]
+//
float32x2_t test_vrev64_f32(float32x2_t a) {
return vrev64_f32(a);
}
-// CHECK-LABEL: @test_vrev64q_s8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vrev64q_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> poison, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
int8x16_t test_vrev64q_s8(int8x16_t a) {
return vrev64q_s8(a);
}
-// CHECK-LABEL: @test_vrev64q_s16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vrev64q_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
int16x8_t test_vrev64q_s16(int16x8_t a) {
return vrev64q_s16(a);
}
-// CHECK-LABEL: @test_vrev64q_s32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-// CHECK: ret <4 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vrev64q_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]]
+//
int32x4_t test_vrev64q_s32(int32x4_t a) {
return vrev64q_s32(a);
}
-// CHECK-LABEL: @test_vrev64q_u8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vrev64q_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> poison, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
uint8x16_t test_vrev64q_u8(uint8x16_t a) {
return vrev64q_u8(a);
}
-// CHECK-LABEL: @test_vrev64q_u16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vrev64q_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
uint16x8_t test_vrev64q_u16(uint16x8_t a) {
return vrev64q_u16(a);
}
-// CHECK-LABEL: @test_vrev64q_u32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-// CHECK: ret <4 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vrev64q_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]]
+//
uint32x4_t test_vrev64q_u32(uint32x4_t a) {
return vrev64q_u32(a);
}
-// CHECK-LABEL: @test_vrev64q_p8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vrev64q_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> poison, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
poly8x16_t test_vrev64q_p8(poly8x16_t a) {
return vrev64q_p8(a);
}
-// CHECK-LABEL: @test_vrev64q_p16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vrev64q_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
poly16x8_t test_vrev64q_p16(poly16x8_t a) {
return vrev64q_p16(a);
}
-// CHECK-LABEL: @test_vrev64q_f32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-// CHECK: ret <4 x float> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vrev64q_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+// CHECK-NEXT: ret <4 x float> [[SHUFFLE_I]]
+//
float32x4_t test_vrev64q_f32(float32x4_t a) {
return vrev64q_f32(a);
}
-// CHECK-LABEL: @test_vpaddl_s8(
-// CHECK: [[VPADDL_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.saddlp.v4i16.v8i8(<8 x i8> %a)
-// CHECK: ret <4 x i16> [[VPADDL_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vpaddl_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPADDL_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.saddlp.v4i16.v8i8(<8 x i8> [[A]])
+// CHECK-NEXT: ret <4 x i16> [[VPADDL_I]]
+//
int16x4_t test_vpaddl_s8(int8x8_t a) {
return vpaddl_s8(a);
}
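// vpaddl maps directly to the pairwise add-long intrinsics
// llvm.aarch64.neon.{s,u}addlp, which add adjacent element pairs and
// widen the result (e.g. <8 x i8> -> <4 x i16>).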
-// CHECK-LABEL: @test_vpaddl_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VPADDL1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16> %a)
-// CHECK: ret <2 x i32> [[VPADDL1_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vpaddl_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPADDL1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16> [[A]])
+// CHECK-NEXT: ret <2 x i32> [[VPADDL1_I]]
+//
int32x2_t test_vpaddl_s16(int16x4_t a) {
return vpaddl_s16(a);
}
-// CHECK-LABEL: @test_vpaddl_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VPADDL1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.saddlp.v1i64.v2i32(<2 x i32> %a)
-// CHECK: ret <1 x i64> [[VPADDL1_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vpaddl_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPADDL1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.saddlp.v1i64.v2i32(<2 x i32> [[A]])
+// CHECK-NEXT: ret <1 x i64> [[VPADDL1_I]]
+//
int64x1_t test_vpaddl_s32(int32x2_t a) {
return vpaddl_s32(a);
}
-// CHECK-LABEL: @test_vpaddl_u8(
-// CHECK: [[VPADDL_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8> %a)
-// CHECK: ret <4 x i16> [[VPADDL_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vpaddl_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPADDL_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8> [[A]])
+// CHECK-NEXT: ret <4 x i16> [[VPADDL_I]]
+//
uint16x4_t test_vpaddl_u8(uint8x8_t a) {
return vpaddl_u8(a);
}
-// CHECK-LABEL: @test_vpaddl_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VPADDL1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uaddlp.v2i32.v4i16(<4 x i16> %a)
-// CHECK: ret <2 x i32> [[VPADDL1_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vpaddl_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPADDL1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uaddlp.v2i32.v4i16(<4 x i16> [[A]])
+// CHECK-NEXT: ret <2 x i32> [[VPADDL1_I]]
+//
uint32x2_t test_vpaddl_u16(uint16x4_t a) {
return vpaddl_u16(a);
}
-// CHECK-LABEL: @test_vpaddl_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VPADDL1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uaddlp.v1i64.v2i32(<2 x i32> %a)
-// CHECK: ret <1 x i64> [[VPADDL1_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vpaddl_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPADDL1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uaddlp.v1i64.v2i32(<2 x i32> [[A]])
+// CHECK-NEXT: ret <1 x i64> [[VPADDL1_I]]
+//
uint64x1_t test_vpaddl_u32(uint32x2_t a) {
return vpaddl_u32(a);
}
-// CHECK-LABEL: @test_vpaddlq_s8(
-// CHECK: [[VPADDL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.saddlp.v8i16.v16i8(<16 x i8> %a)
-// CHECK: ret <8 x i16> [[VPADDL_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vpaddlq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPADDL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.saddlp.v8i16.v16i8(<16 x i8> [[A]])
+// CHECK-NEXT: ret <8 x i16> [[VPADDL_I]]
+//
int16x8_t test_vpaddlq_s8(int8x16_t a) {
return vpaddlq_s8(a);
}
-// CHECK-LABEL: @test_vpaddlq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VPADDL1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16> %a)
-// CHECK: ret <4 x i32> [[VPADDL1_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vpaddlq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPADDL1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16> [[A]])
+// CHECK-NEXT: ret <4 x i32> [[VPADDL1_I]]
+//
int32x4_t test_vpaddlq_s16(int16x8_t a) {
return vpaddlq_s16(a);
}
-// CHECK-LABEL: @test_vpaddlq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VPADDL1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32> %a)
-// CHECK: ret <2 x i64> [[VPADDL1_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vpaddlq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPADDL1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32> [[A]])
+// CHECK-NEXT: ret <2 x i64> [[VPADDL1_I]]
+//
int64x2_t test_vpaddlq_s32(int32x4_t a) {
return vpaddlq_s32(a);
}
-// CHECK-LABEL: @test_vpaddlq_u8(
-// CHECK: [[VPADDL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8> %a)
-// CHECK: ret <8 x i16> [[VPADDL_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vpaddlq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPADDL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8> [[A]])
+// CHECK-NEXT: ret <8 x i16> [[VPADDL_I]]
+//
uint16x8_t test_vpaddlq_u8(uint8x16_t a) {
return vpaddlq_u8(a);
}
-// CHECK-LABEL: @test_vpaddlq_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VPADDL1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16> %a)
-// CHECK: ret <4 x i32> [[VPADDL1_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vpaddlq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPADDL1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16> [[A]])
+// CHECK-NEXT: ret <4 x i32> [[VPADDL1_I]]
+//
uint32x4_t test_vpaddlq_u16(uint16x8_t a) {
return vpaddlq_u16(a);
}
-// CHECK-LABEL: @test_vpaddlq_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VPADDL1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32> %a)
-// CHECK: ret <2 x i64> [[VPADDL1_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vpaddlq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPADDL1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32> [[A]])
+// CHECK-NEXT: ret <2 x i64> [[VPADDL1_I]]
+//
uint64x2_t test_vpaddlq_u32(uint32x4_t a) {
return vpaddlq_u32(a);
}
-// CHECK-LABEL: @test_vpadal_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VPADAL_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.saddlp.v4i16.v8i8(<8 x i8> %b)
-// CHECK: [[TMP1:%.*]] = add <4 x i16> [[VPADAL_I]], %a
-// CHECK: ret <4 x i16> [[TMP1]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vpadal_s8(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPADAL_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.saddlp.v4i16.v8i8(<8 x i8> [[B]])
+// CHECK-NEXT: [[TMP0:%.*]] = add <4 x i16> [[VPADAL_I]], [[A]]
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
int16x4_t test_vpadal_s8(int16x4_t a, int8x8_t b) {
return vpadal_s8(a, b);
}
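// vpadal (pairwise add-long and accumulate) is not a single intrinsic
// call here: it is emitted as the {s,u}addlp of the narrow operand
// followed by a plain vector add with the wide accumulator.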
-// CHECK-LABEL: @test_vpadal_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VPADAL1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16> %b)
-// CHECK: [[TMP2:%.*]] = add <2 x i32> [[VPADAL1_I]], %a
-// CHECK: ret <2 x i32> [[TMP2]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vpadal_s16(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPADAL1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16> [[B]])
+// CHECK-NEXT: [[TMP0:%.*]] = add <2 x i32> [[VPADAL1_I]], [[A]]
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
int32x2_t test_vpadal_s16(int32x2_t a, int16x4_t b) {
return vpadal_s16(a, b);
}
-// CHECK-LABEL: @test_vpadal_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VPADAL1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.saddlp.v1i64.v2i32(<2 x i32> %b)
-// CHECK: [[TMP2:%.*]] = add <1 x i64> [[VPADAL1_I]], %a
-// CHECK: ret <1 x i64> [[TMP2]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vpadal_s32(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPADAL1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.saddlp.v1i64.v2i32(<2 x i32> [[B]])
+// CHECK-NEXT: [[TMP0:%.*]] = add <1 x i64> [[VPADAL1_I]], [[A]]
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
int64x1_t test_vpadal_s32(int64x1_t a, int32x2_t b) {
return vpadal_s32(a, b);
}
-// CHECK-LABEL: @test_vpadal_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VPADAL_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8> %b)
-// CHECK: [[TMP1:%.*]] = add <4 x i16> [[VPADAL_I]], %a
-// CHECK: ret <4 x i16> [[TMP1]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vpadal_u8(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPADAL_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8> [[B]])
+// CHECK-NEXT: [[TMP0:%.*]] = add <4 x i16> [[VPADAL_I]], [[A]]
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
uint16x4_t test_vpadal_u8(uint16x4_t a, uint8x8_t b) {
return vpadal_u8(a, b);
}
-// CHECK-LABEL: @test_vpadal_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VPADAL1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uaddlp.v2i32.v4i16(<4 x i16> %b)
-// CHECK: [[TMP2:%.*]] = add <2 x i32> [[VPADAL1_I]], %a
-// CHECK: ret <2 x i32> [[TMP2]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vpadal_u16(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPADAL1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uaddlp.v2i32.v4i16(<4 x i16> [[B]])
+// CHECK-NEXT: [[TMP0:%.*]] = add <2 x i32> [[VPADAL1_I]], [[A]]
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
uint32x2_t test_vpadal_u16(uint32x2_t a, uint16x4_t b) {
return vpadal_u16(a, b);
}
-// CHECK-LABEL: @test_vpadal_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VPADAL1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uaddlp.v1i64.v2i32(<2 x i32> %b)
-// CHECK: [[TMP2:%.*]] = add <1 x i64> [[VPADAL1_I]], %a
-// CHECK: ret <1 x i64> [[TMP2]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vpadal_u32(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPADAL1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uaddlp.v1i64.v2i32(<2 x i32> [[B]])
+// CHECK-NEXT: [[TMP0:%.*]] = add <1 x i64> [[VPADAL1_I]], [[A]]
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
uint64x1_t test_vpadal_u32(uint64x1_t a, uint32x2_t b) {
return vpadal_u32(a, b);
}
-// CHECK-LABEL: @test_vpadalq_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VPADAL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.saddlp.v8i16.v16i8(<16 x i8> %b)
-// CHECK: [[TMP1:%.*]] = add <8 x i16> [[VPADAL_I]], %a
-// CHECK: ret <8 x i16> [[TMP1]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vpadalq_s8(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPADAL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.saddlp.v8i16.v16i8(<16 x i8> [[B]])
+// CHECK-NEXT: [[TMP0:%.*]] = add <8 x i16> [[VPADAL_I]], [[A]]
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
int16x8_t test_vpadalq_s8(int16x8_t a, int8x16_t b) {
return vpadalq_s8(a, b);
}
-// CHECK-LABEL: @test_vpadalq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VPADAL1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16> %b)
-// CHECK: [[TMP2:%.*]] = add <4 x i32> [[VPADAL1_I]], %a
-// CHECK: ret <4 x i32> [[TMP2]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vpadalq_s16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPADAL1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16> [[B]])
+// CHECK-NEXT: [[TMP0:%.*]] = add <4 x i32> [[VPADAL1_I]], [[A]]
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
int32x4_t test_vpadalq_s16(int32x4_t a, int16x8_t b) {
return vpadalq_s16(a, b);
}
-// CHECK-LABEL: @test_vpadalq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VPADAL1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32> %b)
-// CHECK: [[TMP2:%.*]] = add <2 x i64> [[VPADAL1_I]], %a
-// CHECK: ret <2 x i64> [[TMP2]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vpadalq_s32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPADAL1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32> [[B]])
+// CHECK-NEXT: [[TMP0:%.*]] = add <2 x i64> [[VPADAL1_I]], [[A]]
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
int64x2_t test_vpadalq_s32(int64x2_t a, int32x4_t b) {
return vpadalq_s32(a, b);
}
-// CHECK-LABEL: @test_vpadalq_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VPADAL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8> %b)
-// CHECK: [[TMP1:%.*]] = add <8 x i16> [[VPADAL_I]], %a
-// CHECK: ret <8 x i16> [[TMP1]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vpadalq_u8(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPADAL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8> [[B]])
+// CHECK-NEXT: [[TMP0:%.*]] = add <8 x i16> [[VPADAL_I]], [[A]]
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
uint16x8_t test_vpadalq_u8(uint16x8_t a, uint8x16_t b) {
return vpadalq_u8(a, b);
}
-// CHECK-LABEL: @test_vpadalq_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VPADAL1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16> %b)
-// CHECK: [[TMP2:%.*]] = add <4 x i32> [[VPADAL1_I]], %a
-// CHECK: ret <4 x i32> [[TMP2]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vpadalq_u16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPADAL1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16> [[B]])
+// CHECK-NEXT: [[TMP0:%.*]] = add <4 x i32> [[VPADAL1_I]], [[A]]
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
uint32x4_t test_vpadalq_u16(uint32x4_t a, uint16x8_t b) {
return vpadalq_u16(a, b);
}
-// CHECK-LABEL: @test_vpadalq_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VPADAL1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32> %b)
-// CHECK: [[TMP2:%.*]] = add <2 x i64> [[VPADAL1_I]], %a
-// CHECK: ret <2 x i64> [[TMP2]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vpadalq_u32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPADAL1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32> [[B]])
+// CHECK-NEXT: [[TMP0:%.*]] = add <2 x i64> [[VPADAL1_I]], [[A]]
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
uint64x2_t test_vpadalq_u32(uint64x2_t a, uint32x4_t b) {
return vpadalq_u32(a, b);
}
-// CHECK-LABEL: @test_vqabs_s8(
-// CHECK: [[VQABS_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqabs.v8i8(<8 x i8> %a)
-// CHECK: ret <8 x i8> [[VQABS_V_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vqabs_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQABS_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqabs.v8i8(<8 x i8> [[A]])
+// CHECK-NEXT: ret <8 x i8> [[VQABS_V_I]]
+//
int8x8_t test_vqabs_s8(int8x8_t a) {
return vqabs_s8(a);
}
-// CHECK-LABEL: @test_vqabsq_s8(
-// CHECK: [[VQABSQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqabs.v16i8(<16 x i8> %a)
-// CHECK: ret <16 x i8> [[VQABSQ_V_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vqabsq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQABSQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqabs.v16i8(<16 x i8> [[A]])
+// CHECK-NEXT: ret <16 x i8> [[VQABSQ_V_I]]
+//
int8x16_t test_vqabsq_s8(int8x16_t a) {
return vqabsq_s8(a);
}
-// CHECK-LABEL: @test_vqabs_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VQABS_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqabs.v4i16(<4 x i16> %a)
-// CHECK: [[VQABS_V2_I:%.*]] = bitcast <4 x i16> [[VQABS_V1_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VQABS_V1_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vqabs_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQABS_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqabs.v4i16(<4 x i16> [[A]])
+// CHECK-NEXT: ret <4 x i16> [[VQABS_V1_I]]
+//
int16x4_t test_vqabs_s16(int16x4_t a) {
return vqabs_s16(a);
}
-// CHECK-LABEL: @test_vqabsq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VQABSQ_V1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqabs.v8i16(<8 x i16> %a)
-// CHECK: [[VQABSQ_V2_I:%.*]] = bitcast <8 x i16> [[VQABSQ_V1_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VQABSQ_V1_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vqabsq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQABSQ_V1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqabs.v8i16(<8 x i16> [[A]])
+// CHECK-NEXT: ret <8 x i16> [[VQABSQ_V1_I]]
+//
int16x8_t test_vqabsq_s16(int16x8_t a) {
return vqabsq_s16(a);
}
-// CHECK-LABEL: @test_vqabs_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VQABS_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqabs.v2i32(<2 x i32> %a)
-// CHECK: [[VQABS_V2_I:%.*]] = bitcast <2 x i32> [[VQABS_V1_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VQABS_V1_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vqabs_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQABS_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqabs.v2i32(<2 x i32> [[A]])
+// CHECK-NEXT: ret <2 x i32> [[VQABS_V1_I]]
+//
int32x2_t test_vqabs_s32(int32x2_t a) {
return vqabs_s32(a);
}
-// CHECK-LABEL: @test_vqabsq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VQABSQ_V1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqabs.v4i32(<4 x i32> %a)
-// CHECK: [[VQABSQ_V2_I:%.*]] = bitcast <4 x i32> [[VQABSQ_V1_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VQABSQ_V1_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vqabsq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQABSQ_V1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqabs.v4i32(<4 x i32> [[A]])
+// CHECK-NEXT: ret <4 x i32> [[VQABSQ_V1_I]]
+//
int32x4_t test_vqabsq_s32(int32x4_t a) {
return vqabsq_s32(a);
}
-// CHECK-LABEL: @test_vqabsq_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VQABSQ_V1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqabs.v2i64(<2 x i64> %a)
-// CHECK: [[VQABSQ_V2_I:%.*]] = bitcast <2 x i64> [[VQABSQ_V1_I]] to <16 x i8>
-// CHECK: ret <2 x i64> [[VQABSQ_V1_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vqabsq_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQABSQ_V1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqabs.v2i64(<2 x i64> [[A]])
+// CHECK-NEXT: ret <2 x i64> [[VQABSQ_V1_I]]
+//
int64x2_t test_vqabsq_s64(int64x2_t a) {
return vqabsq_s64(a);
}
-// CHECK-LABEL: @test_vqneg_s8(
-// CHECK: [[VQNEG_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqneg.v8i8(<8 x i8> %a)
-// CHECK: ret <8 x i8> [[VQNEG_V_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vqneg_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQNEG_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqneg.v8i8(<8 x i8> [[A]])
+// CHECK-NEXT: ret <8 x i8> [[VQNEG_V_I]]
+//
int8x8_t test_vqneg_s8(int8x8_t a) {
return vqneg_s8(a);
}
-// CHECK-LABEL: @test_vqnegq_s8(
-// CHECK: [[VQNEGQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqneg.v16i8(<16 x i8> %a)
-// CHECK: ret <16 x i8> [[VQNEGQ_V_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vqnegq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQNEGQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqneg.v16i8(<16 x i8> [[A]])
+// CHECK-NEXT: ret <16 x i8> [[VQNEGQ_V_I]]
+//
int8x16_t test_vqnegq_s8(int8x16_t a) {
return vqnegq_s8(a);
}
-// CHECK-LABEL: @test_vqneg_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VQNEG_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqneg.v4i16(<4 x i16> %a)
-// CHECK: [[VQNEG_V2_I:%.*]] = bitcast <4 x i16> [[VQNEG_V1_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VQNEG_V1_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vqneg_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQNEG_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqneg.v4i16(<4 x i16> [[A]])
+// CHECK-NEXT: ret <4 x i16> [[VQNEG_V1_I]]
+//
int16x4_t test_vqneg_s16(int16x4_t a) {
return vqneg_s16(a);
}
-// CHECK-LABEL: @test_vqnegq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VQNEGQ_V1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqneg.v8i16(<8 x i16> %a)
-// CHECK: [[VQNEGQ_V2_I:%.*]] = bitcast <8 x i16> [[VQNEGQ_V1_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VQNEGQ_V1_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vqnegq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQNEGQ_V1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqneg.v8i16(<8 x i16> [[A]])
+// CHECK-NEXT: ret <8 x i16> [[VQNEGQ_V1_I]]
+//
int16x8_t test_vqnegq_s16(int16x8_t a) {
return vqnegq_s16(a);
}
-// CHECK-LABEL: @test_vqneg_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VQNEG_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqneg.v2i32(<2 x i32> %a)
-// CHECK: [[VQNEG_V2_I:%.*]] = bitcast <2 x i32> [[VQNEG_V1_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VQNEG_V1_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vqneg_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQNEG_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqneg.v2i32(<2 x i32> [[A]])
+// CHECK-NEXT: ret <2 x i32> [[VQNEG_V1_I]]
+//
int32x2_t test_vqneg_s32(int32x2_t a) {
return vqneg_s32(a);
}
-// CHECK-LABEL: @test_vqnegq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VQNEGQ_V1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqneg.v4i32(<4 x i32> %a)
-// CHECK: [[VQNEGQ_V2_I:%.*]] = bitcast <4 x i32> [[VQNEGQ_V1_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VQNEGQ_V1_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vqnegq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQNEGQ_V1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqneg.v4i32(<4 x i32> [[A]])
+// CHECK-NEXT: ret <4 x i32> [[VQNEGQ_V1_I]]
+//
int32x4_t test_vqnegq_s32(int32x4_t a) {
return vqnegq_s32(a);
}
-// CHECK-LABEL: @test_vqnegq_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VQNEGQ_V1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqneg.v2i64(<2 x i64> %a)
-// CHECK: [[VQNEGQ_V2_I:%.*]] = bitcast <2 x i64> [[VQNEGQ_V1_I]] to <16 x i8>
-// CHECK: ret <2 x i64> [[VQNEGQ_V1_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vqnegq_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQNEGQ_V1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqneg.v2i64(<2 x i64> [[A]])
+// CHECK-NEXT: ret <2 x i64> [[VQNEGQ_V1_I]]
+//
int64x2_t test_vqnegq_s64(int64x2_t a) {
return vqnegq_s64(a);
}
-// CHECK-LABEL: @test_vneg_s8(
-// CHECK: [[SUB_I:%.*]] = sub <8 x i8> zeroinitializer, %a
-// CHECK: ret <8 x i8> [[SUB_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vneg_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i8> zeroinitializer, [[A]]
+// CHECK-NEXT: ret <8 x i8> [[SUB_I]]
+//
int8x8_t test_vneg_s8(int8x8_t a) {
return vneg_s8(a);
}
-// CHECK-LABEL: @test_vnegq_s8(
-// CHECK: [[SUB_I:%.*]] = sub <16 x i8> zeroinitializer, %a
-// CHECK: ret <16 x i8> [[SUB_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vnegq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <16 x i8> zeroinitializer, [[A]]
+// CHECK-NEXT: ret <16 x i8> [[SUB_I]]
+//
int8x16_t test_vnegq_s8(int8x16_t a) {
return vnegq_s8(a);
}
-// CHECK-LABEL: @test_vneg_s16(
-// CHECK: [[SUB_I:%.*]] = sub <4 x i16> zeroinitializer, %a
-// CHECK: ret <4 x i16> [[SUB_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vneg_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i16> zeroinitializer, [[A]]
+// CHECK-NEXT: ret <4 x i16> [[SUB_I]]
+//
int16x4_t test_vneg_s16(int16x4_t a) {
return vneg_s16(a);
}
-// CHECK-LABEL: @test_vnegq_s16(
-// CHECK: [[SUB_I:%.*]] = sub <8 x i16> zeroinitializer, %a
-// CHECK: ret <8 x i16> [[SUB_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vnegq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> zeroinitializer, [[A]]
+// CHECK-NEXT: ret <8 x i16> [[SUB_I]]
+//
int16x8_t test_vnegq_s16(int16x8_t a) {
return vnegq_s16(a);
}
-// CHECK-LABEL: @test_vneg_s32(
-// CHECK: [[SUB_I:%.*]] = sub <2 x i32> zeroinitializer, %a
-// CHECK: ret <2 x i32> [[SUB_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vneg_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i32> zeroinitializer, [[A]]
+// CHECK-NEXT: ret <2 x i32> [[SUB_I]]
+//
int32x2_t test_vneg_s32(int32x2_t a) {
return vneg_s32(a);
}
-// CHECK-LABEL: @test_vnegq_s32(
-// CHECK: [[SUB_I:%.*]] = sub <4 x i32> zeroinitializer, %a
-// CHECK: ret <4 x i32> [[SUB_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vnegq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> zeroinitializer, [[A]]
+// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
+//
int32x4_t test_vnegq_s32(int32x4_t a) {
return vnegq_s32(a);
}
-// CHECK-LABEL: @test_vnegq_s64(
-// CHECK: [[SUB_I:%.*]] = sub <2 x i64> zeroinitializer, %a
-// CHECK: ret <2 x i64> [[SUB_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vnegq_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> zeroinitializer, [[A]]
+// CHECK-NEXT: ret <2 x i64> [[SUB_I]]
+//
int64x2_t test_vnegq_s64(int64x2_t a) {
return vnegq_s64(a);
}
-// CHECK-LABEL: @test_vneg_f32(
-// CHECK: [[SUB_I:%.*]] = fneg <2 x float> %a
-// CHECK: ret <2 x float> [[SUB_I]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vneg_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[FNEG_I:%.*]] = fneg <2 x float> [[A]]
+// CHECK-NEXT: ret <2 x float> [[FNEG_I]]
+//
float32x2_t test_vneg_f32(float32x2_t a) {
return vneg_f32(a);
}
-// CHECK-LABEL: @test_vnegq_f32(
-// CHECK: [[SUB_I:%.*]] = fneg <4 x float> %a
-// CHECK: ret <4 x float> [[SUB_I]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vnegq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[FNEG_I:%.*]] = fneg <4 x float> [[A]]
+// CHECK-NEXT: ret <4 x float> [[FNEG_I]]
+//
float32x4_t test_vnegq_f32(float32x4_t a) {
return vnegq_f32(a);
}
-// CHECK-LABEL: @test_vnegq_f64(
-// CHECK: [[SUB_I:%.*]] = fneg <2 x double> %a
-// CHECK: ret <2 x double> [[SUB_I]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vnegq_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[FNEG_I:%.*]] = fneg <2 x double> [[A]]
+// CHECK-NEXT: ret <2 x double> [[FNEG_I]]
+//
float64x2_t test_vnegq_f64(float64x2_t a) {
return vnegq_f64(a);
}
-// CHECK-LABEL: @test_vabs_s8(
-// CHECK: [[VABS_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.abs.v8i8(<8 x i8> %a)
-// CHECK: ret <8 x i8> [[VABS_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vabs_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABS_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.abs.v8i8(<8 x i8> [[A]])
+// CHECK-NEXT: ret <8 x i8> [[VABS_I]]
+//
int8x8_t test_vabs_s8(int8x8_t a) {
return vabs_s8(a);
}
-// CHECK-LABEL: @test_vabsq_s8(
-// CHECK: [[VABS_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.abs.v16i8(<16 x i8> %a)
-// CHECK: ret <16 x i8> [[VABS_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vabsq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABS_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.abs.v16i8(<16 x i8> [[A]])
+// CHECK-NEXT: ret <16 x i8> [[VABS_I]]
+//
int8x16_t test_vabsq_s8(int8x16_t a) {
return vabsq_s8(a);
}
-// CHECK-LABEL: @test_vabs_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VABS1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.abs.v4i16(<4 x i16> %a)
-// CHECK: ret <4 x i16> [[VABS1_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vabs_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABS1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.abs.v4i16(<4 x i16> [[A]])
+// CHECK-NEXT: ret <4 x i16> [[VABS1_I]]
+//
int16x4_t test_vabs_s16(int16x4_t a) {
return vabs_s16(a);
}
-// CHECK-LABEL: @test_vabsq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VABS1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.abs.v8i16(<8 x i16> %a)
-// CHECK: ret <8 x i16> [[VABS1_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vabsq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABS1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.abs.v8i16(<8 x i16> [[A]])
+// CHECK-NEXT: ret <8 x i16> [[VABS1_I]]
+//
int16x8_t test_vabsq_s16(int16x8_t a) {
return vabsq_s16(a);
}
-// CHECK-LABEL: @test_vabs_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VABS1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.abs.v2i32(<2 x i32> %a)
-// CHECK: ret <2 x i32> [[VABS1_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vabs_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABS1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.abs.v2i32(<2 x i32> [[A]])
+// CHECK-NEXT: ret <2 x i32> [[VABS1_I]]
+//
int32x2_t test_vabs_s32(int32x2_t a) {
return vabs_s32(a);
}
-// CHECK-LABEL: @test_vabsq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VABS1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.abs.v4i32(<4 x i32> %a)
-// CHECK: ret <4 x i32> [[VABS1_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vabsq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABS1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.abs.v4i32(<4 x i32> [[A]])
+// CHECK-NEXT: ret <4 x i32> [[VABS1_I]]
+//
int32x4_t test_vabsq_s32(int32x4_t a) {
return vabsq_s32(a);
}
-// CHECK-LABEL: @test_vabsq_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VABS1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.abs.v2i64(<2 x i64> %a)
-// CHECK: ret <2 x i64> [[VABS1_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vabsq_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABS1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.abs.v2i64(<2 x i64> [[A]])
+// CHECK-NEXT: ret <2 x i64> [[VABS1_I]]
+//
int64x2_t test_vabsq_s64(int64x2_t a) {
return vabsq_s64(a);
}
-// CHECK-LABEL: @test_vabs_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[VABS1_I:%.*]] = call <2 x float> @llvm.fabs.v2f32(<2 x float> %a)
-// CHECK: ret <2 x float> [[VABS1_I]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vabs_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABS1_I:%.*]] = call <2 x float> @llvm.fabs.v2f32(<2 x float> [[A]])
+// CHECK-NEXT: ret <2 x float> [[VABS1_I]]
+//
float32x2_t test_vabs_f32(float32x2_t a) {
return vabs_f32(a);
}
-// CHECK-LABEL: @test_vabsq_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[VABS1_I:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> %a)
-// CHECK: ret <4 x float> [[VABS1_I]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vabsq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABS1_I:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[A]])
+// CHECK-NEXT: ret <4 x float> [[VABS1_I]]
+//
float32x4_t test_vabsq_f32(float32x4_t a) {
return vabsq_f32(a);
}
-// CHECK-LABEL: @test_vabsq_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[VABS1_I:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> %a)
-// CHECK: ret <2 x double> [[VABS1_I]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vabsq_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABS1_I:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[A]])
+// CHECK-NEXT: ret <2 x double> [[VABS1_I]]
+//
float64x2_t test_vabsq_f64(float64x2_t a) {
return vabsq_f64(a);
}
-// CHECK-LABEL: @test_vuqadd_s8(
-// CHECK: [[VUQADD_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.suqadd.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VUQADD_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vuqadd_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VUQADD_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.suqadd.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VUQADD_I]]
+//
int8x8_t test_vuqadd_s8(int8x8_t a, int8x8_t b) {
return vuqadd_s8(a, b);
}
-// CHECK-LABEL: @test_vuqaddq_s8(
-// CHECK: [[VUQADD_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.suqadd.v16i8(<16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <16 x i8> [[VUQADD_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vuqaddq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VUQADD_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.suqadd.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <16 x i8> [[VUQADD_I]]
+//
int8x16_t test_vuqaddq_s8(int8x16_t a, int8x16_t b) {
return vuqaddq_s8(a, b);
}
-// CHECK-LABEL: @test_vuqadd_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VUQADD2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.suqadd.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: ret <4 x i16> [[VUQADD2_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vuqadd_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VUQADD2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.suqadd.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: ret <4 x i16> [[VUQADD2_I]]
+//
int16x4_t test_vuqadd_s16(int16x4_t a, int16x4_t b) {
return vuqadd_s16(a, b);
}
-// CHECK-LABEL: @test_vuqaddq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VUQADD2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.suqadd.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: ret <8 x i16> [[VUQADD2_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vuqaddq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VUQADD2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.suqadd.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: ret <8 x i16> [[VUQADD2_I]]
+//
int16x8_t test_vuqaddq_s16(int16x8_t a, int16x8_t b) {
return vuqaddq_s16(a, b);
}
-// CHECK-LABEL: @test_vuqadd_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VUQADD2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.suqadd.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: ret <2 x i32> [[VUQADD2_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vuqadd_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VUQADD2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.suqadd.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: ret <2 x i32> [[VUQADD2_I]]
+//
int32x2_t test_vuqadd_s32(int32x2_t a, int32x2_t b) {
return vuqadd_s32(a, b);
}
-// CHECK-LABEL: @test_vuqaddq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VUQADD2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.suqadd.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: ret <4 x i32> [[VUQADD2_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vuqaddq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VUQADD2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.suqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: ret <4 x i32> [[VUQADD2_I]]
+//
int32x4_t test_vuqaddq_s32(int32x4_t a, int32x4_t b) {
return vuqaddq_s32(a, b);
}
-// CHECK-LABEL: @test_vuqaddq_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VUQADD2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.suqadd.v2i64(<2 x i64> %a, <2 x i64> %b)
-// CHECK: ret <2 x i64> [[VUQADD2_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vuqaddq_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VUQADD2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.suqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[B]])
+// CHECK-NEXT: ret <2 x i64> [[VUQADD2_I]]
+//
int64x2_t test_vuqaddq_s64(int64x2_t a, int64x2_t b) {
return vuqaddq_s64(a, b);
}
-// CHECK-LABEL: @test_vcls_s8(
-// CHECK: [[VCLS_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.cls.v8i8(<8 x i8> %a)
-// CHECK: ret <8 x i8> [[VCLS_V_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vcls_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCLS_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.cls.v8i8(<8 x i8> [[A]])
+// CHECK-NEXT: ret <8 x i8> [[VCLS_V_I]]
+//
int8x8_t test_vcls_s8(int8x8_t a) {
return vcls_s8(a);
}
-// CHECK-LABEL: @test_vcls_u8(
-// CHECK: [[VCLS_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.cls.v8i8(<8 x i8> %a)
-// CHECK: ret <8 x i8> [[VCLS_V_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vcls_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCLS_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.cls.v8i8(<8 x i8> [[A]])
+// CHECK-NEXT: ret <8 x i8> [[VCLS_V_I]]
+//
int8x8_t test_vcls_u8(uint8x8_t a) {
return vcls_u8(a);
}
-// CHECK-LABEL: @test_vclsq_s8(
-// CHECK: [[VCLSQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.cls.v16i8(<16 x i8> %a)
-// CHECK: ret <16 x i8> [[VCLSQ_V_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vclsq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCLSQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.cls.v16i8(<16 x i8> [[A]])
+// CHECK-NEXT: ret <16 x i8> [[VCLSQ_V_I]]
+//
int8x16_t test_vclsq_s8(int8x16_t a) {
return vclsq_s8(a);
}
-// CHECK-LABEL: @test_vclsq_u8(
-// CHECK: [[VCLSQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.cls.v16i8(<16 x i8> %a)
-// CHECK: ret <16 x i8> [[VCLSQ_V_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vclsq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCLSQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.cls.v16i8(<16 x i8> [[A]])
+// CHECK-NEXT: ret <16 x i8> [[VCLSQ_V_I]]
+//
int8x16_t test_vclsq_u8(uint8x16_t a) {
return vclsq_u8(a);
}
-// CHECK-LABEL: @test_vcls_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VCLS_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.cls.v4i16(<4 x i16> %a)
-// CHECK: [[VCLS_V2_I:%.*]] = bitcast <4 x i16> [[VCLS_V1_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VCLS_V1_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vcls_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCLS_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.cls.v4i16(<4 x i16> [[A]])
+// CHECK-NEXT: ret <4 x i16> [[VCLS_V1_I]]
+//
int16x4_t test_vcls_s16(int16x4_t a) {
return vcls_s16(a);
}
-// CHECK-LABEL: @test_vcls_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VCLS_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.cls.v4i16(<4 x i16> %a)
-// CHECK: [[VCLS_V2_I:%.*]] = bitcast <4 x i16> [[VCLS_V1_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VCLS_V1_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vcls_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCLS_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.cls.v4i16(<4 x i16> [[A]])
+// CHECK-NEXT: ret <4 x i16> [[VCLS_V1_I]]
+//
int16x4_t test_vcls_u16(uint16x4_t a) {
return vcls_u16(a);
}
-// CHECK-LABEL: @test_vclsq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VCLSQ_V1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.cls.v8i16(<8 x i16> %a)
-// CHECK: [[VCLSQ_V2_I:%.*]] = bitcast <8 x i16> [[VCLSQ_V1_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VCLSQ_V1_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vclsq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCLSQ_V1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.cls.v8i16(<8 x i16> [[A]])
+// CHECK-NEXT: ret <8 x i16> [[VCLSQ_V1_I]]
+//
int16x8_t test_vclsq_s16(int16x8_t a) {
return vclsq_s16(a);
}
-// CHECK-LABEL: @test_vclsq_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VCLSQ_V1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.cls.v8i16(<8 x i16> %a)
-// CHECK: [[VCLSQ_V2_I:%.*]] = bitcast <8 x i16> [[VCLSQ_V1_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VCLSQ_V1_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vclsq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCLSQ_V1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.cls.v8i16(<8 x i16> [[A]])
+// CHECK-NEXT: ret <8 x i16> [[VCLSQ_V1_I]]
+//
int16x8_t test_vclsq_u16(uint16x8_t a) {
return vclsq_u16(a);
}
-// CHECK-LABEL: @test_vcls_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VCLS_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.cls.v2i32(<2 x i32> %a)
-// CHECK: [[VCLS_V2_I:%.*]] = bitcast <2 x i32> [[VCLS_V1_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VCLS_V1_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vcls_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCLS_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.cls.v2i32(<2 x i32> [[A]])
+// CHECK-NEXT: ret <2 x i32> [[VCLS_V1_I]]
+//
int32x2_t test_vcls_s32(int32x2_t a) {
return vcls_s32(a);
}
-// CHECK-LABEL: @test_vcls_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VCLS_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.cls.v2i32(<2 x i32> %a)
-// CHECK: [[VCLS_V2_I:%.*]] = bitcast <2 x i32> [[VCLS_V1_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VCLS_V1_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vcls_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCLS_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.cls.v2i32(<2 x i32> [[A]])
+// CHECK-NEXT: ret <2 x i32> [[VCLS_V1_I]]
+//
int32x2_t test_vcls_u32(uint32x2_t a) {
return vcls_u32(a);
}
-// CHECK-LABEL: @test_vclsq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VCLSQ_V1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.cls.v4i32(<4 x i32> %a)
-// CHECK: [[VCLSQ_V2_I:%.*]] = bitcast <4 x i32> [[VCLSQ_V1_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VCLSQ_V1_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vclsq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCLSQ_V1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.cls.v4i32(<4 x i32> [[A]])
+// CHECK-NEXT: ret <4 x i32> [[VCLSQ_V1_I]]
+//
int32x4_t test_vclsq_s32(int32x4_t a) {
return vclsq_s32(a);
}
-// CHECK-LABEL: @test_vclsq_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VCLSQ_V1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.cls.v4i32(<4 x i32> %a)
-// CHECK: [[VCLSQ_V2_I:%.*]] = bitcast <4 x i32> [[VCLSQ_V1_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VCLSQ_V1_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vclsq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCLSQ_V1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.cls.v4i32(<4 x i32> [[A]])
+// CHECK-NEXT: ret <4 x i32> [[VCLSQ_V1_I]]
+//
int32x4_t test_vclsq_u32(uint32x4_t a) {
return vclsq_u32(a);
}
-// CHECK-LABEL: @test_vclz_s8(
-// CHECK: [[VCLZ_V_I:%.*]] = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %a, i1 false)
-// CHECK: ret <8 x i8> [[VCLZ_V_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vclz_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCLZ_V_I:%.*]] = call range(i8 0, 9) <8 x i8> @llvm.ctlz.v8i8(<8 x i8> [[A]], i1 false)
+// CHECK-NEXT: ret <8 x i8> [[VCLZ_V_I]]
+//
int8x8_t test_vclz_s8(int8x8_t a) {
return vclz_s8(a);
}
-// CHECK-LABEL: @test_vclzq_s8(
-// CHECK: [[VCLZQ_V_I:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 false)
-// CHECK: ret <16 x i8> [[VCLZQ_V_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vclzq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCLZQ_V_I:%.*]] = call range(i8 0, 9) <16 x i8> @llvm.ctlz.v16i8(<16 x i8> [[A]], i1 false)
+// CHECK-NEXT: ret <16 x i8> [[VCLZQ_V_I]]
+//
int8x16_t test_vclzq_s8(int8x16_t a) {
return vclzq_s8(a);
}
-// CHECK-LABEL: @test_vclz_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> %a, i1 false)
-// CHECK: [[VCLZ_V2_I:%.*]] = bitcast <4 x i16> [[VCLZ_V1_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VCLZ_V1_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vclz_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCLZ_V1_I:%.*]] = call range(i16 0, 17) <4 x i16> @llvm.ctlz.v4i16(<4 x i16> [[A]], i1 false)
+// CHECK-NEXT: ret <4 x i16> [[VCLZ_V1_I]]
+//
int16x4_t test_vclz_s16(int16x4_t a) {
return vclz_s16(a);
}
-// CHECK-LABEL: @test_vclzq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VCLZQ_V1_I:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 false)
-// CHECK: [[VCLZQ_V2_I:%.*]] = bitcast <8 x i16> [[VCLZQ_V1_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VCLZQ_V1_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vclzq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCLZQ_V1_I:%.*]] = call range(i16 0, 17) <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[A]], i1 false)
+// CHECK-NEXT: ret <8 x i16> [[VCLZQ_V1_I]]
+//
int16x8_t test_vclzq_s16(int16x8_t a) {
return vclzq_s16(a);
}
-// CHECK-LABEL: @test_vclz_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %a, i1 false)
-// CHECK: [[VCLZ_V2_I:%.*]] = bitcast <2 x i32> [[VCLZ_V1_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VCLZ_V1_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vclz_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCLZ_V1_I:%.*]] = call range(i32 0, 33) <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[A]], i1 false)
+// CHECK-NEXT: ret <2 x i32> [[VCLZ_V1_I]]
+//
int32x2_t test_vclz_s32(int32x2_t a) {
return vclz_s32(a);
}
-// CHECK-LABEL: @test_vclzq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VCLZQ_V1_I:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 false)
-// CHECK: [[VCLZQ_V2_I:%.*]] = bitcast <4 x i32> [[VCLZQ_V1_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VCLZQ_V1_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vclzq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCLZQ_V1_I:%.*]] = call range(i32 0, 33) <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[A]], i1 false)
+// CHECK-NEXT: ret <4 x i32> [[VCLZQ_V1_I]]
+//
int32x4_t test_vclzq_s32(int32x4_t a) {
return vclzq_s32(a);
}
-// CHECK-LABEL: @test_vclz_u8(
-// CHECK: [[VCLZ_V_I:%.*]] = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %a, i1 false)
-// CHECK: ret <8 x i8> [[VCLZ_V_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vclz_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCLZ_V_I:%.*]] = call range(i8 0, 9) <8 x i8> @llvm.ctlz.v8i8(<8 x i8> [[A]], i1 false)
+// CHECK-NEXT: ret <8 x i8> [[VCLZ_V_I]]
+//
uint8x8_t test_vclz_u8(uint8x8_t a) {
return vclz_u8(a);
}
-// CHECK-LABEL: @test_vclzq_u8(
-// CHECK: [[VCLZQ_V_I:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 false)
-// CHECK: ret <16 x i8> [[VCLZQ_V_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vclzq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCLZQ_V_I:%.*]] = call range(i8 0, 9) <16 x i8> @llvm.ctlz.v16i8(<16 x i8> [[A]], i1 false)
+// CHECK-NEXT: ret <16 x i8> [[VCLZQ_V_I]]
+//
uint8x16_t test_vclzq_u8(uint8x16_t a) {
return vclzq_u8(a);
}
-// CHECK-LABEL: @test_vclz_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> %a, i1 false)
-// CHECK: [[VCLZ_V2_I:%.*]] = bitcast <4 x i16> [[VCLZ_V1_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VCLZ_V1_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vclz_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCLZ_V1_I:%.*]] = call range(i16 0, 17) <4 x i16> @llvm.ctlz.v4i16(<4 x i16> [[A]], i1 false)
+// CHECK-NEXT: ret <4 x i16> [[VCLZ_V1_I]]
+//
uint16x4_t test_vclz_u16(uint16x4_t a) {
return vclz_u16(a);
}
-// CHECK-LABEL: @test_vclzq_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VCLZQ_V1_I:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 false)
-// CHECK: [[VCLZQ_V2_I:%.*]] = bitcast <8 x i16> [[VCLZQ_V1_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VCLZQ_V1_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vclzq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCLZQ_V1_I:%.*]] = call range(i16 0, 17) <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[A]], i1 false)
+// CHECK-NEXT: ret <8 x i16> [[VCLZQ_V1_I]]
+//
uint16x8_t test_vclzq_u16(uint16x8_t a) {
return vclzq_u16(a);
}
-// CHECK-LABEL: @test_vclz_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %a, i1 false)
-// CHECK: [[VCLZ_V2_I:%.*]] = bitcast <2 x i32> [[VCLZ_V1_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VCLZ_V1_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vclz_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCLZ_V1_I:%.*]] = call range(i32 0, 33) <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[A]], i1 false)
+// CHECK-NEXT: ret <2 x i32> [[VCLZ_V1_I]]
+//
uint32x2_t test_vclz_u32(uint32x2_t a) {
return vclz_u32(a);
}
-// CHECK-LABEL: @test_vclzq_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VCLZQ_V1_I:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 false)
-// CHECK: [[VCLZQ_V2_I:%.*]] = bitcast <4 x i32> [[VCLZQ_V1_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VCLZQ_V1_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vclzq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCLZQ_V1_I:%.*]] = call range(i32 0, 33) <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[A]], i1 false)
+// CHECK-NEXT: ret <4 x i32> [[VCLZQ_V1_I]]
+//
uint32x4_t test_vclzq_u32(uint32x4_t a) {
return vclzq_u32(a);
}
-// CHECK-LABEL: @test_vcnt_s8(
-// CHECK: [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %a)
-// CHECK: ret <8 x i8> [[VCNT_V_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vcnt_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCNT_V_I:%.*]] = call range(i8 0, 9) <8 x i8> @llvm.ctpop.v8i8(<8 x i8> [[A]])
+// CHECK-NEXT: ret <8 x i8> [[VCNT_V_I]]
+//
int8x8_t test_vcnt_s8(int8x8_t a) {
return vcnt_s8(a);
}
-// CHECK-LABEL: @test_vcntq_s8(
-// CHECK: [[VCNTQ_V_I:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a)
-// CHECK: ret <16 x i8> [[VCNTQ_V_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vcntq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCNTQ_V_I:%.*]] = call range(i8 0, 9) <16 x i8> @llvm.ctpop.v16i8(<16 x i8> [[A]])
+// CHECK-NEXT: ret <16 x i8> [[VCNTQ_V_I]]
+//
int8x16_t test_vcntq_s8(int8x16_t a) {
return vcntq_s8(a);
}
-// CHECK-LABEL: @test_vcnt_u8(
-// CHECK: [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %a)
-// CHECK: ret <8 x i8> [[VCNT_V_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vcnt_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCNT_V_I:%.*]] = call range(i8 0, 9) <8 x i8> @llvm.ctpop.v8i8(<8 x i8> [[A]])
+// CHECK-NEXT: ret <8 x i8> [[VCNT_V_I]]
+//
uint8x8_t test_vcnt_u8(uint8x8_t a) {
return vcnt_u8(a);
}
-// CHECK-LABEL: @test_vcntq_u8(
-// CHECK: [[VCNTQ_V_I:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a)
-// CHECK: ret <16 x i8> [[VCNTQ_V_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vcntq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCNTQ_V_I:%.*]] = call range(i8 0, 9) <16 x i8> @llvm.ctpop.v16i8(<16 x i8> [[A]])
+// CHECK-NEXT: ret <16 x i8> [[VCNTQ_V_I]]
+//
uint8x16_t test_vcntq_u8(uint8x16_t a) {
return vcntq_u8(a);
}
-// CHECK-LABEL: @test_vcnt_p8(
-// CHECK: [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %a)
-// CHECK: ret <8 x i8> [[VCNT_V_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vcnt_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCNT_V_I:%.*]] = call range(i8 0, 9) <8 x i8> @llvm.ctpop.v8i8(<8 x i8> [[A]])
+// CHECK-NEXT: ret <8 x i8> [[VCNT_V_I]]
+//
poly8x8_t test_vcnt_p8(poly8x8_t a) {
return vcnt_p8(a);
}
-// CHECK-LABEL: @test_vcntq_p8(
-// CHECK: [[VCNTQ_V_I:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a)
-// CHECK: ret <16 x i8> [[VCNTQ_V_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vcntq_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCNTQ_V_I:%.*]] = call range(i8 0, 9) <16 x i8> @llvm.ctpop.v16i8(<16 x i8> [[A]])
+// CHECK-NEXT: ret <16 x i8> [[VCNTQ_V_I]]
+//
poly8x16_t test_vcntq_p8(poly8x16_t a) {
return vcntq_p8(a);
}
-// CHECK-LABEL: @test_vmvn_s8(
-// CHECK: [[NEG_I:%.*]] = xor <8 x i8> %a, splat (i8 -1)
-// CHECK: ret <8 x i8> [[NEG_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vmvn_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <8 x i8> [[A]], splat (i8 -1)
+// CHECK-NEXT: ret <8 x i8> [[NOT_I]]
+//
int8x8_t test_vmvn_s8(int8x8_t a) {
return vmvn_s8(a);
}
-// CHECK-LABEL: @test_vmvnq_s8(
-// CHECK: [[NEG_I:%.*]] = xor <16 x i8> %a, splat (i8 -1)
-// CHECK: ret <16 x i8> [[NEG_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vmvnq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <16 x i8> [[A]], splat (i8 -1)
+// CHECK-NEXT: ret <16 x i8> [[NOT_I]]
+//
int8x16_t test_vmvnq_s8(int8x16_t a) {
return vmvnq_s8(a);
}
-// CHECK-LABEL: @test_vmvn_s16(
-// CHECK: [[NEG_I:%.*]] = xor <4 x i16> %a, splat (i16 -1)
-// CHECK: ret <4 x i16> [[NEG_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vmvn_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <4 x i16> [[A]], splat (i16 -1)
+// CHECK-NEXT: ret <4 x i16> [[NOT_I]]
+//
int16x4_t test_vmvn_s16(int16x4_t a) {
return vmvn_s16(a);
}
-// CHECK-LABEL: @test_vmvnq_s16(
-// CHECK: [[NEG_I:%.*]] = xor <8 x i16> %a, splat (i16 -1)
-// CHECK: ret <8 x i16> [[NEG_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vmvnq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <8 x i16> [[A]], splat (i16 -1)
+// CHECK-NEXT: ret <8 x i16> [[NOT_I]]
+//
int16x8_t test_vmvnq_s16(int16x8_t a) {
return vmvnq_s16(a);
}
-// CHECK-LABEL: @test_vmvn_s32(
-// CHECK: [[NEG_I:%.*]] = xor <2 x i32> %a, splat (i32 -1)
-// CHECK: ret <2 x i32> [[NEG_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vmvn_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <2 x i32> [[A]], splat (i32 -1)
+// CHECK-NEXT: ret <2 x i32> [[NOT_I]]
+//
int32x2_t test_vmvn_s32(int32x2_t a) {
return vmvn_s32(a);
}
-// CHECK-LABEL: @test_vmvnq_s32(
-// CHECK: [[NEG_I:%.*]] = xor <4 x i32> %a, splat (i32 -1)
-// CHECK: ret <4 x i32> [[NEG_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vmvnq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <4 x i32> [[A]], splat (i32 -1)
+// CHECK-NEXT: ret <4 x i32> [[NOT_I]]
+//
int32x4_t test_vmvnq_s32(int32x4_t a) {
return vmvnq_s32(a);
}
-// CHECK-LABEL: @test_vmvn_u8(
-// CHECK: [[NEG_I:%.*]] = xor <8 x i8> %a, splat (i8 -1)
-// CHECK: ret <8 x i8> [[NEG_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vmvn_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <8 x i8> [[A]], splat (i8 -1)
+// CHECK-NEXT: ret <8 x i8> [[NOT_I]]
+//
uint8x8_t test_vmvn_u8(uint8x8_t a) {
return vmvn_u8(a);
}
-// CHECK-LABEL: @test_vmvnq_u8(
-// CHECK: [[NEG_I:%.*]] = xor <16 x i8> %a, splat (i8 -1)
-// CHECK: ret <16 x i8> [[NEG_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vmvnq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <16 x i8> [[A]], splat (i8 -1)
+// CHECK-NEXT: ret <16 x i8> [[NOT_I]]
+//
uint8x16_t test_vmvnq_u8(uint8x16_t a) {
return vmvnq_u8(a);
}
-// CHECK-LABEL: @test_vmvn_u16(
-// CHECK: [[NEG_I:%.*]] = xor <4 x i16> %a, splat (i16 -1)
-// CHECK: ret <4 x i16> [[NEG_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vmvn_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <4 x i16> [[A]], splat (i16 -1)
+// CHECK-NEXT: ret <4 x i16> [[NOT_I]]
+//
uint16x4_t test_vmvn_u16(uint16x4_t a) {
return vmvn_u16(a);
}
-// CHECK-LABEL: @test_vmvnq_u16(
-// CHECK: [[NEG_I:%.*]] = xor <8 x i16> %a, splat (i16 -1)
-// CHECK: ret <8 x i16> [[NEG_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vmvnq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <8 x i16> [[A]], splat (i16 -1)
+// CHECK-NEXT: ret <8 x i16> [[NOT_I]]
+//
uint16x8_t test_vmvnq_u16(uint16x8_t a) {
return vmvnq_u16(a);
}
-// CHECK-LABEL: @test_vmvn_u32(
-// CHECK: [[NEG_I:%.*]] = xor <2 x i32> %a, splat (i32 -1)
-// CHECK: ret <2 x i32> [[NEG_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vmvn_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <2 x i32> [[A]], splat (i32 -1)
+// CHECK-NEXT: ret <2 x i32> [[NOT_I]]
+//
uint32x2_t test_vmvn_u32(uint32x2_t a) {
return vmvn_u32(a);
}
-// CHECK-LABEL: @test_vmvnq_u32(
-// CHECK: [[NEG_I:%.*]] = xor <4 x i32> %a, splat (i32 -1)
-// CHECK: ret <4 x i32> [[NEG_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vmvnq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <4 x i32> [[A]], splat (i32 -1)
+// CHECK-NEXT: ret <4 x i32> [[NOT_I]]
+//
uint32x4_t test_vmvnq_u32(uint32x4_t a) {
return vmvnq_u32(a);
}
-// CHECK-LABEL: @test_vmvn_p8(
-// CHECK: [[NEG_I:%.*]] = xor <8 x i8> %a, splat (i8 -1)
-// CHECK: ret <8 x i8> [[NEG_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vmvn_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <8 x i8> [[A]], splat (i8 -1)
+// CHECK-NEXT: ret <8 x i8> [[NOT_I]]
+//
poly8x8_t test_vmvn_p8(poly8x8_t a) {
return vmvn_p8(a);
}
-// CHECK-LABEL: @test_vmvnq_p8(
-// CHECK: [[NEG_I:%.*]] = xor <16 x i8> %a, splat (i8 -1)
-// CHECK: ret <16 x i8> [[NEG_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vmvnq_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <16 x i8> [[A]], splat (i8 -1)
+// CHECK-NEXT: ret <16 x i8> [[NOT_I]]
+//
poly8x16_t test_vmvnq_p8(poly8x16_t a) {
return vmvnq_p8(a);
}
-// CHECK-LABEL: @test_vrbit_s8(
-// CHECK: [[VRBIT_I:%.*]] = call <8 x i8> @llvm.bitreverse.v8i8(<8 x i8> %a)
-// CHECK: ret <8 x i8> [[VRBIT_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vrbit_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRBIT_I:%.*]] = call <8 x i8> @llvm.bitreverse.v8i8(<8 x i8> [[A]])
+// CHECK-NEXT: ret <8 x i8> [[VRBIT_I]]
+//
int8x8_t test_vrbit_s8(int8x8_t a) {
return vrbit_s8(a);
}
-// CHECK-LABEL: @test_vrbitq_s8(
-// CHECK: [[VRBIT_I:%.*]] = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a)
-// CHECK: ret <16 x i8> [[VRBIT_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vrbitq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRBIT_I:%.*]] = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> [[A]])
+// CHECK-NEXT: ret <16 x i8> [[VRBIT_I]]
+//
int8x16_t test_vrbitq_s8(int8x16_t a) {
return vrbitq_s8(a);
}
-// CHECK-LABEL: @test_vrbit_u8(
-// CHECK: [[VRBIT_I:%.*]] = call <8 x i8> @llvm.bitreverse.v8i8(<8 x i8> %a)
-// CHECK: ret <8 x i8> [[VRBIT_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vrbit_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRBIT_I:%.*]] = call <8 x i8> @llvm.bitreverse.v8i8(<8 x i8> [[A]])
+// CHECK-NEXT: ret <8 x i8> [[VRBIT_I]]
+//
uint8x8_t test_vrbit_u8(uint8x8_t a) {
return vrbit_u8(a);
}
-// CHECK-LABEL: @test_vrbitq_u8(
-// CHECK: [[VRBIT_I:%.*]] = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a)
-// CHECK: ret <16 x i8> [[VRBIT_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vrbitq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRBIT_I:%.*]] = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> [[A]])
+// CHECK-NEXT: ret <16 x i8> [[VRBIT_I]]
+//
uint8x16_t test_vrbitq_u8(uint8x16_t a) {
return vrbitq_u8(a);
}
-// CHECK-LABEL: @test_vrbit_p8(
-// CHECK: [[VRBIT_I:%.*]] = call <8 x i8> @llvm.bitreverse.v8i8(<8 x i8> %a)
-// CHECK: ret <8 x i8> [[VRBIT_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vrbit_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRBIT_I:%.*]] = call <8 x i8> @llvm.bitreverse.v8i8(<8 x i8> [[A]])
+// CHECK-NEXT: ret <8 x i8> [[VRBIT_I]]
+//
poly8x8_t test_vrbit_p8(poly8x8_t a) {
return vrbit_p8(a);
}
-// CHECK-LABEL: @test_vrbitq_p8(
-// CHECK: [[VRBIT_I:%.*]] = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a)
-// CHECK: ret <16 x i8> [[VRBIT_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vrbitq_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRBIT_I:%.*]] = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> [[A]])
+// CHECK-NEXT: ret <16 x i8> [[VRBIT_I]]
+//
poly8x16_t test_vrbitq_p8(poly8x16_t a) {
return vrbitq_p8(a);
}
-// CHECK-LABEL: @test_vmovn_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VMOVN_I:%.*]] = trunc <8 x i16> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[VMOVN_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vmovn_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVN_I:%.*]] = trunc <8 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[VMOVN_I]]
+//
int8x8_t test_vmovn_s16(int16x8_t a) {
return vmovn_s16(a);
}
-// CHECK-LABEL: @test_vmovn_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VMOVN_I:%.*]] = trunc <4 x i32> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[VMOVN_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vmovn_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVN_I:%.*]] = trunc <4 x i32> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[VMOVN_I]]
+//
int16x4_t test_vmovn_s32(int32x4_t a) {
return vmovn_s32(a);
}
-// CHECK-LABEL: @test_vmovn_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VMOVN_I:%.*]] = trunc <2 x i64> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[VMOVN_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vmovn_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVN_I:%.*]] = trunc <2 x i64> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[VMOVN_I]]
+//
int32x2_t test_vmovn_s64(int64x2_t a) {
return vmovn_s64(a);
}
-// CHECK-LABEL: @test_vmovn_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VMOVN_I:%.*]] = trunc <8 x i16> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[VMOVN_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vmovn_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVN_I:%.*]] = trunc <8 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[VMOVN_I]]
+//
uint8x8_t test_vmovn_u16(uint16x8_t a) {
return vmovn_u16(a);
}
-// CHECK-LABEL: @test_vmovn_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VMOVN_I:%.*]] = trunc <4 x i32> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[VMOVN_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vmovn_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVN_I:%.*]] = trunc <4 x i32> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[VMOVN_I]]
+//
uint16x4_t test_vmovn_u32(uint32x4_t a) {
return vmovn_u32(a);
}
-// CHECK-LABEL: @test_vmovn_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VMOVN_I:%.*]] = trunc <2 x i64> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[VMOVN_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vmovn_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVN_I:%.*]] = trunc <2 x i64> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[VMOVN_I]]
+//
uint32x2_t test_vmovn_u64(uint64x2_t a) {
return vmovn_u64(a);
}
-// CHECK-LABEL: @test_vmovn_high_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VMOVN_I_I:%.*]] = trunc <8 x i16> %b to <8 x i8>
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VMOVN_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK: ret <16 x i8> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vmovn_high_s16(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVN_I_I:%.*]] = trunc <8 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[VMOVN_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
int8x16_t test_vmovn_high_s16(int8x8_t a, int16x8_t b) {
return vmovn_high_s16(a, b);
}
-// CHECK-LABEL: @test_vmovn_high_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VMOVN_I_I:%.*]] = trunc <4 x i32> %b to <4 x i16>
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VMOVN_I_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-// CHECK: ret <8 x i16> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vmovn_high_s32(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVN_I_I:%.*]] = trunc <4 x i32> [[B]] to <4 x i16>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[VMOVN_I_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
int16x8_t test_vmovn_high_s32(int16x4_t a, int32x4_t b) {
return vmovn_high_s32(a, b);
}
-// CHECK-LABEL: @test_vmovn_high_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VMOVN_I_I:%.*]] = trunc <2 x i64> %b to <2 x i32>
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VMOVN_I_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK: ret <4 x i32> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vmovn_high_s64(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVN_I_I:%.*]] = trunc <2 x i64> [[B]] to <2 x i32>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[VMOVN_I_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]]
+//
int32x4_t test_vmovn_high_s64(int32x2_t a, int64x2_t b) {
return vmovn_high_s64(a, b);
}
-// CHECK-LABEL: @test_vmovn_high_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VMOVN_I_I:%.*]] = trunc <8 x i16> %b to <8 x i8>
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VMOVN_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK: ret <16 x i8> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vmovn_high_u16(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVN_I_I:%.*]] = trunc <8 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[VMOVN_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
int8x16_t test_vmovn_high_u16(int8x8_t a, int16x8_t b) {
return vmovn_high_u16(a, b);
}
-// CHECK-LABEL: @test_vmovn_high_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VMOVN_I_I:%.*]] = trunc <4 x i32> %b to <4 x i16>
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VMOVN_I_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-// CHECK: ret <8 x i16> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vmovn_high_u32(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVN_I_I:%.*]] = trunc <4 x i32> [[B]] to <4 x i16>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[VMOVN_I_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
int16x8_t test_vmovn_high_u32(int16x4_t a, int32x4_t b) {
return vmovn_high_u32(a, b);
}
-// CHECK-LABEL: @test_vmovn_high_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VMOVN_I_I:%.*]] = trunc <2 x i64> %b to <2 x i32>
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VMOVN_I_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK: ret <4 x i32> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vmovn_high_u64(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVN_I_I:%.*]] = trunc <2 x i64> [[B]] to <2 x i32>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[VMOVN_I_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]]
+//
int32x4_t test_vmovn_high_u64(int32x2_t a, int64x2_t b) {
return vmovn_high_u64(a, b);
}
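// NOTE (illustrative sketch, not generated by the script): the vmovn_high_<t>
// family truncates each wide lane of `b` and concatenates the narrowed halves
// onto `a`. For example, with a = {0, 1, 2, 3} and
// b = {0x10004, 0x10005, 0x10006, 0x10007}, vmovn_high_s32 returns
// {0, 1, 2, 3, 4, 5, 6, 7}: a plain `trunc` feeding a full-width
// `shufflevector`, with no target intrinsic left in the IR.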
-// CHECK-LABEL: @test_vqmovun_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VQMOVUN_V1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqxtun.v8i8(<8 x i16> %a)
-// CHECK: ret <8 x i8> [[VQMOVUN_V1_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vqmovun_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQMOVUN_V1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqxtun.v8i8(<8 x i16> [[A]])
+// CHECK-NEXT: ret <8 x i8> [[VQMOVUN_V1_I]]
+//
int8x8_t test_vqmovun_s16(int16x8_t a) {
return vqmovun_s16(a);
}
-// CHECK-LABEL: @test_vqmovun_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VQMOVUN_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqxtun.v4i16(<4 x i32> %a)
-// CHECK: [[VQMOVUN_V2_I:%.*]] = bitcast <4 x i16> [[VQMOVUN_V1_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VQMOVUN_V1_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vqmovun_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQMOVUN_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqxtun.v4i16(<4 x i32> [[A]])
+// CHECK-NEXT: ret <4 x i16> [[VQMOVUN_V1_I]]
+//
int16x4_t test_vqmovun_s32(int32x4_t a) {
return vqmovun_s32(a);
}
-// CHECK-LABEL: @test_vqmovun_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VQMOVUN_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqxtun.v2i32(<2 x i64> %a)
-// CHECK: [[VQMOVUN_V2_I:%.*]] = bitcast <2 x i32> [[VQMOVUN_V1_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VQMOVUN_V1_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vqmovun_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQMOVUN_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqxtun.v2i32(<2 x i64> [[A]])
+// CHECK-NEXT: ret <2 x i32> [[VQMOVUN_V1_I]]
+//
int32x2_t test_vqmovun_s64(int64x2_t a) {
return vqmovun_s64(a);
}
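// NOTE (illustrative): unlike the plain truncations above, sqxtun is a
// saturating signed-to-unsigned narrow with no generic IR equivalent, so it
// survives as a target intrinsic. For example, vqmovun_s16 maps a lane of -1
// to 0 and a lane of 300 to 255, clamping into the unsigned narrow range.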
-// CHECK-LABEL: @test_vqmovun_high_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VQMOVUN_V1_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqxtun.v8i8(<8 x i16> %b)
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VQMOVUN_V1_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK: ret <16 x i8> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vqmovun_high_s16(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQMOVUN_V1_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqxtun.v8i8(<8 x i16> [[B]])
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[VQMOVUN_V1_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
uint8x16_t test_vqmovun_high_s16(uint8x8_t a, int16x8_t b) {
return vqmovun_high_s16(a, b);
}
-// CHECK-LABEL: @test_vqmovun_high_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VQMOVUN_V1_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqxtun.v4i16(<4 x i32> %b)
-// CHECK: [[VQMOVUN_V2_I_I:%.*]] = bitcast <4 x i16> [[VQMOVUN_V1_I_I]] to <8 x i8>
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VQMOVUN_V1_I_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-// CHECK: ret <8 x i16> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vqmovun_high_s32(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQMOVUN_V1_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqxtun.v4i16(<4 x i32> [[B]])
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[VQMOVUN_V1_I_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
uint16x8_t test_vqmovun_high_s32(uint16x4_t a, int32x4_t b) {
return vqmovun_high_s32(a, b);
}
-// CHECK-LABEL: @test_vqmovun_high_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VQMOVUN_V1_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqxtun.v2i32(<2 x i64> %b)
-// CHECK: [[VQMOVUN_V2_I_I:%.*]] = bitcast <2 x i32> [[VQMOVUN_V1_I_I]] to <8 x i8>
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VQMOVUN_V1_I_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK: ret <4 x i32> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vqmovun_high_s64(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQMOVUN_V1_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqxtun.v2i32(<2 x i64> [[B]])
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[VQMOVUN_V1_I_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]]
+//
uint32x4_t test_vqmovun_high_s64(uint32x2_t a, int64x2_t b) {
return vqmovun_high_s64(a, b);
}
-// CHECK-LABEL: @test_vqmovn_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VQMOVN_V1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqxtn.v8i8(<8 x i16> %a)
-// CHECK: ret <8 x i8> [[VQMOVN_V1_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vqmovn_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQMOVN_V1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqxtn.v8i8(<8 x i16> [[A]])
+// CHECK-NEXT: ret <8 x i8> [[VQMOVN_V1_I]]
+//
int8x8_t test_vqmovn_s16(int16x8_t a) {
return vqmovn_s16(a);
}
-// CHECK-LABEL: @test_vqmovn_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VQMOVN_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqxtn.v4i16(<4 x i32> %a)
-// CHECK: [[VQMOVN_V2_I:%.*]] = bitcast <4 x i16> [[VQMOVN_V1_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VQMOVN_V1_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vqmovn_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQMOVN_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqxtn.v4i16(<4 x i32> [[A]])
+// CHECK-NEXT: ret <4 x i16> [[VQMOVN_V1_I]]
+//
int16x4_t test_vqmovn_s32(int32x4_t a) {
return vqmovn_s32(a);
}
-// CHECK-LABEL: @test_vqmovn_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VQMOVN_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqxtn.v2i32(<2 x i64> %a)
-// CHECK: [[VQMOVN_V2_I:%.*]] = bitcast <2 x i32> [[VQMOVN_V1_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VQMOVN_V1_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vqmovn_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQMOVN_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqxtn.v2i32(<2 x i64> [[A]])
+// CHECK-NEXT: ret <2 x i32> [[VQMOVN_V1_I]]
+//
int32x2_t test_vqmovn_s64(int64x2_t a) {
return vqmovn_s64(a);
}
-// CHECK-LABEL: @test_vqmovn_high_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VQMOVN_V1_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqxtn.v8i8(<8 x i16> %b)
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VQMOVN_V1_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK: ret <16 x i8> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vqmovn_high_s16(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQMOVN_V1_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqxtn.v8i8(<8 x i16> [[B]])
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[VQMOVN_V1_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
int8x16_t test_vqmovn_high_s16(int8x8_t a, int16x8_t b) {
return vqmovn_high_s16(a, b);
}
-// CHECK-LABEL: @test_vqmovn_high_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VQMOVN_V1_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqxtn.v4i16(<4 x i32> %b)
-// CHECK: [[VQMOVN_V2_I_I:%.*]] = bitcast <4 x i16> [[VQMOVN_V1_I_I]] to <8 x i8>
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VQMOVN_V1_I_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-// CHECK: ret <8 x i16> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vqmovn_high_s32(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQMOVN_V1_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqxtn.v4i16(<4 x i32> [[B]])
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[VQMOVN_V1_I_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
int16x8_t test_vqmovn_high_s32(int16x4_t a, int32x4_t b) {
return vqmovn_high_s32(a, b);
}
-// CHECK-LABEL: @test_vqmovn_high_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VQMOVN_V1_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqxtn.v2i32(<2 x i64> %b)
-// CHECK: [[VQMOVN_V2_I_I:%.*]] = bitcast <2 x i32> [[VQMOVN_V1_I_I]] to <8 x i8>
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VQMOVN_V1_I_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK: ret <4 x i32> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vqmovn_high_s64(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQMOVN_V1_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqxtn.v2i32(<2 x i64> [[B]])
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[VQMOVN_V1_I_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]]
+//
int32x4_t test_vqmovn_high_s64(int32x2_t a, int64x2_t b) {
return vqmovn_high_s64(a, b);
}
-// CHECK-LABEL: @test_vqmovn_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VQMOVN_V1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqxtn.v8i8(<8 x i16> %a)
-// CHECK: ret <8 x i8> [[VQMOVN_V1_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vqmovn_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQMOVN_V1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqxtn.v8i8(<8 x i16> [[A]])
+// CHECK-NEXT: ret <8 x i8> [[VQMOVN_V1_I]]
+//
uint8x8_t test_vqmovn_u16(uint16x8_t a) {
return vqmovn_u16(a);
}
-// CHECK-LABEL: @test_vqmovn_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VQMOVN_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqxtn.v4i16(<4 x i32> %a)
-// CHECK: [[VQMOVN_V2_I:%.*]] = bitcast <4 x i16> [[VQMOVN_V1_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VQMOVN_V1_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vqmovn_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQMOVN_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqxtn.v4i16(<4 x i32> [[A]])
+// CHECK-NEXT: ret <4 x i16> [[VQMOVN_V1_I]]
+//
uint16x4_t test_vqmovn_u32(uint32x4_t a) {
return vqmovn_u32(a);
}
-// CHECK-LABEL: @test_vqmovn_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VQMOVN_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqxtn.v2i32(<2 x i64> %a)
-// CHECK: [[VQMOVN_V2_I:%.*]] = bitcast <2 x i32> [[VQMOVN_V1_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VQMOVN_V1_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vqmovn_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQMOVN_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqxtn.v2i32(<2 x i64> [[A]])
+// CHECK-NEXT: ret <2 x i32> [[VQMOVN_V1_I]]
+//
uint32x2_t test_vqmovn_u64(uint64x2_t a) {
return vqmovn_u64(a);
}
-// CHECK-LABEL: @test_vqmovn_high_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VQMOVN_V1_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqxtn.v8i8(<8 x i16> %b)
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VQMOVN_V1_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK: ret <16 x i8> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vqmovn_high_u16(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQMOVN_V1_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqxtn.v8i8(<8 x i16> [[B]])
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[VQMOVN_V1_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
uint8x16_t test_vqmovn_high_u16(uint8x8_t a, uint16x8_t b) {
return vqmovn_high_u16(a, b);
}
-// CHECK-LABEL: @test_vqmovn_high_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VQMOVN_V1_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqxtn.v4i16(<4 x i32> %b)
-// CHECK: [[VQMOVN_V2_I_I:%.*]] = bitcast <4 x i16> [[VQMOVN_V1_I_I]] to <8 x i8>
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VQMOVN_V1_I_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-// CHECK: ret <8 x i16> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vqmovn_high_u32(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQMOVN_V1_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqxtn.v4i16(<4 x i32> [[B]])
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[VQMOVN_V1_I_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
uint16x8_t test_vqmovn_high_u32(uint16x4_t a, uint32x4_t b) {
return vqmovn_high_u32(a, b);
}
-// CHECK-LABEL: @test_vqmovn_high_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VQMOVN_V1_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqxtn.v2i32(<2 x i64> %b)
-// CHECK: [[VQMOVN_V2_I_I:%.*]] = bitcast <2 x i32> [[VQMOVN_V1_I_I]] to <8 x i8>
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VQMOVN_V1_I_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK: ret <4 x i32> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vqmovn_high_u64(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQMOVN_V1_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqxtn.v2i32(<2 x i64> [[B]])
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[VQMOVN_V1_I_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]]
+//
uint32x4_t test_vqmovn_high_u64(uint32x2_t a, uint64x2_t b) {
return vqmovn_high_u64(a, b);
}
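// NOTE (illustrative): the vqmovn checks pair up the same way: sqxtn clamps
// into the signed narrow range (vqmovn_s16: 200 -> 127, -200 -> -128) and
// uqxtn into the unsigned one (vqmovn_u16: 300 -> 255), with the _high
// variants again appending the narrowed lanes onto `a` via a shufflevector.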
-// CHECK-LABEL: @test_vshll_n_s8(
-// CHECK: [[TMP0:%.*]] = sext <8 x i8> %a to <8 x i16>
-// CHECK: [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], splat (i16 8)
-// CHECK: ret <8 x i16> [[VSHLL_N]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vshll_n_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = zext <8 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: [[VSHLL_N:%.*]] = shl nuw <8 x i16> [[TMP0]], splat (i16 8)
+// CHECK-NEXT: ret <8 x i16> [[VSHLL_N]]
+//
int16x8_t test_vshll_n_s8(int8x8_t a) {
return vshll_n_s8(a, 8);
}
-// CHECK-LABEL: @test_vshll_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
-// CHECK: [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 16)
-// CHECK: ret <4 x i32> [[VSHLL_N]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vshll_n_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = zext <4 x i16> [[A]] to <4 x i32>
+// CHECK-NEXT: [[VSHLL_N:%.*]] = shl nuw <4 x i32> [[TMP0]], splat (i32 16)
+// CHECK-NEXT: ret <4 x i32> [[VSHLL_N]]
+//
int32x4_t test_vshll_n_s16(int16x4_t a) {
return vshll_n_s16(a, 16);
}
-// CHECK-LABEL: @test_vshll_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
-// CHECK: [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], splat (i64 32)
-// CHECK: ret <2 x i64> [[VSHLL_N]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vshll_n_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = zext <2 x i32> [[A]] to <2 x i64>
+// CHECK-NEXT: [[VSHLL_N:%.*]] = shl nuw <2 x i64> [[TMP0]], splat (i64 32)
+// CHECK-NEXT: ret <2 x i64> [[VSHLL_N]]
+//
int64x2_t test_vshll_n_s32(int32x2_t a) {
return vshll_n_s32(a, 32);
}
-// CHECK-LABEL: @test_vshll_n_u8(
-// CHECK: [[TMP0:%.*]] = zext <8 x i8> %a to <8 x i16>
-// CHECK: [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], splat (i16 8)
-// CHECK: ret <8 x i16> [[VSHLL_N]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vshll_n_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = zext <8 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: [[VSHLL_N:%.*]] = shl nuw <8 x i16> [[TMP0]], splat (i16 8)
+// CHECK-NEXT: ret <8 x i16> [[VSHLL_N]]
+//
uint16x8_t test_vshll_n_u8(uint8x8_t a) {
return vshll_n_u8(a, 8);
}
-// CHECK-LABEL: @test_vshll_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
-// CHECK: [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 16)
-// CHECK: ret <4 x i32> [[VSHLL_N]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vshll_n_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = zext <4 x i16> [[A]] to <4 x i32>
+// CHECK-NEXT: [[VSHLL_N:%.*]] = shl nuw <4 x i32> [[TMP0]], splat (i32 16)
+// CHECK-NEXT: ret <4 x i32> [[VSHLL_N]]
+//
uint32x4_t test_vshll_n_u16(uint16x4_t a) {
return vshll_n_u16(a, 16);
}
-// CHECK-LABEL: @test_vshll_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
-// CHECK: [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], splat (i64 32)
-// CHECK: ret <2 x i64> [[VSHLL_N]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vshll_n_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = zext <2 x i32> [[A]] to <2 x i64>
+// CHECK-NEXT: [[VSHLL_N:%.*]] = shl nuw <2 x i64> [[TMP0]], splat (i64 32)
+// CHECK-NEXT: ret <2 x i64> [[VSHLL_N]]
+//
uint64x2_t test_vshll_n_u32(uint32x2_t a) {
return vshll_n_u32(a, 32);
}
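// NOTE (editor's sketch explaining the signed checks above): each vshll_n
// test shifts by the full source lane width, and a left shift by the lane
// width discards every extension bit, so sign- and zero-extension give
// bit-identical results:
//   (sext i8 x to i16) << 8  ==  (zext i8 x to i16) << 8  ==  x * 256
// (x = 0xFF: 0xFFFF << 8 and 0x00FF << 8 are both 0xFF00). The IR
// canonicalizer (presumably instcombine in the RUN pipeline) therefore
// rewrites the signed forms to `zext` plus `shl nuw`, which is why no `sext`
// remains in the regenerated checks.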
-// CHECK-LABEL: @test_vshll_high_n_s8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK: [[TMP0:%.*]] = sext <8 x i8> [[SHUFFLE_I]] to <8 x i16>
-// CHECK: [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], splat (i16 8)
-// CHECK: ret <8 x i16> [[VSHLL_N]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vshll_high_n_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT: [[TMP0:%.*]] = zext <8 x i8> [[SHUFFLE_I]] to <8 x i16>
+// CHECK-NEXT: [[VSHLL_N:%.*]] = shl nuw <8 x i16> [[TMP0]], splat (i16 8)
+// CHECK-NEXT: ret <8 x i16> [[VSHLL_N]]
+//
int16x8_t test_vshll_high_n_s8(int8x16_t a) {
return vshll_high_n_s8(a, 8);
}
-// CHECK-LABEL: @test_vshll_high_n_s16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
-// CHECK: [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 16)
-// CHECK: ret <4 x i32> [[VSHLL_N]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vshll_high_n_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[TMP0:%.*]] = zext <4 x i16> [[SHUFFLE_I]] to <4 x i32>
+// CHECK-NEXT: [[VSHLL_N:%.*]] = shl nuw <4 x i32> [[TMP0]], splat (i32 16)
+// CHECK-NEXT: ret <4 x i32> [[VSHLL_N]]
+//
int32x4_t test_vshll_high_n_s16(int16x8_t a) {
return vshll_high_n_s16(a, 16);
}
-// CHECK-LABEL: @test_vshll_high_n_s32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
-// CHECK: [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], splat (i64 32)
-// CHECK: ret <2 x i64> [[VSHLL_N]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vshll_high_n_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[TMP0:%.*]] = zext <2 x i32> [[SHUFFLE_I]] to <2 x i64>
+// CHECK-NEXT: [[VSHLL_N:%.*]] = shl nuw <2 x i64> [[TMP0]], splat (i64 32)
+// CHECK-NEXT: ret <2 x i64> [[VSHLL_N]]
+//
int64x2_t test_vshll_high_n_s32(int32x4_t a) {
return vshll_high_n_s32(a, 32);
}
-// CHECK-LABEL: @test_vshll_high_n_u8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK: [[TMP0:%.*]] = zext <8 x i8> [[SHUFFLE_I]] to <8 x i16>
-// CHECK: [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], splat (i16 8)
-// CHECK: ret <8 x i16> [[VSHLL_N]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vshll_high_n_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT: [[TMP0:%.*]] = zext <8 x i8> [[SHUFFLE_I]] to <8 x i16>
+// CHECK-NEXT: [[VSHLL_N:%.*]] = shl nuw <8 x i16> [[TMP0]], splat (i16 8)
+// CHECK-NEXT: ret <8 x i16> [[VSHLL_N]]
+//
uint16x8_t test_vshll_high_n_u8(uint8x16_t a) {
return vshll_high_n_u8(a, 8);
}
-// CHECK-LABEL: @test_vshll_high_n_u16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
-// CHECK: [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 16)
-// CHECK: ret <4 x i32> [[VSHLL_N]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vshll_high_n_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[TMP0:%.*]] = zext <4 x i16> [[SHUFFLE_I]] to <4 x i32>
+// CHECK-NEXT: [[VSHLL_N:%.*]] = shl nuw <4 x i32> [[TMP0]], splat (i32 16)
+// CHECK-NEXT: ret <4 x i32> [[VSHLL_N]]
+//
uint32x4_t test_vshll_high_n_u16(uint16x8_t a) {
return vshll_high_n_u16(a, 16);
}
-// CHECK-LABEL: @test_vshll_high_n_u32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
-// CHECK: [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], splat (i64 32)
-// CHECK: ret <2 x i64> [[VSHLL_N]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vshll_high_n_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[TMP0:%.*]] = zext <2 x i32> [[SHUFFLE_I]] to <2 x i64>
+// CHECK-NEXT: [[VSHLL_N:%.*]] = shl nuw <2 x i64> [[TMP0]], splat (i64 32)
+// CHECK-NEXT: ret <2 x i64> [[VSHLL_N]]
+//
uint64x2_t test_vshll_high_n_u32(uint32x4_t a) {
return vshll_high_n_u32(a, 32);
}
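// NOTE: in the _high variants the shuffle mask <8 ... 15> (resp. <4 ... 7>,
// <2, 3>) selects only lanes of the first operand, so the second shuffle
// operand is unused; the regenerated checks use the canonical `poison` there
// instead of repeating `[[A]]`.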
-// CHECK-LABEL: @test_vcvt_f16_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[VCVT_F16_F321_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.vcvtfp2hf(<4 x float> %a)
-// CHECK: [[VCVT_F16_F322_I:%.*]] = bitcast <4 x i16> [[VCVT_F16_F321_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VCVT_F16_F322_I]] to <4 x half>
-// CHECK: ret <4 x half> [[TMP1]]
+// CHECK-LABEL: define dso_local <4 x half> @test_vcvt_f16_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVT_F16_F321_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.vcvtfp2hf(<4 x float> [[A]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[VCVT_F16_F321_I]] to <4 x half>
+// CHECK-NEXT: ret <4 x half> [[TMP0]]
+//
float16x4_t test_vcvt_f16_f32(float32x4_t a) {
return vcvt_f16_f32(a);
}
-// CHECK-LABEL: @test_vcvt_high_f16_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %b to <16 x i8>
-// CHECK: [[VCVT_F16_F321_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.vcvtfp2hf(<4 x float> %b)
-// CHECK: [[VCVT_F16_F322_I_I:%.*]] = bitcast <4 x i16> [[VCVT_F16_F321_I_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VCVT_F16_F322_I_I]] to <4 x half>
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x half> %a, <4 x half> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-// CHECK: ret <8 x half> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vcvt_high_f16_f32(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVT_F16_F321_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.vcvtfp2hf(<4 x float> [[B]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[VCVT_F16_F321_I_I]] to <4 x half>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[TMP0]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: ret <8 x half> [[SHUFFLE_I]]
+//
float16x8_t test_vcvt_high_f16_f32(float16x4_t a, float32x4_t b) {
return vcvt_high_f16_f32(a, b);
}
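// NOTE (illustrative): llvm.aarch64.neon.vcvtfp2hf presumably predates
// first-class <4 x half> support, so it returns <4 x i16>; the regenerated
// checks show a direct <4 x i16> -> <4 x half> bitcast instead of the old
// round trip through <8 x i8>.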
-// CHECK-LABEL: @test_vcvt_f32_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[VCVT_I:%.*]] = fptrunc <2 x double> %a to <2 x float>
-// CHECK: ret <2 x float> [[VCVT_I]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vcvt_f32_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVT_I:%.*]] = fptrunc <2 x double> [[A]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[VCVT_I]]
+//
float32x2_t test_vcvt_f32_f64(float64x2_t a) {
return vcvt_f32_f64(a);
}
-// CHECK-LABEL: @test_vcvt_high_f32_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %b to <16 x i8>
-// CHECK: [[VCVT_I_I:%.*]] = fptrunc <2 x double> %b to <2 x float>
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x float> %a, <2 x float> [[VCVT_I_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK: ret <4 x float> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vcvt_high_f32_f64(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVT_I_I:%.*]] = fptrunc <2 x double> [[B]] to <2 x float>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[VCVT_I_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-NEXT: ret <4 x float> [[SHUFFLE_I]]
+//
float32x4_t test_vcvt_high_f32_f64(float32x2_t a, float64x2_t b) {
return vcvt_high_f32_f64(a, b);
}
-// CHECK-LABEL: @test_vcvtx_f32_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[VCVTX_F32_V1_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fcvtxn.v2f32.v2f64(<2 x double> %a)
-// CHECK: ret <2 x float> [[VCVTX_F32_V1_I]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vcvtx_f32_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTX_F32_V1_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fcvtxn.v2f32.v2f64(<2 x double> [[A]])
+// CHECK-NEXT: ret <2 x float> [[VCVTX_F32_V1_I]]
+//
float32x2_t test_vcvtx_f32_f64(float64x2_t a) {
return vcvtx_f32_f64(a);
}
-// CHECK-LABEL: @test_vcvtx_high_f32_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %b to <16 x i8>
-// CHECK: [[VCVTX_F32_V1_I_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fcvtxn.v2f32.v2f64(<2 x double> %b)
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x float> %a, <2 x float> [[VCVTX_F32_V1_I_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK: ret <4 x float> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vcvtx_high_f32_f64(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTX_F32_V1_I_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fcvtxn.v2f32.v2f64(<2 x double> [[B]])
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[VCVTX_F32_V1_I_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-NEXT: ret <4 x float> [[SHUFFLE_I]]
+//
float32x4_t test_vcvtx_high_f32_f64(float32x2_t a, float64x2_t b) {
return vcvtx_high_f32_f64(a, b);
}
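// NOTE (illustrative): fcvtxn is the one narrowing conversion here that is
// not a plain `fptrunc`: it rounds f64 -> f32 with round-to-odd, which exists
// so that a subsequent rounding (e.g. on to f16) does not double-round.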
-// CHECK-LABEL: @test_vcvt_f32_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
-// CHECK: [[VCVT_F32_F16_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VCVT_F32_F161_I:%.*]] = call <4 x float> @llvm.aarch64.neon.vcvthf2fp(<4 x i16> [[VCVT_F32_F16_I]])
-// CHECK: [[VCVT_F32_F162_I:%.*]] = bitcast <4 x float> [[VCVT_F32_F161_I]] to <16 x i8>
-// CHECK: ret <4 x float> [[VCVT_F32_F161_I]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vcvt_f32_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVT_F32_F16_I:%.*]] = bitcast <4 x half> [[A]] to <4 x i16>
+// CHECK-NEXT: [[VCVT_F32_F161_I:%.*]] = call <4 x float> @llvm.aarch64.neon.vcvthf2fp(<4 x i16> [[VCVT_F32_F16_I]])
+// CHECK-NEXT: ret <4 x float> [[VCVT_F32_F161_I]]
+//
float32x4_t test_vcvt_f32_f16(float16x4_t a) {
return vcvt_f32_f16(a);
}
-// CHECK-LABEL: @test_vcvt_high_f32_f16(
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x half> %a, <8 x half> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK: [[TMP0:%.*]] = bitcast <4 x half> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK: [[VCVT_F32_F16_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VCVT_F32_F161_I_I:%.*]] = call <4 x float> @llvm.aarch64.neon.vcvthf2fp(<4 x i16> [[VCVT_F32_F16_I_I]])
-// CHECK: [[VCVT_F32_F162_I_I:%.*]] = bitcast <4 x float> [[VCVT_F32_F161_I_I]] to <16 x i8>
-// CHECK: ret <4 x float> [[VCVT_F32_F161_I_I]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vcvt_high_f32_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[VCVT_F32_F16_I_I:%.*]] = bitcast <4 x half> [[SHUFFLE_I]] to <4 x i16>
+// CHECK-NEXT: [[VCVT_F32_F161_I_I:%.*]] = call <4 x float> @llvm.aarch64.neon.vcvthf2fp(<4 x i16> [[VCVT_F32_F16_I_I]])
+// CHECK-NEXT: ret <4 x float> [[VCVT_F32_F161_I_I]]
+//
float32x4_t test_vcvt_high_f32_f16(float16x8_t a) {
return vcvt_high_f32_f16(a);
}
-// CHECK-LABEL: @test_vcvt_f64_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[VCVT_I:%.*]] = fpext <2 x float> %a to <2 x double>
-// CHECK: ret <2 x double> [[VCVT_I]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vcvt_f64_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVT_I:%.*]] = fpext <2 x float> [[A]] to <2 x double>
+// CHECK-NEXT: ret <2 x double> [[VCVT_I]]
+//
float64x2_t test_vcvt_f64_f32(float32x2_t a) {
return vcvt_f64_f32(a);
}
-// CHECK-LABEL: @test_vcvt_high_f64_f32(
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %a, <2 x i32> <i32 2, i32 3>
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK: [[VCVT_I_I:%.*]] = fpext <2 x float> [[SHUFFLE_I_I]] to <2 x double>
-// CHECK: ret <2 x double> [[VCVT_I_I]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vcvt_high_f64_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[VCVT_I_I:%.*]] = fpext <2 x float> [[SHUFFLE_I]] to <2 x double>
+// CHECK-NEXT: ret <2 x double> [[VCVT_I_I]]
+//
float64x2_t test_vcvt_high_f64_f32(float32x4_t a) {
return vcvt_high_f64_f32(a);
}
-// CHECK-LABEL: @test_vrndnq_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[VRNDN1_I:%.*]] = call <2 x double> @llvm.roundeven.v2f64(<2 x double> %a)
-// CHECK: ret <2 x double> [[VRNDN1_I]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vrndnq_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRNDN1_I:%.*]] = call <2 x double> @llvm.roundeven.v2f64(<2 x double> [[A]])
+// CHECK-NEXT: ret <2 x double> [[VRNDN1_I]]
+//
float64x2_t test_vrndnq_f64(float64x2_t a) {
return vrndnq_f64(a);
}
-// CHECK-LABEL: @test_vrndaq_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[VRNDA1_I:%.*]] = call <2 x double> @llvm.round.v2f64(<2 x double> %a)
-// CHECK: ret <2 x double> [[VRNDA1_I]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vrndaq_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRNDA1_I:%.*]] = call <2 x double> @llvm.round.v2f64(<2 x double> [[A]])
+// CHECK-NEXT: ret <2 x double> [[VRNDA1_I]]
+//
float64x2_t test_vrndaq_f64(float64x2_t a) {
return vrndaq_f64(a);
}
-// CHECK-LABEL: @test_vrndpq_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[VRNDP1_I:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> %a)
-// CHECK: ret <2 x double> [[VRNDP1_I]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vrndpq_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRNDP1_I:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[A]])
+// CHECK-NEXT: ret <2 x double> [[VRNDP1_I]]
+//
float64x2_t test_vrndpq_f64(float64x2_t a) {
return vrndpq_f64(a);
}
-// CHECK-LABEL: @test_vrndmq_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[VRNDM1_I:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> %a)
-// CHECK: ret <2 x double> [[VRNDM1_I]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vrndmq_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRNDM1_I:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[A]])
+// CHECK-NEXT: ret <2 x double> [[VRNDM1_I]]
+//
float64x2_t test_vrndmq_f64(float64x2_t a) {
return vrndmq_f64(a);
}
-// CHECK-LABEL: @test_vrndxq_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[VRNDX1_I:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> %a)
-// CHECK: ret <2 x double> [[VRNDX1_I]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vrndxq_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRNDX1_I:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[A]])
+// CHECK-NEXT: ret <2 x double> [[VRNDX1_I]]
+//
float64x2_t test_vrndxq_f64(float64x2_t a) {
return vrndxq_f64(a);
}
-// CHECK-LABEL: @test_vrndq_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[VRNDZ1_I:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> %a)
-// CHECK: ret <2 x double> [[VRNDZ1_I]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vrndq_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRNDZ1_I:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[A]])
+// CHECK-NEXT: ret <2 x double> [[VRNDZ1_I]]
+//
float64x2_t test_vrndq_f64(float64x2_t a) {
return vrndq_f64(a);
}
-// CHECK-LABEL: @test_vrndiq_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[VRNDI1_I:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %a)
-// CHECK: ret <2 x double> [[VRNDI1_I]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vrndiq_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRNDIQ_V1_I:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[A]])
+// CHECK-NEXT: ret <2 x double> [[VRNDIQ_V1_I]]
+//
float64x2_t test_vrndiq_f64(float64x2_t a) {
return vrndiq_f64(a);
}
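// NOTE: the seven vrnd*q_f64 tests above pin down the mapping from NEON
// rounding intrinsics to generic LLVM intrinsics: vrndn -> roundeven (ties to
// even), vrnda -> round (ties away from zero), vrndp -> ceil, vrndm -> floor,
// vrndx -> rint (current mode, may raise inexact), vrnd -> trunc, and
// vrndi -> nearbyint (current mode, no inexact). For a lane of 2.5 the first
// four give 2, 3, 3, 2; trunc gives 2, and rint/nearbyint give 2 in the
// default round-to-nearest mode.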
-// CHECK-LABEL: @test_vcvt_s32_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtzs.v2i32.v2f32(<2 x float> %a)
-// CHECK: ret <2 x i32> [[TMP1]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vcvt_s32_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTZ1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtzs.v2i32.v2f32(<2 x float> [[A]])
+// CHECK-NEXT: ret <2 x i32> [[VCVTZ1_I]]
+//
int32x2_t test_vcvt_s32_f32(float32x2_t a) {
return vcvt_s32_f32(a);
}
-// CHECK-LABEL: @test_vcvtq_s32_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtzs.v4i32.v4f32(<4 x float> %a)
-// CHECK: ret <4 x i32> [[TMP1]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vcvtq_s32_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTZ1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtzs.v4i32.v4f32(<4 x float> [[A]])
+// CHECK-NEXT: ret <4 x i32> [[VCVTZ1_I]]
+//
int32x4_t test_vcvtq_s32_f32(float32x4_t a) {
return vcvtq_s32_f32(a);
}
-// CHECK-LABEL: @test_vcvtq_s64_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtzs.v2i64.v2f64(<2 x double> %a)
-// CHECK: ret <2 x i64> [[TMP1]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vcvtq_s64_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTZ1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtzs.v2i64.v2f64(<2 x double> [[A]])
+// CHECK-NEXT: ret <2 x i64> [[VCVTZ1_I]]
+//
int64x2_t test_vcvtq_s64_f64(float64x2_t a) {
return vcvtq_s64_f64(a);
}
-// CHECK-LABEL: @test_vcvt_u32_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtzu.v2i32.v2f32(<2 x float> %a)
-// CHECK: ret <2 x i32> [[TMP1]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vcvt_u32_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTZ1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtzu.v2i32.v2f32(<2 x float> [[A]])
+// CHECK-NEXT: ret <2 x i32> [[VCVTZ1_I]]
+//
uint32x2_t test_vcvt_u32_f32(float32x2_t a) {
return vcvt_u32_f32(a);
}
-// CHECK-LABEL: @test_vcvtq_u32_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtzu.v4i32.v4f32(<4 x float> %a)
-// CHECK: ret <4 x i32> [[TMP1]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vcvtq_u32_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTZ1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtzu.v4i32.v4f32(<4 x float> [[A]])
+// CHECK-NEXT: ret <4 x i32> [[VCVTZ1_I]]
+//
uint32x4_t test_vcvtq_u32_f32(float32x4_t a) {
return vcvtq_u32_f32(a);
}
-// CHECK-LABEL: @test_vcvtq_u64_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtzu.v2i64.v2f64(<2 x double> %a)
-// CHECK: ret <2 x i64> [[TMP1]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vcvtq_u64_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTZ1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtzu.v2i64.v2f64(<2 x double> [[A]])
+// CHECK-NEXT: ret <2 x i64> [[VCVTZ1_I]]
+//
uint64x2_t test_vcvtq_u64_f64(float64x2_t a) {
return vcvtq_u64_f64(a);
}
-// CHECK-LABEL: @test_vcvtn_s32_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[VCVTN1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtns.v2i32.v2f32(<2 x float> %a)
-// CHECK: ret <2 x i32> [[VCVTN1_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vcvtn_s32_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTN1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtns.v2i32.v2f32(<2 x float> [[A]])
+// CHECK-NEXT: ret <2 x i32> [[VCVTN1_I]]
+//
int32x2_t test_vcvtn_s32_f32(float32x2_t a) {
return vcvtn_s32_f32(a);
}
-// CHECK-LABEL: @test_vcvtnq_s32_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[VCVTN1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtns.v4i32.v4f32(<4 x float> %a)
-// CHECK: ret <4 x i32> [[VCVTN1_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vcvtnq_s32_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTN1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtns.v4i32.v4f32(<4 x float> [[A]])
+// CHECK-NEXT: ret <4 x i32> [[VCVTN1_I]]
+//
int32x4_t test_vcvtnq_s32_f32(float32x4_t a) {
return vcvtnq_s32_f32(a);
}
-// CHECK-LABEL: @test_vcvtnq_s64_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[VCVTN1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtns.v2i64.v2f64(<2 x double> %a)
-// CHECK: ret <2 x i64> [[VCVTN1_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vcvtnq_s64_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTN1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtns.v2i64.v2f64(<2 x double> [[A]])
+// CHECK-NEXT: ret <2 x i64> [[VCVTN1_I]]
+//
int64x2_t test_vcvtnq_s64_f64(float64x2_t a) {
return vcvtnq_s64_f64(a);
}
-// CHECK-LABEL: @test_vcvtn_u32_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[VCVTN1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtnu.v2i32.v2f32(<2 x float> %a)
-// CHECK: ret <2 x i32> [[VCVTN1_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vcvtn_u32_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTN1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtnu.v2i32.v2f32(<2 x float> [[A]])
+// CHECK-NEXT: ret <2 x i32> [[VCVTN1_I]]
+//
uint32x2_t test_vcvtn_u32_f32(float32x2_t a) {
return vcvtn_u32_f32(a);
}
-// CHECK-LABEL: @test_vcvtnq_u32_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[VCVTN1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtnu.v4i32.v4f32(<4 x float> %a)
-// CHECK: ret <4 x i32> [[VCVTN1_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vcvtnq_u32_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTN1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtnu.v4i32.v4f32(<4 x float> [[A]])
+// CHECK-NEXT: ret <4 x i32> [[VCVTN1_I]]
+//
uint32x4_t test_vcvtnq_u32_f32(float32x4_t a) {
return vcvtnq_u32_f32(a);
}
-// CHECK-LABEL: @test_vcvtnq_u64_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[VCVTN1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtnu.v2i64.v2f64(<2 x double> %a)
-// CHECK: ret <2 x i64> [[VCVTN1_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vcvtnq_u64_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTN1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtnu.v2i64.v2f64(<2 x double> [[A]])
+// CHECK-NEXT: ret <2 x i64> [[VCVTN1_I]]
+//
uint64x2_t test_vcvtnq_u64_f64(float64x2_t a) {
return vcvtnq_u64_f64(a);
}
-// CHECK-LABEL: @test_vcvtp_s32_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[VCVTP1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtps.v2i32.v2f32(<2 x float> %a)
-// CHECK: ret <2 x i32> [[VCVTP1_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vcvtp_s32_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTP1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtps.v2i32.v2f32(<2 x float> [[A]])
+// CHECK-NEXT: ret <2 x i32> [[VCVTP1_I]]
+//
int32x2_t test_vcvtp_s32_f32(float32x2_t a) {
return vcvtp_s32_f32(a);
}
-// CHECK-LABEL: @test_vcvtpq_s32_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[VCVTP1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtps.v4i32.v4f32(<4 x float> %a)
-// CHECK: ret <4 x i32> [[VCVTP1_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vcvtpq_s32_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTP1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtps.v4i32.v4f32(<4 x float> [[A]])
+// CHECK-NEXT: ret <4 x i32> [[VCVTP1_I]]
+//
int32x4_t test_vcvtpq_s32_f32(float32x4_t a) {
return vcvtpq_s32_f32(a);
}
-// CHECK-LABEL: @test_vcvtpq_s64_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[VCVTP1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtps.v2i64.v2f64(<2 x double> %a)
-// CHECK: ret <2 x i64> [[VCVTP1_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vcvtpq_s64_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTP1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtps.v2i64.v2f64(<2 x double> [[A]])
+// CHECK-NEXT: ret <2 x i64> [[VCVTP1_I]]
+//
int64x2_t test_vcvtpq_s64_f64(float64x2_t a) {
return vcvtpq_s64_f64(a);
}
-// CHECK-LABEL: @test_vcvtp_u32_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[VCVTP1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtpu.v2i32.v2f32(<2 x float> %a)
-// CHECK: ret <2 x i32> [[VCVTP1_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vcvtp_u32_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTP1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtpu.v2i32.v2f32(<2 x float> [[A]])
+// CHECK-NEXT: ret <2 x i32> [[VCVTP1_I]]
+//
uint32x2_t test_vcvtp_u32_f32(float32x2_t a) {
return vcvtp_u32_f32(a);
}
-// CHECK-LABEL: @test_vcvtpq_u32_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[VCVTP1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtpu.v4i32.v4f32(<4 x float> %a)
-// CHECK: ret <4 x i32> [[VCVTP1_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vcvtpq_u32_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTP1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtpu.v4i32.v4f32(<4 x float> [[A]])
+// CHECK-NEXT: ret <4 x i32> [[VCVTP1_I]]
+//
uint32x4_t test_vcvtpq_u32_f32(float32x4_t a) {
return vcvtpq_u32_f32(a);
}
-// CHECK-LABEL: @test_vcvtpq_u64_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[VCVTP1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtpu.v2i64.v2f64(<2 x double> %a)
-// CHECK: ret <2 x i64> [[VCVTP1_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vcvtpq_u64_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTP1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtpu.v2i64.v2f64(<2 x double> [[A]])
+// CHECK-NEXT: ret <2 x i64> [[VCVTP1_I]]
+//
uint64x2_t test_vcvtpq_u64_f64(float64x2_t a) {
return vcvtpq_u64_f64(a);
}
-// CHECK-LABEL: @test_vcvtm_s32_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[VCVTM1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtms.v2i32.v2f32(<2 x float> %a)
-// CHECK: ret <2 x i32> [[VCVTM1_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vcvtm_s32_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTM1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtms.v2i32.v2f32(<2 x float> [[A]])
+// CHECK-NEXT: ret <2 x i32> [[VCVTM1_I]]
+//
int32x2_t test_vcvtm_s32_f32(float32x2_t a) {
return vcvtm_s32_f32(a);
}
-// CHECK-LABEL: @test_vcvtmq_s32_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[VCVTM1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtms.v4i32.v4f32(<4 x float> %a)
-// CHECK: ret <4 x i32> [[VCVTM1_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vcvtmq_s32_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTM1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtms.v4i32.v4f32(<4 x float> [[A]])
+// CHECK-NEXT: ret <4 x i32> [[VCVTM1_I]]
+//
int32x4_t test_vcvtmq_s32_f32(float32x4_t a) {
return vcvtmq_s32_f32(a);
}
-// CHECK-LABEL: @test_vcvtmq_s64_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[VCVTM1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtms.v2i64.v2f64(<2 x double> %a)
-// CHECK: ret <2 x i64> [[VCVTM1_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vcvtmq_s64_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTM1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtms.v2i64.v2f64(<2 x double> [[A]])
+// CHECK-NEXT: ret <2 x i64> [[VCVTM1_I]]
+//
int64x2_t test_vcvtmq_s64_f64(float64x2_t a) {
return vcvtmq_s64_f64(a);
}
-// CHECK-LABEL: @test_vcvtm_u32_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[VCVTM1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtmu.v2i32.v2f32(<2 x float> %a)
-// CHECK: ret <2 x i32> [[VCVTM1_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vcvtm_u32_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTM1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtmu.v2i32.v2f32(<2 x float> [[A]])
+// CHECK-NEXT: ret <2 x i32> [[VCVTM1_I]]
+//
uint32x2_t test_vcvtm_u32_f32(float32x2_t a) {
return vcvtm_u32_f32(a);
}
-// CHECK-LABEL: @test_vcvtmq_u32_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[VCVTM1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtmu.v4i32.v4f32(<4 x float> %a)
-// CHECK: ret <4 x i32> [[VCVTM1_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vcvtmq_u32_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTM1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtmu.v4i32.v4f32(<4 x float> [[A]])
+// CHECK-NEXT: ret <4 x i32> [[VCVTM1_I]]
+//
uint32x4_t test_vcvtmq_u32_f32(float32x4_t a) {
return vcvtmq_u32_f32(a);
}
-// CHECK-LABEL: @test_vcvtmq_u64_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[VCVTM1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtmu.v2i64.v2f64(<2 x double> %a)
-// CHECK: ret <2 x i64> [[VCVTM1_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vcvtmq_u64_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTM1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtmu.v2i64.v2f64(<2 x double> [[A]])
+// CHECK-NEXT: ret <2 x i64> [[VCVTM1_I]]
+//
uint64x2_t test_vcvtmq_u64_f64(float64x2_t a) {
return vcvtmq_u64_f64(a);
}
-// CHECK-LABEL: @test_vcvta_s32_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[VCVTA1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtas.v2i32.v2f32(<2 x float> %a)
-// CHECK: ret <2 x i32> [[VCVTA1_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vcvta_s32_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTA1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtas.v2i32.v2f32(<2 x float> [[A]])
+// CHECK-NEXT: ret <2 x i32> [[VCVTA1_I]]
+//
int32x2_t test_vcvta_s32_f32(float32x2_t a) {
return vcvta_s32_f32(a);
}
-// CHECK-LABEL: @test_vcvtaq_s32_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[VCVTA1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtas.v4i32.v4f32(<4 x float> %a)
-// CHECK: ret <4 x i32> [[VCVTA1_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vcvtaq_s32_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTA1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtas.v4i32.v4f32(<4 x float> [[A]])
+// CHECK-NEXT: ret <4 x i32> [[VCVTA1_I]]
+//
int32x4_t test_vcvtaq_s32_f32(float32x4_t a) {
return vcvtaq_s32_f32(a);
}
-// CHECK-LABEL: @test_vcvtaq_s64_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[VCVTA1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtas.v2i64.v2f64(<2 x double> %a)
-// CHECK: ret <2 x i64> [[VCVTA1_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vcvtaq_s64_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTA1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtas.v2i64.v2f64(<2 x double> [[A]])
+// CHECK-NEXT: ret <2 x i64> [[VCVTA1_I]]
+//
int64x2_t test_vcvtaq_s64_f64(float64x2_t a) {
return vcvtaq_s64_f64(a);
}
-// CHECK-LABEL: @test_vcvta_u32_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[VCVTA1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtau.v2i32.v2f32(<2 x float> %a)
-// CHECK: ret <2 x i32> [[VCVTA1_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vcvta_u32_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTA1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtau.v2i32.v2f32(<2 x float> [[A]])
+// CHECK-NEXT: ret <2 x i32> [[VCVTA1_I]]
+//
uint32x2_t test_vcvta_u32_f32(float32x2_t a) {
return vcvta_u32_f32(a);
}
-// CHECK-LABEL: @test_vcvtaq_u32_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[VCVTA1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtau.v4i32.v4f32(<4 x float> %a)
-// CHECK: ret <4 x i32> [[VCVTA1_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vcvtaq_u32_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTA1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtau.v4i32.v4f32(<4 x float> [[A]])
+// CHECK-NEXT: ret <4 x i32> [[VCVTA1_I]]
+//
uint32x4_t test_vcvtaq_u32_f32(float32x4_t a) {
return vcvtaq_u32_f32(a);
}
-// CHECK-LABEL: @test_vcvtaq_u64_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[VCVTA1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtau.v2i64.v2f64(<2 x double> %a)
-// CHECK: ret <2 x i64> [[VCVTA1_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vcvtaq_u64_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTA1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtau.v2i64.v2f64(<2 x double> [[A]])
+// CHECK-NEXT: ret <2 x i64> [[VCVTA1_I]]
+//
uint64x2_t test_vcvtaq_u64_f64(float64x2_t a) {
return vcvtaq_u64_f64(a);
}
-// CHECK-LABEL: @test_vrsqrte_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[VRSQRTE_V1_I:%.*]] = call <2 x float> @llvm.aarch64.neon.frsqrte.v2f32(<2 x float> %a)
-// CHECK: ret <2 x float> [[VRSQRTE_V1_I]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vrsqrte_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSQRTE_V1_I:%.*]] = call <2 x float> @llvm.aarch64.neon.frsqrte.v2f32(<2 x float> [[A]])
+// CHECK-NEXT: ret <2 x float> [[VRSQRTE_V1_I]]
+//
float32x2_t test_vrsqrte_f32(float32x2_t a) {
return vrsqrte_f32(a);
}
-// CHECK-LABEL: @test_vrsqrteq_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[VRSQRTEQ_V1_I:%.*]] = call <4 x float> @llvm.aarch64.neon.frsqrte.v4f32(<4 x float> %a)
-// CHECK: ret <4 x float> [[VRSQRTEQ_V1_I]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vrsqrteq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSQRTEQ_V1_I:%.*]] = call <4 x float> @llvm.aarch64.neon.frsqrte.v4f32(<4 x float> [[A]])
+// CHECK-NEXT: ret <4 x float> [[VRSQRTEQ_V1_I]]
+//
float32x4_t test_vrsqrteq_f32(float32x4_t a) {
return vrsqrteq_f32(a);
}
-// CHECK-LABEL: @test_vrsqrteq_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[VRSQRTEQ_V1_I:%.*]] = call <2 x double> @llvm.aarch64.neon.frsqrte.v2f64(<2 x double> %a)
-// CHECK: ret <2 x double> [[VRSQRTEQ_V1_I]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vrsqrteq_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSQRTEQ_V1_I:%.*]] = call <2 x double> @llvm.aarch64.neon.frsqrte.v2f64(<2 x double> [[A]])
+// CHECK-NEXT: ret <2 x double> [[VRSQRTEQ_V1_I]]
+//
float64x2_t test_vrsqrteq_f64(float64x2_t a) {
return vrsqrteq_f64(a);
}
-// CHECK-LABEL: @test_vrecpe_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[VRECPE_V1_I:%.*]] = call <2 x float> @llvm.aarch64.neon.frecpe.v2f32(<2 x float> %a)
-// CHECK: ret <2 x float> [[VRECPE_V1_I]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vrecpe_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRECPE_V1_I:%.*]] = call <2 x float> @llvm.aarch64.neon.frecpe.v2f32(<2 x float> [[A]])
+// CHECK-NEXT: ret <2 x float> [[VRECPE_V1_I]]
+//
float32x2_t test_vrecpe_f32(float32x2_t a) {
return vrecpe_f32(a);
}
-// CHECK-LABEL: @test_vrecpeq_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[VRECPEQ_V1_I:%.*]] = call <4 x float> @llvm.aarch64.neon.frecpe.v4f32(<4 x float> %a)
-// CHECK: ret <4 x float> [[VRECPEQ_V1_I]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vrecpeq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRECPEQ_V1_I:%.*]] = call <4 x float> @llvm.aarch64.neon.frecpe.v4f32(<4 x float> [[A]])
+// CHECK-NEXT: ret <4 x float> [[VRECPEQ_V1_I]]
+//
float32x4_t test_vrecpeq_f32(float32x4_t a) {
return vrecpeq_f32(a);
}
-// CHECK-LABEL: @test_vrecpeq_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[VRECPEQ_V1_I:%.*]] = call <2 x double> @llvm.aarch64.neon.frecpe.v2f64(<2 x double> %a)
-// CHECK: ret <2 x double> [[VRECPEQ_V1_I]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vrecpeq_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRECPEQ_V1_I:%.*]] = call <2 x double> @llvm.aarch64.neon.frecpe.v2f64(<2 x double> [[A]])
+// CHECK-NEXT: ret <2 x double> [[VRECPEQ_V1_I]]
+//
float64x2_t test_vrecpeq_f64(float64x2_t a) {
return vrecpeq_f64(a);
}
-// CHECK-LABEL: @test_vrecpe_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VRECPE_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.urecpe.v2i32(<2 x i32> %a)
-// CHECK: ret <2 x i32> [[VRECPE_V1_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vrecpe_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRECPE_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.urecpe.v2i32(<2 x i32> [[A]])
+// CHECK-NEXT: ret <2 x i32> [[VRECPE_V1_I]]
+//
uint32x2_t test_vrecpe_u32(uint32x2_t a) {
return vrecpe_u32(a);
}
-// CHECK-LABEL: @test_vrecpeq_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VRECPEQ_V1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.urecpe.v4i32(<4 x i32> %a)
-// CHECK: ret <4 x i32> [[VRECPEQ_V1_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vrecpeq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRECPEQ_V1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.urecpe.v4i32(<4 x i32> [[A]])
+// CHECK-NEXT: ret <4 x i32> [[VRECPEQ_V1_I]]
+//
uint32x4_t test_vrecpeq_u32(uint32x4_t a) {
return vrecpeq_u32(a);
}
-// CHECK-LABEL: @test_vsqrt_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[VSQRT_I:%.*]] = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %a)
-// CHECK: ret <2 x float> [[VSQRT_I]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vsqrt_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSQRT_I:%.*]] = call <2 x float> @llvm.sqrt.v2f32(<2 x float> [[A]])
+// CHECK-NEXT: ret <2 x float> [[VSQRT_I]]
+//
float32x2_t test_vsqrt_f32(float32x2_t a) {
return vsqrt_f32(a);
}
-// CHECK-LABEL: @test_vsqrtq_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[VSQRT_I:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a)
-// CHECK: ret <4 x float> [[VSQRT_I]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vsqrtq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSQRT_I:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[A]])
+// CHECK-NEXT: ret <4 x float> [[VSQRT_I]]
+//
float32x4_t test_vsqrtq_f32(float32x4_t a) {
return vsqrtq_f32(a);
}
-// CHECK-LABEL: @test_vsqrtq_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[VSQRT_I:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %a)
-// CHECK: ret <2 x double> [[VSQRT_I]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vsqrtq_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSQRT_I:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[A]])
+// CHECK-NEXT: ret <2 x double> [[VSQRT_I]]
+//
float64x2_t test_vsqrtq_f64(float64x2_t a) {
return vsqrtq_f64(a);
}
-// CHECK-LABEL: @test_vcvt_f32_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VCVT_I:%.*]] = sitofp <2 x i32> %a to <2 x float>
-// CHECK: ret <2 x float> [[VCVT_I]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vcvt_f32_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVT_I:%.*]] = sitofp <2 x i32> [[A]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[VCVT_I]]
+//
float32x2_t test_vcvt_f32_s32(int32x2_t a) {
return vcvt_f32_s32(a);
}
-// CHECK-LABEL: @test_vcvt_f32_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VCVT_I:%.*]] = uitofp <2 x i32> %a to <2 x float>
-// CHECK: ret <2 x float> [[VCVT_I]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vcvt_f32_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVT_I:%.*]] = uitofp <2 x i32> [[A]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[VCVT_I]]
+//
float32x2_t test_vcvt_f32_u32(uint32x2_t a) {
return vcvt_f32_u32(a);
}
-// CHECK-LABEL: @test_vcvtq_f32_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VCVT_I:%.*]] = sitofp <4 x i32> %a to <4 x float>
-// CHECK: ret <4 x float> [[VCVT_I]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vcvtq_f32_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVT_I:%.*]] = sitofp <4 x i32> [[A]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[VCVT_I]]
+//
float32x4_t test_vcvtq_f32_s32(int32x4_t a) {
return vcvtq_f32_s32(a);
}
-// CHECK-LABEL: @test_vcvtq_f32_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VCVT_I:%.*]] = uitofp <4 x i32> %a to <4 x float>
-// CHECK: ret <4 x float> [[VCVT_I]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vcvtq_f32_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVT_I:%.*]] = uitofp <4 x i32> [[A]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[VCVT_I]]
+//
float32x4_t test_vcvtq_f32_u32(uint32x4_t a) {
return vcvtq_f32_u32(a);
}
-// CHECK-LABEL: @test_vcvtq_f64_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VCVT_I:%.*]] = sitofp <2 x i64> %a to <2 x double>
-// CHECK: ret <2 x double> [[VCVT_I]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vcvtq_f64_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVT_I:%.*]] = sitofp <2 x i64> [[A]] to <2 x double>
+// CHECK-NEXT: ret <2 x double> [[VCVT_I]]
+//
float64x2_t test_vcvtq_f64_s64(int64x2_t a) {
return vcvtq_f64_s64(a);
}
-// CHECK-LABEL: @test_vcvtq_f64_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VCVT_I:%.*]] = uitofp <2 x i64> %a to <2 x double>
-// CHECK: ret <2 x double> [[VCVT_I]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vcvtq_f64_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVT_I:%.*]] = uitofp <2 x i64> [[A]] to <2 x double>
+// CHECK-NEXT: ret <2 x double> [[VCVT_I]]
+//
float64x2_t test_vcvtq_f64_u64(uint64x2_t a) {
return vcvtq_f64_u64(a);
}
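A note on what the regenerated assertions encode. The hunks above no longer check the dead [[TMP0]] bitcast temporaries and instead reference the named arguments ([[A]], [[B]]) directly; this matches the switch to the mem2reg,instcombine opt pipeline, visible in the neon-perm.c RUN line below (the corresponding RUN-line change for the file above falls outside this excerpt, but the simplified IR suggests the same update). In neon-perm.c, the vuzp1/vuzp2 checks expect shufflevector masks selecting the even/odd lanes of the concatenation of a and b, and the vzip1/vzip2 checks expect interleaves of the low/high halves. Among the conversion intrinsics above, fcvtas/fcvtau round to nearest with ties away from zero, while fcvtmu rounds toward minus infinity. A minimal standalone sketch of the ties-away behaviour, separate from the patch itself, assuming an AArch64 target with <arm_neon.h> (the file name and compile command here are hypothetical):

  /* illustration.c: not part of this patch, illustration only. */
  #include <arm_neon.h>
  #include <stdio.h>

  int main(void) {
    float32x2_t a = {0.5f, -1.5f};
    /* vcvta_s32_f32 lowers to llvm.aarch64.neon.fcvtas, the exact
       intrinsic call the regenerated checks above expect. */
    int32x2_t r = vcvta_s32_f32(a); /* lanes become {1, -2} */
    printf("%d %d\n", vget_lane_s32(r, 0), vget_lane_s32(r, 1));
    return 0;
  }

Compiled for AArch64 (e.g. clang --target=aarch64-linux-gnu illustration.c), this prints "1 -2": 0.5 rounds away from zero to 1 and -1.5 to -2, which is what distinguishes the fcvtas/fcvtau conversions from the toward-minus-infinity fcvtm variants checked earlier.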
diff --git a/clang/test/CodeGen/AArch64/neon-perm.c b/clang/test/CodeGen/AArch64/neon-perm.c
index 1ffbbd5d9bc42f..7b2e36c31bfaeb 100644
--- a/clang/test/CodeGen/AArch64/neon-perm.c
+++ b/clang/test/CodeGen/AArch64/neon-perm.c
@@ -1,1932 +1,2023 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
-// RUN: -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine | FileCheck %s
// REQUIRES: aarch64-registered-target || arm-registered-target
#include <arm_neon.h>
-// CHECK-LABEL: @test_vuzp1_s8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vuzp1_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
int8x8_t test_vuzp1_s8(int8x8_t a, int8x8_t b) {
return vuzp1_s8(a, b);
}
-// CHECK-LABEL: @test_vuzp1q_s8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vuzp1q_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
int8x16_t test_vuzp1q_s8(int8x16_t a, int8x16_t b) {
return vuzp1q_s8(a, b);
}
-// CHECK-LABEL: @test_vuzp1_s16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-// CHECK: ret <4 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vuzp1_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
+//
int16x4_t test_vuzp1_s16(int16x4_t a, int16x4_t b) {
return vuzp1_s16(a, b);
}
-// CHECK-LABEL: @test_vuzp1q_s16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vuzp1q_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
int16x8_t test_vuzp1q_s16(int16x8_t a, int16x8_t b) {
return vuzp1q_s16(a, b);
}
-// CHECK-LABEL: @test_vuzp1_s32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
-// CHECK: ret <2 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vuzp1_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> <i32 0, i32 2>
+// CHECK-NEXT: ret <2 x i32> [[SHUFFLE_I]]
+//
int32x2_t test_vuzp1_s32(int32x2_t a, int32x2_t b) {
return vuzp1_s32(a, b);
}
-// CHECK-LABEL: @test_vuzp1q_s32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-// CHECK: ret <4 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vuzp1q_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]]
+//
int32x4_t test_vuzp1q_s32(int32x4_t a, int32x4_t b) {
return vuzp1q_s32(a, b);
}
-// CHECK-LABEL: @test_vuzp1q_s64(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
-// CHECK: ret <2 x i64> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vuzp1q_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> <i32 0, i32 2>
+// CHECK-NEXT: ret <2 x i64> [[SHUFFLE_I]]
+//
int64x2_t test_vuzp1q_s64(int64x2_t a, int64x2_t b) {
return vuzp1q_s64(a, b);
}
-// CHECK-LABEL: @test_vuzp1_u8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vuzp1_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
uint8x8_t test_vuzp1_u8(uint8x8_t a, uint8x8_t b) {
return vuzp1_u8(a, b);
}
-// CHECK-LABEL: @test_vuzp1q_u8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vuzp1q_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
uint8x16_t test_vuzp1q_u8(uint8x16_t a, uint8x16_t b) {
return vuzp1q_u8(a, b);
}
-// CHECK-LABEL: @test_vuzp1_u16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-// CHECK: ret <4 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vuzp1_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
+//
uint16x4_t test_vuzp1_u16(uint16x4_t a, uint16x4_t b) {
return vuzp1_u16(a, b);
}
-// CHECK-LABEL: @test_vuzp1q_u16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vuzp1q_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
uint16x8_t test_vuzp1q_u16(uint16x8_t a, uint16x8_t b) {
return vuzp1q_u16(a, b);
}
-// CHECK-LABEL: @test_vuzp1_u32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
-// CHECK: ret <2 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vuzp1_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> <i32 0, i32 2>
+// CHECK-NEXT: ret <2 x i32> [[SHUFFLE_I]]
+//
uint32x2_t test_vuzp1_u32(uint32x2_t a, uint32x2_t b) {
return vuzp1_u32(a, b);
}
-// CHECK-LABEL: @test_vuzp1q_u32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-// CHECK: ret <4 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vuzp1q_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]]
+//
uint32x4_t test_vuzp1q_u32(uint32x4_t a, uint32x4_t b) {
return vuzp1q_u32(a, b);
}
-// CHECK-LABEL: @test_vuzp1q_u64(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
-// CHECK: ret <2 x i64> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vuzp1q_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> <i32 0, i32 2>
+// CHECK-NEXT: ret <2 x i64> [[SHUFFLE_I]]
+//
uint64x2_t test_vuzp1q_u64(uint64x2_t a, uint64x2_t b) {
return vuzp1q_u64(a, b);
}
-// CHECK-LABEL: @test_vuzp1_f32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 2>
-// CHECK: ret <2 x float> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vuzp1_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[B]], <2 x i32> <i32 0, i32 2>
+// CHECK-NEXT: ret <2 x float> [[SHUFFLE_I]]
+//
float32x2_t test_vuzp1_f32(float32x2_t a, float32x2_t b) {
return vuzp1_f32(a, b);
}
-// CHECK-LABEL: @test_vuzp1q_f32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-// CHECK: ret <4 x float> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vuzp1q_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK-NEXT: ret <4 x float> [[SHUFFLE_I]]
+//
float32x4_t test_vuzp1q_f32(float32x4_t a, float32x4_t b) {
return vuzp1q_f32(a, b);
}
-// CHECK-LABEL: @test_vuzp1q_f64(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 2>
-// CHECK: ret <2 x double> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vuzp1q_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> <i32 0, i32 2>
+// CHECK-NEXT: ret <2 x double> [[SHUFFLE_I]]
+//
float64x2_t test_vuzp1q_f64(float64x2_t a, float64x2_t b) {
return vuzp1q_f64(a, b);
}
-// CHECK-LABEL: @test_vuzp1_p8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vuzp1_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
poly8x8_t test_vuzp1_p8(poly8x8_t a, poly8x8_t b) {
return vuzp1_p8(a, b);
}
-// CHECK-LABEL: @test_vuzp1q_p8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vuzp1q_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
poly8x16_t test_vuzp1q_p8(poly8x16_t a, poly8x16_t b) {
return vuzp1q_p8(a, b);
}
-// CHECK-LABEL: @test_vuzp1_p16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-// CHECK: ret <4 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vuzp1_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
+//
poly16x4_t test_vuzp1_p16(poly16x4_t a, poly16x4_t b) {
return vuzp1_p16(a, b);
}
-// CHECK-LABEL: @test_vuzp1q_p16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vuzp1q_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
poly16x8_t test_vuzp1q_p16(poly16x8_t a, poly16x8_t b) {
return vuzp1q_p16(a, b);
}
-// CHECK-LABEL: @test_vuzp2_s8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vuzp2_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
int8x8_t test_vuzp2_s8(int8x8_t a, int8x8_t b) {
return vuzp2_s8(a, b);
}
-// CHECK-LABEL: @test_vuzp2q_s8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vuzp2q_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
int8x16_t test_vuzp2q_s8(int8x16_t a, int8x16_t b) {
return vuzp2q_s8(a, b);
}
-// CHECK-LABEL: @test_vuzp2_s16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-// CHECK: ret <4 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vuzp2_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
+//
int16x4_t test_vuzp2_s16(int16x4_t a, int16x4_t b) {
return vuzp2_s16(a, b);
}
-// CHECK-LABEL: @test_vuzp2q_s16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vuzp2q_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
int16x8_t test_vuzp2q_s16(int16x8_t a, int16x8_t b) {
return vuzp2q_s16(a, b);
}
-// CHECK-LABEL: @test_vuzp2_s32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
-// CHECK: ret <2 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vuzp2_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> <i32 1, i32 3>
+// CHECK-NEXT: ret <2 x i32> [[SHUFFLE_I]]
+//
int32x2_t test_vuzp2_s32(int32x2_t a, int32x2_t b) {
return vuzp2_s32(a, b);
}
-// CHECK-LABEL: @test_vuzp2q_s32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-// CHECK: ret <4 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vuzp2q_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]]
+//
int32x4_t test_vuzp2q_s32(int32x4_t a, int32x4_t b) {
return vuzp2q_s32(a, b);
}
-// CHECK-LABEL: @test_vuzp2q_s64(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
-// CHECK: ret <2 x i64> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vuzp2q_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> <i32 1, i32 3>
+// CHECK-NEXT: ret <2 x i64> [[SHUFFLE_I]]
+//
int64x2_t test_vuzp2q_s64(int64x2_t a, int64x2_t b) {
return vuzp2q_s64(a, b);
}
-// CHECK-LABEL: @test_vuzp2_u8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vuzp2_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
uint8x8_t test_vuzp2_u8(uint8x8_t a, uint8x8_t b) {
return vuzp2_u8(a, b);
}
-// CHECK-LABEL: @test_vuzp2q_u8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vuzp2q_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
uint8x16_t test_vuzp2q_u8(uint8x16_t a, uint8x16_t b) {
return vuzp2q_u8(a, b);
}
-// CHECK-LABEL: @test_vuzp2_u16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-// CHECK: ret <4 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vuzp2_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
+//
uint16x4_t test_vuzp2_u16(uint16x4_t a, uint16x4_t b) {
return vuzp2_u16(a, b);
}
-// CHECK-LABEL: @test_vuzp2q_u16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vuzp2q_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
uint16x8_t test_vuzp2q_u16(uint16x8_t a, uint16x8_t b) {
return vuzp2q_u16(a, b);
}
-// CHECK-LABEL: @test_vuzp2_u32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
-// CHECK: ret <2 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vuzp2_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> <i32 1, i32 3>
+// CHECK-NEXT: ret <2 x i32> [[SHUFFLE_I]]
+//
uint32x2_t test_vuzp2_u32(uint32x2_t a, uint32x2_t b) {
return vuzp2_u32(a, b);
}
-// CHECK-LABEL: @test_vuzp2q_u32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-// CHECK: ret <4 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vuzp2q_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]]
+//
uint32x4_t test_vuzp2q_u32(uint32x4_t a, uint32x4_t b) {
return vuzp2q_u32(a, b);
}
-// CHECK-LABEL: @test_vuzp2q_u64(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
-// CHECK: ret <2 x i64> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vuzp2q_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> <i32 1, i32 3>
+// CHECK-NEXT: ret <2 x i64> [[SHUFFLE_I]]
+//
uint64x2_t test_vuzp2q_u64(uint64x2_t a, uint64x2_t b) {
return vuzp2q_u64(a, b);
}
-// CHECK-LABEL: @test_vuzp2_f32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3>
-// CHECK: ret <2 x float> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vuzp2_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[B]], <2 x i32> <i32 1, i32 3>
+// CHECK-NEXT: ret <2 x float> [[SHUFFLE_I]]
+//
float32x2_t test_vuzp2_f32(float32x2_t a, float32x2_t b) {
return vuzp2_f32(a, b);
}
-// CHECK-LABEL: @test_vuzp2q_f32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-// CHECK: ret <4 x float> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vuzp2q_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK-NEXT: ret <4 x float> [[SHUFFLE_I]]
+//
float32x4_t test_vuzp2q_f32(float32x4_t a, float32x4_t b) {
return vuzp2q_f32(a, b);
}
-// CHECK-LABEL: @test_vuzp2q_f64(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 3>
-// CHECK: ret <2 x double> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vuzp2q_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> <i32 1, i32 3>
+// CHECK-NEXT: ret <2 x double> [[SHUFFLE_I]]
+//
float64x2_t test_vuzp2q_f64(float64x2_t a, float64x2_t b) {
return vuzp2q_f64(a, b);
}
-// CHECK-LABEL: @test_vuzp2_p8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vuzp2_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
poly8x8_t test_vuzp2_p8(poly8x8_t a, poly8x8_t b) {
return vuzp2_p8(a, b);
}
-// CHECK-LABEL: @test_vuzp2q_p8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vuzp2q_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
poly8x16_t test_vuzp2q_p8(poly8x16_t a, poly8x16_t b) {
return vuzp2q_p8(a, b);
}
-// CHECK-LABEL: @test_vuzp2_p16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-// CHECK: ret <4 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vuzp2_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
+//
poly16x4_t test_vuzp2_p16(poly16x4_t a, poly16x4_t b) {
return vuzp2_p16(a, b);
}
-// CHECK-LABEL: @test_vuzp2q_p16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vuzp2q_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
poly16x8_t test_vuzp2q_p16(poly16x8_t a, poly16x8_t b) {
return vuzp2q_p16(a, b);
}
-// CHECK-LABEL: @test_vzip1_s8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vzip1_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
int8x8_t test_vzip1_s8(int8x8_t a, int8x8_t b) {
return vzip1_s8(a, b);
}
-// CHECK-LABEL: @test_vzip1q_s8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vzip1q_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
int8x16_t test_vzip1q_s8(int8x16_t a, int8x16_t b) {
return vzip1q_s8(a, b);
}
-// CHECK-LABEL: @test_vzip1_s16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
-// CHECK: ret <4 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vzip1_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
+//
int16x4_t test_vzip1_s16(int16x4_t a, int16x4_t b) {
return vzip1_s16(a, b);
}
-// CHECK-LABEL: @test_vzip1q_s16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vzip1q_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
int16x8_t test_vzip1q_s16(int16x8_t a, int16x8_t b) {
return vzip1q_s16(a, b);
}
-// CHECK-LABEL: @test_vzip1_s32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
-// CHECK: ret <2 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vzip1_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> <i32 0, i32 2>
+// CHECK-NEXT: ret <2 x i32> [[SHUFFLE_I]]
+//
int32x2_t test_vzip1_s32(int32x2_t a, int32x2_t b) {
return vzip1_s32(a, b);
}
-// CHECK-LABEL: @test_vzip1q_s32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
-// CHECK: ret <4 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vzip1q_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]]
+//
int32x4_t test_vzip1q_s32(int32x4_t a, int32x4_t b) {
return vzip1q_s32(a, b);
}
-// CHECK-LABEL: @test_vzip1q_s64(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
-// CHECK: ret <2 x i64> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vzip1q_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> <i32 0, i32 2>
+// CHECK-NEXT: ret <2 x i64> [[SHUFFLE_I]]
+//
int64x2_t test_vzip1q_s64(int64x2_t a, int64x2_t b) {
return vzip1q_s64(a, b);
}
-// CHECK-LABEL: @test_vzip1_u8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vzip1_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
uint8x8_t test_vzip1_u8(uint8x8_t a, uint8x8_t b) {
return vzip1_u8(a, b);
}
-// CHECK-LABEL: @test_vzip1q_u8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vzip1q_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
uint8x16_t test_vzip1q_u8(uint8x16_t a, uint8x16_t b) {
return vzip1q_u8(a, b);
}
-// CHECK-LABEL: @test_vzip1_u16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
-// CHECK: ret <4 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vzip1_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
+//
uint16x4_t test_vzip1_u16(uint16x4_t a, uint16x4_t b) {
return vzip1_u16(a, b);
}
-// CHECK-LABEL: @test_vzip1q_u16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vzip1q_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
uint16x8_t test_vzip1q_u16(uint16x8_t a, uint16x8_t b) {
return vzip1q_u16(a, b);
}
-// CHECK-LABEL: @test_vzip1_u32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
-// CHECK: ret <2 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vzip1_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> <i32 0, i32 2>
+// CHECK-NEXT: ret <2 x i32> [[SHUFFLE_I]]
+//
uint32x2_t test_vzip1_u32(uint32x2_t a, uint32x2_t b) {
return vzip1_u32(a, b);
}
-// CHECK-LABEL: @test_vzip1q_u32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
-// CHECK: ret <4 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vzip1q_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]]
+//
uint32x4_t test_vzip1q_u32(uint32x4_t a, uint32x4_t b) {
return vzip1q_u32(a, b);
}
-// CHECK-LABEL: @test_vzip1q_u64(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
-// CHECK: ret <2 x i64> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vzip1q_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> <i32 0, i32 2>
+// CHECK-NEXT: ret <2 x i64> [[SHUFFLE_I]]
+//
uint64x2_t test_vzip1q_u64(uint64x2_t a, uint64x2_t b) {
return vzip1q_u64(a, b);
}
-// CHECK-LABEL: @test_vzip1_f32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 2>
-// CHECK: ret <2 x float> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vzip1_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[B]], <2 x i32> <i32 0, i32 2>
+// CHECK-NEXT: ret <2 x float> [[SHUFFLE_I]]
+//
float32x2_t test_vzip1_f32(float32x2_t a, float32x2_t b) {
return vzip1_f32(a, b);
}
-// CHECK-LABEL: @test_vzip1q_f32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
-// CHECK: ret <4 x float> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vzip1q_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK-NEXT: ret <4 x float> [[SHUFFLE_I]]
+//
float32x4_t test_vzip1q_f32(float32x4_t a, float32x4_t b) {
return vzip1q_f32(a, b);
}
-// CHECK-LABEL: @test_vzip1q_f64(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 2>
-// CHECK: ret <2 x double> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vzip1q_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> <i32 0, i32 2>
+// CHECK-NEXT: ret <2 x double> [[SHUFFLE_I]]
+//
float64x2_t test_vzip1q_f64(float64x2_t a, float64x2_t b) {
return vzip1q_f64(a, b);
}
-// CHECK-LABEL: @test_vzip1_p8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vzip1_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
poly8x8_t test_vzip1_p8(poly8x8_t a, poly8x8_t b) {
return vzip1_p8(a, b);
}
-// CHECK-LABEL: @test_vzip1q_p8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vzip1q_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
poly8x16_t test_vzip1q_p8(poly8x16_t a, poly8x16_t b) {
return vzip1q_p8(a, b);
}
-// CHECK-LABEL: @test_vzip1_p16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
-// CHECK: ret <4 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vzip1_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
+//
poly16x4_t test_vzip1_p16(poly16x4_t a, poly16x4_t b) {
return vzip1_p16(a, b);
}
-// CHECK-LABEL: @test_vzip1q_p16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vzip1q_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
poly16x8_t test_vzip1q_p16(poly16x8_t a, poly16x8_t b) {
return vzip1q_p16(a, b);
}
-// CHECK-LABEL: @test_vzip2_s8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vzip2_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
int8x8_t test_vzip2_s8(int8x8_t a, int8x8_t b) {
return vzip2_s8(a, b);
}
-// CHECK-LABEL: @test_vzip2q_s8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vzip2q_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
int8x16_t test_vzip2q_s8(int8x16_t a, int8x16_t b) {
return vzip2q_s8(a, b);
}
-// CHECK-LABEL: @test_vzip2_s16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
-// CHECK: ret <4 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vzip2_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
+//
int16x4_t test_vzip2_s16(int16x4_t a, int16x4_t b) {
return vzip2_s16(a, b);
}
-// CHECK-LABEL: @test_vzip2q_s16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vzip2q_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
int16x8_t test_vzip2q_s16(int16x8_t a, int16x8_t b) {
return vzip2q_s16(a, b);
}
-// CHECK-LABEL: @test_vzip2_s32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
-// CHECK: ret <2 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vzip2_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> <i32 1, i32 3>
+// CHECK-NEXT: ret <2 x i32> [[SHUFFLE_I]]
+//
int32x2_t test_vzip2_s32(int32x2_t a, int32x2_t b) {
return vzip2_s32(a, b);
}
-// CHECK-LABEL: @test_vzip2q_s32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
-// CHECK: ret <4 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vzip2q_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]]
+//
int32x4_t test_vzip2q_s32(int32x4_t a, int32x4_t b) {
return vzip2q_s32(a, b);
}
-// CHECK-LABEL: @test_vzip2q_s64(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
-// CHECK: ret <2 x i64> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vzip2q_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> <i32 1, i32 3>
+// CHECK-NEXT: ret <2 x i64> [[SHUFFLE_I]]
+//
int64x2_t test_vzip2q_s64(int64x2_t a, int64x2_t b) {
return vzip2q_s64(a, b);
}
-// CHECK-LABEL: @test_vzip2_u8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vzip2_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
uint8x8_t test_vzip2_u8(uint8x8_t a, uint8x8_t b) {
return vzip2_u8(a, b);
}
-// CHECK-LABEL: @test_vzip2q_u8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vzip2q_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
uint8x16_t test_vzip2q_u8(uint8x16_t a, uint8x16_t b) {
return vzip2q_u8(a, b);
}
-// CHECK-LABEL: @test_vzip2_u16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
-// CHECK: ret <4 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vzip2_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
+//
uint16x4_t test_vzip2_u16(uint16x4_t a, uint16x4_t b) {
return vzip2_u16(a, b);
}
-// CHECK-LABEL: @test_vzip2q_u16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vzip2q_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
uint16x8_t test_vzip2q_u16(uint16x8_t a, uint16x8_t b) {
return vzip2q_u16(a, b);
}
-// CHECK-LABEL: @test_vzip2_u32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
-// CHECK: ret <2 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vzip2_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> <i32 1, i32 3>
+// CHECK-NEXT: ret <2 x i32> [[SHUFFLE_I]]
+//
uint32x2_t test_vzip2_u32(uint32x2_t a, uint32x2_t b) {
return vzip2_u32(a, b);
}
-// CHECK-LABEL: @test_vzip2q_u32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
-// CHECK: ret <4 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vzip2q_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]]
+//
uint32x4_t test_vzip2q_u32(uint32x4_t a, uint32x4_t b) {
return vzip2q_u32(a, b);
}
-// CHECK-LABEL: @test_vzip2q_u64(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
-// CHECK: ret <2 x i64> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vzip2q_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> <i32 1, i32 3>
+// CHECK-NEXT: ret <2 x i64> [[SHUFFLE_I]]
+//
uint64x2_t test_vzip2q_u64(uint64x2_t a, uint64x2_t b) {
return vzip2q_u64(a, b);
}
-// CHECK-LABEL: @test_vzip2_f32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3>
-// CHECK: ret <2 x float> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vzip2_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[B]], <2 x i32> <i32 1, i32 3>
+// CHECK-NEXT: ret <2 x float> [[SHUFFLE_I]]
+//
float32x2_t test_vzip2_f32(float32x2_t a, float32x2_t b) {
return vzip2_f32(a, b);
}
-// CHECK-LABEL: @test_vzip2q_f32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
-// CHECK: ret <4 x float> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vzip2q_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK-NEXT: ret <4 x float> [[SHUFFLE_I]]
+//
float32x4_t test_vzip2q_f32(float32x4_t a, float32x4_t b) {
return vzip2q_f32(a, b);
}
-// CHECK-LABEL: @test_vzip2q_f64(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 3>
-// CHECK: ret <2 x double> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vzip2q_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> <i32 1, i32 3>
+// CHECK-NEXT: ret <2 x double> [[SHUFFLE_I]]
+//
float64x2_t test_vzip2q_f64(float64x2_t a, float64x2_t b) {
return vzip2q_f64(a, b);
}
-// CHECK-LABEL: @test_vzip2_p8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vzip2_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
poly8x8_t test_vzip2_p8(poly8x8_t a, poly8x8_t b) {
return vzip2_p8(a, b);
}
-// CHECK-LABEL: @test_vzip2q_p8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vzip2q_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
poly8x16_t test_vzip2q_p8(poly8x16_t a, poly8x16_t b) {
return vzip2q_p8(a, b);
}
-// CHECK-LABEL: @test_vzip2_p16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
-// CHECK: ret <4 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vzip2_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
+//
poly16x4_t test_vzip2_p16(poly16x4_t a, poly16x4_t b) {
return vzip2_p16(a, b);
}
-// CHECK-LABEL: @test_vzip2q_p16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vzip2q_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
poly16x8_t test_vzip2q_p16(poly16x8_t a, poly16x8_t b) {
return vzip2q_p16(a, b);
}
-// CHECK-LABEL: @test_vtrn1_s8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vtrn1_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
int8x8_t test_vtrn1_s8(int8x8_t a, int8x8_t b) {
return vtrn1_s8(a, b);
}
-// CHECK-LABEL: @test_vtrn1q_s8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vtrn1q_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
int8x16_t test_vtrn1q_s8(int8x16_t a, int8x16_t b) {
return vtrn1q_s8(a, b);
}
-// CHECK-LABEL: @test_vtrn1_s16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-// CHECK: ret <4 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vtrn1_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
+//
int16x4_t test_vtrn1_s16(int16x4_t a, int16x4_t b) {
return vtrn1_s16(a, b);
}
-// CHECK-LABEL: @test_vtrn1q_s16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vtrn1q_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
int16x8_t test_vtrn1q_s16(int16x8_t a, int16x8_t b) {
return vtrn1q_s16(a, b);
}
-// CHECK-LABEL: @test_vtrn1_s32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
-// CHECK: ret <2 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vtrn1_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> <i32 0, i32 2>
+// CHECK-NEXT: ret <2 x i32> [[SHUFFLE_I]]
+//
int32x2_t test_vtrn1_s32(int32x2_t a, int32x2_t b) {
return vtrn1_s32(a, b);
}
-// CHECK-LABEL: @test_vtrn1q_s32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-// CHECK: ret <4 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vtrn1q_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]]
+//
int32x4_t test_vtrn1q_s32(int32x4_t a, int32x4_t b) {
return vtrn1q_s32(a, b);
}
-// CHECK-LABEL: @test_vtrn1q_s64(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
-// CHECK: ret <2 x i64> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vtrn1q_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> <i32 0, i32 2>
+// CHECK-NEXT: ret <2 x i64> [[SHUFFLE_I]]
+//
int64x2_t test_vtrn1q_s64(int64x2_t a, int64x2_t b) {
return vtrn1q_s64(a, b);
}
-// CHECK-LABEL: @test_vtrn1_u8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vtrn1_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
uint8x8_t test_vtrn1_u8(uint8x8_t a, uint8x8_t b) {
return vtrn1_u8(a, b);
}
-// CHECK-LABEL: @test_vtrn1q_u8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vtrn1q_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
uint8x16_t test_vtrn1q_u8(uint8x16_t a, uint8x16_t b) {
return vtrn1q_u8(a, b);
}
-// CHECK-LABEL: @test_vtrn1_u16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-// CHECK: ret <4 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vtrn1_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
+//
uint16x4_t test_vtrn1_u16(uint16x4_t a, uint16x4_t b) {
return vtrn1_u16(a, b);
}
-// CHECK-LABEL: @test_vtrn1q_u16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vtrn1q_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
uint16x8_t test_vtrn1q_u16(uint16x8_t a, uint16x8_t b) {
return vtrn1q_u16(a, b);
}
-// CHECK-LABEL: @test_vtrn1_u32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
-// CHECK: ret <2 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vtrn1_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> <i32 0, i32 2>
+// CHECK-NEXT: ret <2 x i32> [[SHUFFLE_I]]
+//
uint32x2_t test_vtrn1_u32(uint32x2_t a, uint32x2_t b) {
return vtrn1_u32(a, b);
}
-// CHECK-LABEL: @test_vtrn1q_u32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-// CHECK: ret <4 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vtrn1q_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]]
+//
uint32x4_t test_vtrn1q_u32(uint32x4_t a, uint32x4_t b) {
return vtrn1q_u32(a, b);
}
-// CHECK-LABEL: @test_vtrn1q_u64(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
-// CHECK: ret <2 x i64> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vtrn1q_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> <i32 0, i32 2>
+// CHECK-NEXT: ret <2 x i64> [[SHUFFLE_I]]
+//
uint64x2_t test_vtrn1q_u64(uint64x2_t a, uint64x2_t b) {
return vtrn1q_u64(a, b);
}
-// CHECK-LABEL: @test_vtrn1_f32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 2>
-// CHECK: ret <2 x float> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vtrn1_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[B]], <2 x i32> <i32 0, i32 2>
+// CHECK-NEXT: ret <2 x float> [[SHUFFLE_I]]
+//
float32x2_t test_vtrn1_f32(float32x2_t a, float32x2_t b) {
return vtrn1_f32(a, b);
}
-// CHECK-LABEL: @test_vtrn1q_f32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-// CHECK: ret <4 x float> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vtrn1q_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK-NEXT: ret <4 x float> [[SHUFFLE_I]]
+//
float32x4_t test_vtrn1q_f32(float32x4_t a, float32x4_t b) {
return vtrn1q_f32(a, b);
}
-// CHECK-LABEL: @test_vtrn1q_f64(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 2>
-// CHECK: ret <2 x double> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vtrn1q_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> <i32 0, i32 2>
+// CHECK-NEXT: ret <2 x double> [[SHUFFLE_I]]
+//
float64x2_t test_vtrn1q_f64(float64x2_t a, float64x2_t b) {
return vtrn1q_f64(a, b);
}
-// CHECK-LABEL: @test_vtrn1_p8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vtrn1_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
poly8x8_t test_vtrn1_p8(poly8x8_t a, poly8x8_t b) {
return vtrn1_p8(a, b);
}
-// CHECK-LABEL: @test_vtrn1q_p8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vtrn1q_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
poly8x16_t test_vtrn1q_p8(poly8x16_t a, poly8x16_t b) {
return vtrn1q_p8(a, b);
}
-// CHECK-LABEL: @test_vtrn1_p16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-// CHECK: ret <4 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vtrn1_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
+//
poly16x4_t test_vtrn1_p16(poly16x4_t a, poly16x4_t b) {
return vtrn1_p16(a, b);
}
-// CHECK-LABEL: @test_vtrn1q_p16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vtrn1q_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
poly16x8_t test_vtrn1q_p16(poly16x8_t a, poly16x8_t b) {
return vtrn1q_p16(a, b);
}
-// CHECK-LABEL: @test_vtrn2_s8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vtrn2_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
int8x8_t test_vtrn2_s8(int8x8_t a, int8x8_t b) {
return vtrn2_s8(a, b);
}
-// CHECK-LABEL: @test_vtrn2q_s8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vtrn2q_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
int8x16_t test_vtrn2q_s8(int8x16_t a, int8x16_t b) {
return vtrn2q_s8(a, b);
}
-// CHECK-LABEL: @test_vtrn2_s16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-// CHECK: ret <4 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vtrn2_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
+//
int16x4_t test_vtrn2_s16(int16x4_t a, int16x4_t b) {
return vtrn2_s16(a, b);
}
-// CHECK-LABEL: @test_vtrn2q_s16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vtrn2q_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
int16x8_t test_vtrn2q_s16(int16x8_t a, int16x8_t b) {
return vtrn2q_s16(a, b);
}
-// CHECK-LABEL: @test_vtrn2_s32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
-// CHECK: ret <2 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vtrn2_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> <i32 1, i32 3>
+// CHECK-NEXT: ret <2 x i32> [[SHUFFLE_I]]
+//
int32x2_t test_vtrn2_s32(int32x2_t a, int32x2_t b) {
return vtrn2_s32(a, b);
}
-// CHECK-LABEL: @test_vtrn2q_s32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-// CHECK: ret <4 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vtrn2q_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]]
+//
int32x4_t test_vtrn2q_s32(int32x4_t a, int32x4_t b) {
return vtrn2q_s32(a, b);
}
-// CHECK-LABEL: @test_vtrn2q_s64(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
-// CHECK: ret <2 x i64> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vtrn2q_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> <i32 1, i32 3>
+// CHECK-NEXT: ret <2 x i64> [[SHUFFLE_I]]
+//
int64x2_t test_vtrn2q_s64(int64x2_t a, int64x2_t b) {
return vtrn2q_s64(a, b);
}
-// CHECK-LABEL: @test_vtrn2_u8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vtrn2_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
uint8x8_t test_vtrn2_u8(uint8x8_t a, uint8x8_t b) {
return vtrn2_u8(a, b);
}
-// CHECK-LABEL: @test_vtrn2q_u8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vtrn2q_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
uint8x16_t test_vtrn2q_u8(uint8x16_t a, uint8x16_t b) {
return vtrn2q_u8(a, b);
}
-// CHECK-LABEL: @test_vtrn2_u16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-// CHECK: ret <4 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vtrn2_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
+//
uint16x4_t test_vtrn2_u16(uint16x4_t a, uint16x4_t b) {
return vtrn2_u16(a, b);
}
-// CHECK-LABEL: @test_vtrn2q_u16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vtrn2q_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
uint16x8_t test_vtrn2q_u16(uint16x8_t a, uint16x8_t b) {
return vtrn2q_u16(a, b);
}
-// CHECK-LABEL: @test_vtrn2_u32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
-// CHECK: ret <2 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vtrn2_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> <i32 1, i32 3>
+// CHECK-NEXT: ret <2 x i32> [[SHUFFLE_I]]
+//
uint32x2_t test_vtrn2_u32(uint32x2_t a, uint32x2_t b) {
return vtrn2_u32(a, b);
}
-// CHECK-LABEL: @test_vtrn2q_u32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-// CHECK: ret <4 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vtrn2q_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]]
+//
uint32x4_t test_vtrn2q_u32(uint32x4_t a, uint32x4_t b) {
return vtrn2q_u32(a, b);
}
-// CHECK-LABEL: @test_vtrn2q_u64(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
-// CHECK: ret <2 x i64> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vtrn2q_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> <i32 1, i32 3>
+// CHECK-NEXT: ret <2 x i64> [[SHUFFLE_I]]
+//
uint64x2_t test_vtrn2q_u64(uint64x2_t a, uint64x2_t b) {
return vtrn2q_u64(a, b);
}
-// CHECK-LABEL: @test_vtrn2_f32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3>
-// CHECK: ret <2 x float> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vtrn2_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[B]], <2 x i32> <i32 1, i32 3>
+// CHECK-NEXT: ret <2 x float> [[SHUFFLE_I]]
+//
float32x2_t test_vtrn2_f32(float32x2_t a, float32x2_t b) {
return vtrn2_f32(a, b);
}
-// CHECK-LABEL: @test_vtrn2q_f32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-// CHECK: ret <4 x float> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vtrn2q_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK-NEXT: ret <4 x float> [[SHUFFLE_I]]
+//
float32x4_t test_vtrn2q_f32(float32x4_t a, float32x4_t b) {
return vtrn2q_f32(a, b);
}
-// CHECK-LABEL: @test_vtrn2q_f64(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 3>
-// CHECK: ret <2 x double> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vtrn2q_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> <i32 1, i32 3>
+// CHECK-NEXT: ret <2 x double> [[SHUFFLE_I]]
+//
float64x2_t test_vtrn2q_f64(float64x2_t a, float64x2_t b) {
return vtrn2q_f64(a, b);
}
-// CHECK-LABEL: @test_vtrn2_p8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vtrn2_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
poly8x8_t test_vtrn2_p8(poly8x8_t a, poly8x8_t b) {
return vtrn2_p8(a, b);
}
-// CHECK-LABEL: @test_vtrn2q_p8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vtrn2q_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
poly8x16_t test_vtrn2q_p8(poly8x16_t a, poly8x16_t b) {
return vtrn2q_p8(a, b);
}
-// CHECK-LABEL: @test_vtrn2_p16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-// CHECK: ret <4 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vtrn2_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
+//
poly16x4_t test_vtrn2_p16(poly16x4_t a, poly16x4_t b) {
return vtrn2_p16(a, b);
}
-// CHECK-LABEL: @test_vtrn2q_p16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vtrn2q_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
poly16x8_t test_vtrn2q_p16(poly16x8_t a, poly16x8_t b) {
return vtrn2q_p16(a, b);
}
-// CHECK-LABEL: @test_vuzp_s8(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-// CHECK: store <8 x i8> [[VUZP_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-// CHECK: store <8 x i8> [[VUZP1_I]], ptr [[TMP2]]
-// CHECK: [[TMP5:%.*]] = load %struct.int8x8x2_t, ptr [[RETVAL_I]], align 8
-// CHECK: [[TMP6:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP7:%.*]] = extractvalue %struct.int8x8x2_t [[TMP5]], 0
-// CHECK: store [2 x <8 x i8>] [[TMP7]], ptr [[TMP6]], align 8
-// CHECK: [[TMP8:%.*]] = load %struct.int8x8x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.int8x8x2_t [[TMP8]]
+// CHECK-LABEL: define dso_local %struct.int8x8x2_t @test_vuzp_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <8 x i8>] poison, <8 x i8> [[VUZP_I]], 0
+// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <8 x i8>] [[TMP0]], <8 x i8> [[VUZP1_I]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_INT8X8X2_T:%.*]] poison, [2 x <8 x i8>] [[DOTUNPACK8]], 0
+// CHECK-NEXT: ret [[STRUCT_INT8X8X2_T]] [[TMP1]]
+//
int8x8x2_t test_vuzp_s8(int8x8_t a, int8x8_t b) {
return vuzp_s8(a, b);
}
-// CHECK-LABEL: @test_vuzp_s16(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int16x4x2_t, align 8
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x2_t, align 8
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-// CHECK: store <4 x i16> [[VUZP_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-// CHECK: store <4 x i16> [[VUZP1_I]], ptr [[TMP4]]
-// CHECK: [[TMP7:%.*]] = load %struct.int16x4x2_t, ptr [[RETVAL_I]], align 8
-// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.int16x4x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP9:%.*]] = extractvalue %struct.int16x4x2_t [[TMP7]], 0
-// CHECK: store [2 x <4 x i16>] [[TMP9]], ptr [[TMP8]], align 8
-// CHECK: [[TMP10:%.*]] = load %struct.int16x4x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.int16x4x2_t [[TMP10]]
+// CHECK-LABEL: define dso_local %struct.int16x4x2_t @test_vuzp_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <4 x i16>] poison, <4 x i16> [[VUZP_I]], 0
+// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <4 x i16>] [[TMP0]], <4 x i16> [[VUZP1_I]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_INT16X4X2_T:%.*]] poison, [2 x <4 x i16>] [[DOTUNPACK8]], 0
+// CHECK-NEXT: ret [[STRUCT_INT16X4X2_T]] [[TMP1]]
+//
int16x4x2_t test_vuzp_s16(int16x4_t a, int16x4_t b) {
return vuzp_s16(a, b);
}
-// CHECK-LABEL: @test_vuzp_s32(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int32x2x2_t, align 8
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x2_t, align 8
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VUZP_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
-// CHECK: store <2 x i32> [[VUZP_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
-// CHECK: store <2 x i32> [[VUZP1_I]], ptr [[TMP4]]
-// CHECK: [[TMP7:%.*]] = load %struct.int32x2x2_t, ptr [[RETVAL_I]], align 8
-// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.int32x2x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP9:%.*]] = extractvalue %struct.int32x2x2_t [[TMP7]], 0
-// CHECK: store [2 x <2 x i32>] [[TMP9]], ptr [[TMP8]], align 8
-// CHECK: [[TMP10:%.*]] = load %struct.int32x2x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.int32x2x2_t [[TMP10]]
+// CHECK-LABEL: define dso_local %struct.int32x2x2_t @test_vuzp_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> <i32 0, i32 2>
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> <i32 1, i32 3>
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <2 x i32>] poison, <2 x i32> [[VUZP_I]], 0
+// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <2 x i32>] [[TMP0]], <2 x i32> [[VUZP1_I]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_INT32X2X2_T:%.*]] poison, [2 x <2 x i32>] [[DOTUNPACK8]], 0
+// CHECK-NEXT: ret [[STRUCT_INT32X2X2_T]] [[TMP1]]
+//
int32x2x2_t test_vuzp_s32(int32x2_t a, int32x2_t b) {
return vuzp_s32(a, b);
}
-// CHECK-LABEL: @test_vuzp_u8(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-// CHECK: store <8 x i8> [[VUZP_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-// CHECK: store <8 x i8> [[VUZP1_I]], ptr [[TMP2]]
-// CHECK: [[TMP5:%.*]] = load %struct.uint8x8x2_t, ptr [[RETVAL_I]], align 8
-// CHECK: [[TMP6:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP7:%.*]] = extractvalue %struct.uint8x8x2_t [[TMP5]], 0
-// CHECK: store [2 x <8 x i8>] [[TMP7]], ptr [[TMP6]], align 8
-// CHECK: [[TMP8:%.*]] = load %struct.uint8x8x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.uint8x8x2_t [[TMP8]]
+// CHECK-LABEL: define dso_local %struct.uint8x8x2_t @test_vuzp_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <8 x i8>] poison, <8 x i8> [[VUZP_I]], 0
+// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <8 x i8>] [[TMP0]], <8 x i8> [[VUZP1_I]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_UINT8X8X2_T:%.*]] poison, [2 x <8 x i8>] [[DOTUNPACK8]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT8X8X2_T]] [[TMP1]]
+//
uint8x8x2_t test_vuzp_u8(uint8x8_t a, uint8x8_t b) {
return vuzp_u8(a, b);
}
-// CHECK-LABEL: @test_vuzp_u16(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint16x4x2_t, align 8
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x2_t, align 8
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-// CHECK: store <4 x i16> [[VUZP_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-// CHECK: store <4 x i16> [[VUZP1_I]], ptr [[TMP4]]
-// CHECK: [[TMP7:%.*]] = load %struct.uint16x4x2_t, ptr [[RETVAL_I]], align 8
-// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.uint16x4x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP9:%.*]] = extractvalue %struct.uint16x4x2_t [[TMP7]], 0
-// CHECK: store [2 x <4 x i16>] [[TMP9]], ptr [[TMP8]], align 8
-// CHECK: [[TMP10:%.*]] = load %struct.uint16x4x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.uint16x4x2_t [[TMP10]]
+// CHECK-LABEL: define dso_local %struct.uint16x4x2_t @test_vuzp_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <4 x i16>] poison, <4 x i16> [[VUZP_I]], 0
+// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <4 x i16>] [[TMP0]], <4 x i16> [[VUZP1_I]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_UINT16X4X2_T:%.*]] poison, [2 x <4 x i16>] [[DOTUNPACK8]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT16X4X2_T]] [[TMP1]]
+//
uint16x4x2_t test_vuzp_u16(uint16x4_t a, uint16x4_t b) {
return vuzp_u16(a, b);
}
-// CHECK-LABEL: @test_vuzp_u32(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint32x2x2_t, align 8
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x2_t, align 8
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VUZP_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
-// CHECK: store <2 x i32> [[VUZP_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
-// CHECK: store <2 x i32> [[VUZP1_I]], ptr [[TMP4]]
-// CHECK: [[TMP7:%.*]] = load %struct.uint32x2x2_t, ptr [[RETVAL_I]], align 8
-// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.uint32x2x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP9:%.*]] = extractvalue %struct.uint32x2x2_t [[TMP7]], 0
-// CHECK: store [2 x <2 x i32>] [[TMP9]], ptr [[TMP8]], align 8
-// CHECK: [[TMP10:%.*]] = load %struct.uint32x2x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.uint32x2x2_t [[TMP10]]
+// CHECK-LABEL: define dso_local %struct.uint32x2x2_t @test_vuzp_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> <i32 0, i32 2>
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> <i32 1, i32 3>
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <2 x i32>] poison, <2 x i32> [[VUZP_I]], 0
+// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <2 x i32>] [[TMP0]], <2 x i32> [[VUZP1_I]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_UINT32X2X2_T:%.*]] poison, [2 x <2 x i32>] [[DOTUNPACK8]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT32X2X2_T]] [[TMP1]]
+//
uint32x2x2_t test_vuzp_u32(uint32x2_t a, uint32x2_t b) {
return vuzp_u32(a, b);
}
-// CHECK-LABEL: @test_vuzp_f32(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.float32x2x2_t, align 8
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x2_t, align 8
-// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %b to <8 x i8>
-// CHECK: [[VUZP_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 2>
-// CHECK: store <2 x float> [[VUZP_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x float>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3>
-// CHECK: store <2 x float> [[VUZP1_I]], ptr [[TMP4]]
-// CHECK: [[TMP7:%.*]] = load %struct.float32x2x2_t, ptr [[RETVAL_I]], align 8
-// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.float32x2x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP9:%.*]] = extractvalue %struct.float32x2x2_t [[TMP7]], 0
-// CHECK: store [2 x <2 x float>] [[TMP9]], ptr [[TMP8]], align 8
-// CHECK: [[TMP10:%.*]] = load %struct.float32x2x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.float32x2x2_t [[TMP10]]
+// CHECK-LABEL: define dso_local %struct.float32x2x2_t @test_vuzp_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[B]], <2 x i32> <i32 0, i32 2>
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[B]], <2 x i32> <i32 1, i32 3>
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <2 x float>] poison, <2 x float> [[VUZP_I]], 0
+// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <2 x float>] [[TMP0]], <2 x float> [[VUZP1_I]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_FLOAT32X2X2_T:%.*]] poison, [2 x <2 x float>] [[DOTUNPACK8]], 0
+// CHECK-NEXT: ret [[STRUCT_FLOAT32X2X2_T]] [[TMP1]]
+//
float32x2x2_t test_vuzp_f32(float32x2_t a, float32x2_t b) {
return vuzp_f32(a, b);
}
-// CHECK-LABEL: @test_vuzp_p8(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-// CHECK: store <8 x i8> [[VUZP_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-// CHECK: store <8 x i8> [[VUZP1_I]], ptr [[TMP2]]
-// CHECK: [[TMP5:%.*]] = load %struct.poly8x8x2_t, ptr [[RETVAL_I]], align 8
-// CHECK: [[TMP6:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP7:%.*]] = extractvalue %struct.poly8x8x2_t [[TMP5]], 0
-// CHECK: store [2 x <8 x i8>] [[TMP7]], ptr [[TMP6]], align 8
-// CHECK: [[TMP8:%.*]] = load %struct.poly8x8x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.poly8x8x2_t [[TMP8]]
+// CHECK-LABEL: define dso_local %struct.poly8x8x2_t @test_vuzp_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <8 x i8>] poison, <8 x i8> [[VUZP_I]], 0
+// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <8 x i8>] [[TMP0]], <8 x i8> [[VUZP1_I]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_POLY8X8X2_T:%.*]] poison, [2 x <8 x i8>] [[DOTUNPACK8]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY8X8X2_T]] [[TMP1]]
+//
poly8x8x2_t test_vuzp_p8(poly8x8_t a, poly8x8_t b) {
return vuzp_p8(a, b);
}
-// CHECK-LABEL: @test_vuzp_p16(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly16x4x2_t, align 8
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x2_t, align 8
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-// CHECK: store <4 x i16> [[VUZP_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-// CHECK: store <4 x i16> [[VUZP1_I]], ptr [[TMP4]]
-// CHECK: [[TMP7:%.*]] = load %struct.poly16x4x2_t, ptr [[RETVAL_I]], align 8
-// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.poly16x4x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP9:%.*]] = extractvalue %struct.poly16x4x2_t [[TMP7]], 0
-// CHECK: store [2 x <4 x i16>] [[TMP9]], ptr [[TMP8]], align 8
-// CHECK: [[TMP10:%.*]] = load %struct.poly16x4x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.poly16x4x2_t [[TMP10]]
+// CHECK-LABEL: define dso_local %struct.poly16x4x2_t @test_vuzp_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <4 x i16>] poison, <4 x i16> [[VUZP_I]], 0
+// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <4 x i16>] [[TMP0]], <4 x i16> [[VUZP1_I]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_POLY16X4X2_T:%.*]] poison, [2 x <4 x i16>] [[DOTUNPACK8]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY16X4X2_T]] [[TMP1]]
+//
poly16x4x2_t test_vuzp_p16(poly16x4_t a, poly16x4_t b) {
return vuzp_p16(a, b);
}
-// CHECK-LABEL: @test_vuzpq_s8(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int8x16x2_t, align 16
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x2_t, align 16
-// CHECK: [[VUZP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
-// CHECK: store <16 x i8> [[VUZP_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
-// CHECK: store <16 x i8> [[VUZP1_I]], ptr [[TMP2]]
-// CHECK: [[TMP5:%.*]] = load %struct.int8x16x2_t, ptr [[RETVAL_I]], align 16
-// CHECK: [[TMP6:%.*]] = getelementptr inbounds nuw %struct.int8x16x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP7:%.*]] = extractvalue %struct.int8x16x2_t [[TMP5]], 0
-// CHECK: store [2 x <16 x i8>] [[TMP7]], ptr [[TMP6]], align 16
-// CHECK: [[TMP8:%.*]] = load %struct.int8x16x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.int8x16x2_t [[TMP8]]
+// CHECK-LABEL: define dso_local %struct.int8x16x2_t @test_vuzpq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <16 x i8>] poison, <16 x i8> [[VUZP_I]], 0
+// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <16 x i8>] [[TMP0]], <16 x i8> [[VUZP1_I]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_INT8X16X2_T:%.*]] poison, [2 x <16 x i8>] [[DOTUNPACK8]], 0
+// CHECK-NEXT: ret [[STRUCT_INT8X16X2_T]] [[TMP1]]
+//
int8x16x2_t test_vuzpq_s8(int8x16_t a, int8x16_t b) {
return vuzpq_s8(a, b);
}
-// CHECK-LABEL: @test_vuzpq_s16(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int16x8x2_t, align 16
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x2_t, align 16
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-// CHECK: store <8 x i16> [[VUZP_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-// CHECK: store <8 x i16> [[VUZP1_I]], ptr [[TMP4]]
-// CHECK: [[TMP7:%.*]] = load %struct.int16x8x2_t, ptr [[RETVAL_I]], align 16
-// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.int16x8x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP9:%.*]] = extractvalue %struct.int16x8x2_t [[TMP7]], 0
-// CHECK: store [2 x <8 x i16>] [[TMP9]], ptr [[TMP8]], align 16
-// CHECK: [[TMP10:%.*]] = load %struct.int16x8x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.int16x8x2_t [[TMP10]]
+// CHECK-LABEL: define dso_local %struct.int16x8x2_t @test_vuzpq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <8 x i16>] poison, <8 x i16> [[VUZP_I]], 0
+// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <8 x i16>] [[TMP0]], <8 x i16> [[VUZP1_I]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_INT16X8X2_T:%.*]] poison, [2 x <8 x i16>] [[DOTUNPACK8]], 0
+// CHECK-NEXT: ret [[STRUCT_INT16X8X2_T]] [[TMP1]]
+//
int16x8x2_t test_vuzpq_s16(int16x8_t a, int16x8_t b) {
return vuzpq_s16(a, b);
}
-// CHECK-LABEL: @test_vuzpq_s32(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int32x4x2_t, align 16
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x2_t, align 16
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-// CHECK: store <4 x i32> [[VUZP_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-// CHECK: store <4 x i32> [[VUZP1_I]], ptr [[TMP4]]
-// CHECK: [[TMP7:%.*]] = load %struct.int32x4x2_t, ptr [[RETVAL_I]], align 16
-// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.int32x4x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP9:%.*]] = extractvalue %struct.int32x4x2_t [[TMP7]], 0
-// CHECK: store [2 x <4 x i32>] [[TMP9]], ptr [[TMP8]], align 16
-// CHECK: [[TMP10:%.*]] = load %struct.int32x4x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.int32x4x2_t [[TMP10]]
+// CHECK-LABEL: define dso_local %struct.int32x4x2_t @test_vuzpq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <4 x i32>] poison, <4 x i32> [[VUZP_I]], 0
+// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <4 x i32>] [[TMP0]], <4 x i32> [[VUZP1_I]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_INT32X4X2_T:%.*]] poison, [2 x <4 x i32>] [[DOTUNPACK8]], 0
+// CHECK-NEXT: ret [[STRUCT_INT32X4X2_T]] [[TMP1]]
+//
int32x4x2_t test_vuzpq_s32(int32x4_t a, int32x4_t b) {
return vuzpq_s32(a, b);
}
-// CHECK-LABEL: @test_vuzpq_u8(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x16x2_t, align 16
-// CHECK: [[VUZP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
-// CHECK: store <16 x i8> [[VUZP_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
-// CHECK: store <16 x i8> [[VUZP1_I]], ptr [[TMP2]]
-// CHECK: [[TMP5:%.*]] = load %struct.uint8x16x2_t, ptr [[RETVAL_I]], align 16
-// CHECK: [[TMP6:%.*]] = getelementptr inbounds nuw %struct.uint8x16x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP7:%.*]] = extractvalue %struct.uint8x16x2_t [[TMP5]], 0
-// CHECK: store [2 x <16 x i8>] [[TMP7]], ptr [[TMP6]], align 16
-// CHECK: [[TMP8:%.*]] = load %struct.uint8x16x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.uint8x16x2_t [[TMP8]]
+// CHECK-LABEL: define dso_local %struct.uint8x16x2_t @test_vuzpq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <16 x i8>] poison, <16 x i8> [[VUZP_I]], 0
+// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <16 x i8>] [[TMP0]], <16 x i8> [[VUZP1_I]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_UINT8X16X2_T:%.*]] poison, [2 x <16 x i8>] [[DOTUNPACK8]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT8X16X2_T]] [[TMP1]]
+//
uint8x16x2_t test_vuzpq_u8(uint8x16_t a, uint8x16_t b) {
return vuzpq_u8(a, b);
}
-// CHECK-LABEL: @test_vuzpq_u16(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-// CHECK: store <8 x i16> [[VUZP_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-// CHECK: store <8 x i16> [[VUZP1_I]], ptr [[TMP4]]
-// CHECK: [[TMP7:%.*]] = load %struct.uint16x8x2_t, ptr [[RETVAL_I]], align 16
-// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.uint16x8x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP9:%.*]] = extractvalue %struct.uint16x8x2_t [[TMP7]], 0
-// CHECK: store [2 x <8 x i16>] [[TMP9]], ptr [[TMP8]], align 16
-// CHECK: [[TMP10:%.*]] = load %struct.uint16x8x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.uint16x8x2_t [[TMP10]]
+// CHECK-LABEL: define dso_local %struct.uint16x8x2_t @test_vuzpq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <8 x i16>] poison, <8 x i16> [[VUZP_I]], 0
+// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <8 x i16>] [[TMP0]], <8 x i16> [[VUZP1_I]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_UINT16X8X2_T:%.*]] poison, [2 x <8 x i16>] [[DOTUNPACK8]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT16X8X2_T]] [[TMP1]]
+//
uint16x8x2_t test_vuzpq_u16(uint16x8_t a, uint16x8_t b) {
return vuzpq_u16(a, b);
}
-// CHECK-LABEL: @test_vuzpq_u32(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-// CHECK: store <4 x i32> [[VUZP_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-// CHECK: store <4 x i32> [[VUZP1_I]], ptr [[TMP4]]
-// CHECK: [[TMP7:%.*]] = load %struct.uint32x4x2_t, ptr [[RETVAL_I]], align 16
-// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.uint32x4x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP9:%.*]] = extractvalue %struct.uint32x4x2_t [[TMP7]], 0
-// CHECK: store [2 x <4 x i32>] [[TMP9]], ptr [[TMP8]], align 16
-// CHECK: [[TMP10:%.*]] = load %struct.uint32x4x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.uint32x4x2_t [[TMP10]]
+// CHECK-LABEL: define dso_local %struct.uint32x4x2_t @test_vuzpq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <4 x i32>] poison, <4 x i32> [[VUZP_I]], 0
+// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <4 x i32>] [[TMP0]], <4 x i32> [[VUZP1_I]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_UINT32X4X2_T:%.*]] poison, [2 x <4 x i32>] [[DOTUNPACK8]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT32X4X2_T]] [[TMP1]]
+//
uint32x4x2_t test_vuzpq_u32(uint32x4_t a, uint32x4_t b) {
return vuzpq_u32(a, b);
}
-// CHECK-LABEL: @test_vuzpq_f32(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.float32x4x2_t, align 16
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x2_t, align 16
-// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %b to <16 x i8>
-// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-// CHECK: store <4 x float> [[VUZP_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x float>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-// CHECK: store <4 x float> [[VUZP1_I]], ptr [[TMP4]]
-// CHECK: [[TMP7:%.*]] = load %struct.float32x4x2_t, ptr [[RETVAL_I]], align 16
-// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.float32x4x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP9:%.*]] = extractvalue %struct.float32x4x2_t [[TMP7]], 0
-// CHECK: store [2 x <4 x float>] [[TMP9]], ptr [[TMP8]], align 16
-// CHECK: [[TMP10:%.*]] = load %struct.float32x4x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.float32x4x2_t [[TMP10]]
+// CHECK-LABEL: define dso_local %struct.float32x4x2_t @test_vuzpq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <4 x float>] poison, <4 x float> [[VUZP_I]], 0
+// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <4 x float>] [[TMP0]], <4 x float> [[VUZP1_I]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_FLOAT32X4X2_T:%.*]] poison, [2 x <4 x float>] [[DOTUNPACK8]], 0
+// CHECK-NEXT: ret [[STRUCT_FLOAT32X4X2_T]] [[TMP1]]
+//
float32x4x2_t test_vuzpq_f32(float32x4_t a, float32x4_t b) {
return vuzpq_f32(a, b);
}
-// CHECK-LABEL: @test_vuzpq_p8(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly8x16x2_t, align 16
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x2_t, align 16
-// CHECK: [[VUZP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
-// CHECK: store <16 x i8> [[VUZP_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
-// CHECK: store <16 x i8> [[VUZP1_I]], ptr [[TMP2]]
-// CHECK: [[TMP5:%.*]] = load %struct.poly8x16x2_t, ptr [[RETVAL_I]], align 16
-// CHECK: [[TMP6:%.*]] = getelementptr inbounds nuw %struct.poly8x16x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP7:%.*]] = extractvalue %struct.poly8x16x2_t [[TMP5]], 0
-// CHECK: store [2 x <16 x i8>] [[TMP7]], ptr [[TMP6]], align 16
-// CHECK: [[TMP8:%.*]] = load %struct.poly8x16x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.poly8x16x2_t [[TMP8]]
+// CHECK-LABEL: define dso_local %struct.poly8x16x2_t @test_vuzpq_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <16 x i8>] poison, <16 x i8> [[VUZP_I]], 0
+// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <16 x i8>] [[TMP0]], <16 x i8> [[VUZP1_I]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_POLY8X16X2_T:%.*]] poison, [2 x <16 x i8>] [[DOTUNPACK8]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY8X16X2_T]] [[TMP1]]
+//
poly8x16x2_t test_vuzpq_p8(poly8x16_t a, poly8x16_t b) {
return vuzpq_p8(a, b);
}
-// CHECK-LABEL: @test_vuzpq_p16(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly16x8x2_t, align 16
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x2_t, align 16
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-// CHECK: store <8 x i16> [[VUZP_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-// CHECK: store <8 x i16> [[VUZP1_I]], ptr [[TMP4]]
-// CHECK: [[TMP7:%.*]] = load %struct.poly16x8x2_t, ptr [[RETVAL_I]], align 16
-// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.poly16x8x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP9:%.*]] = extractvalue %struct.poly16x8x2_t [[TMP7]], 0
-// CHECK: store [2 x <8 x i16>] [[TMP9]], ptr [[TMP8]], align 16
-// CHECK: [[TMP10:%.*]] = load %struct.poly16x8x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.poly16x8x2_t [[TMP10]]
+// CHECK-LABEL: define dso_local %struct.poly16x8x2_t @test_vuzpq_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <8 x i16>] poison, <8 x i16> [[VUZP_I]], 0
+// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <8 x i16>] [[TMP0]], <8 x i16> [[VUZP1_I]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_POLY16X8X2_T:%.*]] poison, [2 x <8 x i16>] [[DOTUNPACK8]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY16X8X2_T]] [[TMP1]]
+//
poly16x8x2_t test_vuzpq_p16(poly16x8_t a, poly16x8_t b) {
return vuzpq_p16(a, b);
}
-// CHECK-LABEL: @test_vzip_s8(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
-// CHECK: store <8 x i8> [[VZIP_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-// CHECK: store <8 x i8> [[VZIP1_I]], ptr [[TMP2]]
-// CHECK: [[TMP5:%.*]] = load %struct.int8x8x2_t, ptr [[RETVAL_I]], align 8
-// CHECK: [[TMP6:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP7:%.*]] = extractvalue %struct.int8x8x2_t [[TMP5]], 0
-// CHECK: store [2 x <8 x i8>] [[TMP7]], ptr [[TMP6]], align 8
-// CHECK: [[TMP8:%.*]] = load %struct.int8x8x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.int8x8x2_t [[TMP8]]
+// CHECK-LABEL: define dso_local %struct.int8x8x2_t @test_vzip_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <8 x i8>] poison, <8 x i8> [[VZIP_I]], 0
+// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <8 x i8>] [[TMP0]], <8 x i8> [[VZIP1_I]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_INT8X8X2_T:%.*]] poison, [2 x <8 x i8>] [[DOTUNPACK8]], 0
+// CHECK-NEXT: ret [[STRUCT_INT8X8X2_T]] [[TMP1]]
+//
int8x8x2_t test_vzip_s8(int8x8_t a, int8x8_t b) {
return vzip_s8(a, b);
}
-// CHECK-LABEL: @test_vzip_s16(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int16x4x2_t, align 8
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x2_t, align 8
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
-// CHECK: store <4 x i16> [[VZIP_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
-// CHECK: store <4 x i16> [[VZIP1_I]], ptr [[TMP4]]
-// CHECK: [[TMP7:%.*]] = load %struct.int16x4x2_t, ptr [[RETVAL_I]], align 8
-// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.int16x4x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP9:%.*]] = extractvalue %struct.int16x4x2_t [[TMP7]], 0
-// CHECK: store [2 x <4 x i16>] [[TMP9]], ptr [[TMP8]], align 8
-// CHECK: [[TMP10:%.*]] = load %struct.int16x4x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.int16x4x2_t [[TMP10]]
+// CHECK-LABEL: define dso_local %struct.int16x4x2_t @test_vzip_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <4 x i16>] poison, <4 x i16> [[VZIP_I]], 0
+// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <4 x i16>] [[TMP0]], <4 x i16> [[VZIP1_I]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_INT16X4X2_T:%.*]] poison, [2 x <4 x i16>] [[DOTUNPACK8]], 0
+// CHECK-NEXT: ret [[STRUCT_INT16X4X2_T]] [[TMP1]]
+//
int16x4x2_t test_vzip_s16(int16x4_t a, int16x4_t b) {
return vzip_s16(a, b);
}
-// CHECK-LABEL: @test_vzip_s32(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int32x2x2_t, align 8
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x2_t, align 8
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VZIP_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
-// CHECK: store <2 x i32> [[VZIP_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VZIP1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
-// CHECK: store <2 x i32> [[VZIP1_I]], ptr [[TMP4]]
-// CHECK: [[TMP7:%.*]] = load %struct.int32x2x2_t, ptr [[RETVAL_I]], align 8
-// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.int32x2x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP9:%.*]] = extractvalue %struct.int32x2x2_t [[TMP7]], 0
-// CHECK: store [2 x <2 x i32>] [[TMP9]], ptr [[TMP8]], align 8
-// CHECK: [[TMP10:%.*]] = load %struct.int32x2x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.int32x2x2_t [[TMP10]]
+// CHECK-LABEL: define dso_local %struct.int32x2x2_t @test_vzip_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> <i32 0, i32 2>
+// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> <i32 1, i32 3>
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <2 x i32>] poison, <2 x i32> [[VZIP_I]], 0
+// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <2 x i32>] [[TMP0]], <2 x i32> [[VZIP1_I]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_INT32X2X2_T:%.*]] poison, [2 x <2 x i32>] [[DOTUNPACK8]], 0
+// CHECK-NEXT: ret [[STRUCT_INT32X2X2_T]] [[TMP1]]
+//
int32x2x2_t test_vzip_s32(int32x2_t a, int32x2_t b) {
return vzip_s32(a, b);
}
-// CHECK-LABEL: @test_vzip_u8(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
-// CHECK: store <8 x i8> [[VZIP_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-// CHECK: store <8 x i8> [[VZIP1_I]], ptr [[TMP2]]
-// CHECK: [[TMP5:%.*]] = load %struct.uint8x8x2_t, ptr [[RETVAL_I]], align 8
-// CHECK: [[TMP6:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP7:%.*]] = extractvalue %struct.uint8x8x2_t [[TMP5]], 0
-// CHECK: store [2 x <8 x i8>] [[TMP7]], ptr [[TMP6]], align 8
-// CHECK: [[TMP8:%.*]] = load %struct.uint8x8x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.uint8x8x2_t [[TMP8]]
+// CHECK-LABEL: define dso_local %struct.uint8x8x2_t @test_vzip_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <8 x i8>] poison, <8 x i8> [[VZIP_I]], 0
+// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <8 x i8>] [[TMP0]], <8 x i8> [[VZIP1_I]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_UINT8X8X2_T:%.*]] poison, [2 x <8 x i8>] [[DOTUNPACK8]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT8X8X2_T]] [[TMP1]]
+//
uint8x8x2_t test_vzip_u8(uint8x8_t a, uint8x8_t b) {
return vzip_u8(a, b);
}
-// CHECK-LABEL: @test_vzip_u16(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint16x4x2_t, align 8
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x2_t, align 8
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
-// CHECK: store <4 x i16> [[VZIP_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
-// CHECK: store <4 x i16> [[VZIP1_I]], ptr [[TMP4]]
-// CHECK: [[TMP7:%.*]] = load %struct.uint16x4x2_t, ptr [[RETVAL_I]], align 8
-// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.uint16x4x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP9:%.*]] = extractvalue %struct.uint16x4x2_t [[TMP7]], 0
-// CHECK: store [2 x <4 x i16>] [[TMP9]], ptr [[TMP8]], align 8
-// CHECK: [[TMP10:%.*]] = load %struct.uint16x4x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.uint16x4x2_t [[TMP10]]
+// CHECK-LABEL: define dso_local %struct.uint16x4x2_t @test_vzip_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <4 x i16>] poison, <4 x i16> [[VZIP_I]], 0
+// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <4 x i16>] [[TMP0]], <4 x i16> [[VZIP1_I]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_UINT16X4X2_T:%.*]] poison, [2 x <4 x i16>] [[DOTUNPACK8]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT16X4X2_T]] [[TMP1]]
+//
uint16x4x2_t test_vzip_u16(uint16x4_t a, uint16x4_t b) {
return vzip_u16(a, b);
}
-// CHECK-LABEL: @test_vzip_u32(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint32x2x2_t, align 8
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x2_t, align 8
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VZIP_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
-// CHECK: store <2 x i32> [[VZIP_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VZIP1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
-// CHECK: store <2 x i32> [[VZIP1_I]], ptr [[TMP4]]
-// CHECK: [[TMP7:%.*]] = load %struct.uint32x2x2_t, ptr [[RETVAL_I]], align 8
-// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.uint32x2x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP9:%.*]] = extractvalue %struct.uint32x2x2_t [[TMP7]], 0
-// CHECK: store [2 x <2 x i32>] [[TMP9]], ptr [[TMP8]], align 8
-// CHECK: [[TMP10:%.*]] = load %struct.uint32x2x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.uint32x2x2_t [[TMP10]]
+// CHECK-LABEL: define dso_local %struct.uint32x2x2_t @test_vzip_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> <i32 0, i32 2>
+// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> <i32 1, i32 3>
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <2 x i32>] poison, <2 x i32> [[VZIP_I]], 0
+// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <2 x i32>] [[TMP0]], <2 x i32> [[VZIP1_I]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_UINT32X2X2_T:%.*]] poison, [2 x <2 x i32>] [[DOTUNPACK8]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT32X2X2_T]] [[TMP1]]
+//
uint32x2x2_t test_vzip_u32(uint32x2_t a, uint32x2_t b) {
return vzip_u32(a, b);
}
-// CHECK-LABEL: @test_vzip_f32(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.float32x2x2_t, align 8
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x2_t, align 8
-// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %b to <8 x i8>
-// CHECK: [[VZIP_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 2>
-// CHECK: store <2 x float> [[VZIP_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x float>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VZIP1_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3>
-// CHECK: store <2 x float> [[VZIP1_I]], ptr [[TMP4]]
-// CHECK: [[TMP7:%.*]] = load %struct.float32x2x2_t, ptr [[RETVAL_I]], align 8
-// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.float32x2x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP9:%.*]] = extractvalue %struct.float32x2x2_t [[TMP7]], 0
-// CHECK: store [2 x <2 x float>] [[TMP9]], ptr [[TMP8]], align 8
-// CHECK: [[TMP10:%.*]] = load %struct.float32x2x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.float32x2x2_t [[TMP10]]
+// CHECK-LABEL: define dso_local %struct.float32x2x2_t @test_vzip_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[B]], <2 x i32> <i32 0, i32 2>
+// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[B]], <2 x i32> <i32 1, i32 3>
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <2 x float>] poison, <2 x float> [[VZIP_I]], 0
+// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <2 x float>] [[TMP0]], <2 x float> [[VZIP1_I]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_FLOAT32X2X2_T:%.*]] poison, [2 x <2 x float>] [[DOTUNPACK8]], 0
+// CHECK-NEXT: ret [[STRUCT_FLOAT32X2X2_T]] [[TMP1]]
+//
float32x2x2_t test_vzip_f32(float32x2_t a, float32x2_t b) {
return vzip_f32(a, b);
}
-// CHECK-LABEL: @test_vzip_p8(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
-// CHECK: store <8 x i8> [[VZIP_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-// CHECK: store <8 x i8> [[VZIP1_I]], ptr [[TMP2]]
-// CHECK: [[TMP5:%.*]] = load %struct.poly8x8x2_t, ptr [[RETVAL_I]], align 8
-// CHECK: [[TMP6:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP7:%.*]] = extractvalue %struct.poly8x8x2_t [[TMP5]], 0
-// CHECK: store [2 x <8 x i8>] [[TMP7]], ptr [[TMP6]], align 8
-// CHECK: [[TMP8:%.*]] = load %struct.poly8x8x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.poly8x8x2_t [[TMP8]]
+// CHECK-LABEL: define dso_local %struct.poly8x8x2_t @test_vzip_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <8 x i8>] poison, <8 x i8> [[VZIP_I]], 0
+// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <8 x i8>] [[TMP0]], <8 x i8> [[VZIP1_I]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_POLY8X8X2_T:%.*]] poison, [2 x <8 x i8>] [[DOTUNPACK8]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY8X8X2_T]] [[TMP1]]
+//
poly8x8x2_t test_vzip_p8(poly8x8_t a, poly8x8_t b) {
return vzip_p8(a, b);
}
-// CHECK-LABEL: @test_vzip_p16(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly16x4x2_t, align 8
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x2_t, align 8
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
-// CHECK: store <4 x i16> [[VZIP_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
-// CHECK: store <4 x i16> [[VZIP1_I]], ptr [[TMP4]]
-// CHECK: [[TMP7:%.*]] = load %struct.poly16x4x2_t, ptr [[RETVAL_I]], align 8
-// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.poly16x4x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP9:%.*]] = extractvalue %struct.poly16x4x2_t [[TMP7]], 0
-// CHECK: store [2 x <4 x i16>] [[TMP9]], ptr [[TMP8]], align 8
-// CHECK: [[TMP10:%.*]] = load %struct.poly16x4x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.poly16x4x2_t [[TMP10]]
+// CHECK-LABEL: define dso_local %struct.poly16x4x2_t @test_vzip_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <4 x i16>] poison, <4 x i16> [[VZIP_I]], 0
+// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <4 x i16>] [[TMP0]], <4 x i16> [[VZIP1_I]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_POLY16X4X2_T:%.*]] poison, [2 x <4 x i16>] [[DOTUNPACK8]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY16X4X2_T]] [[TMP1]]
+//
poly16x4x2_t test_vzip_p16(poly16x4_t a, poly16x4_t b) {
return vzip_p16(a, b);
}
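The vzip checks follow the same pattern: interleave the low halves of a and b into val[0] and the high halves into val[1], with the struct again assembled via insertvalue instead of stack traffic. A sketch of the lane behaviour (illustrative values and helper name, not from the test file):

// Sketch, assuming an Arm/AArch64 toolchain with NEON enabled.
// vzip_s16 interleaves corresponding lanes of its two operands.
#include <arm_neon.h>

int16x4x2_t demo_vzip(void) {
  int16x4_t a = {0, 1, 2, 3};
  int16x4_t b = {4, 5, 6, 7};
  int16x4x2_t r = vzip_s16(a, b);
  // r.val[0] == {0, 4, 1, 5}  (mask <0,4,1,5> in the checks above)
  // r.val[1] == {2, 6, 3, 7}  (mask <2,6,3,7> in the checks above)
  return r;
}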
-// CHECK-LABEL: @test_vzipq_s8(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int8x16x2_t, align 16
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x2_t, align 16
-// CHECK: [[VZIP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
-// CHECK: store <16 x i8> [[VZIP_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VZIP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
-// CHECK: store <16 x i8> [[VZIP1_I]], ptr [[TMP2]]
-// CHECK: [[TMP5:%.*]] = load %struct.int8x16x2_t, ptr [[RETVAL_I]], align 16
-// CHECK: [[TMP6:%.*]] = getelementptr inbounds nuw %struct.int8x16x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP7:%.*]] = extractvalue %struct.int8x16x2_t [[TMP5]], 0
-// CHECK: store [2 x <16 x i8>] [[TMP7]], ptr [[TMP6]], align 16
-// CHECK: [[TMP8:%.*]] = load %struct.int8x16x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.int8x16x2_t [[TMP8]]
+// CHECK-LABEL: define dso_local %struct.int8x16x2_t @test_vzipq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <16 x i8>] poison, <16 x i8> [[VZIP_I]], 0
+// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <16 x i8>] [[TMP0]], <16 x i8> [[VZIP1_I]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_INT8X16X2_T:%.*]] poison, [2 x <16 x i8>] [[DOTUNPACK8]], 0
+// CHECK-NEXT: ret [[STRUCT_INT8X16X2_T]] [[TMP1]]
+//
int8x16x2_t test_vzipq_s8(int8x16_t a, int8x16_t b) {
return vzipq_s8(a, b);
}
-// CHECK-LABEL: @test_vzipq_s16(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int16x8x2_t, align 16
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x2_t, align 16
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
-// CHECK: store <8 x i16> [[VZIP_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-// CHECK: store <8 x i16> [[VZIP1_I]], ptr [[TMP4]]
-// CHECK: [[TMP7:%.*]] = load %struct.int16x8x2_t, ptr [[RETVAL_I]], align 16
-// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.int16x8x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP9:%.*]] = extractvalue %struct.int16x8x2_t [[TMP7]], 0
-// CHECK: store [2 x <8 x i16>] [[TMP9]], ptr [[TMP8]], align 16
-// CHECK: [[TMP10:%.*]] = load %struct.int16x8x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.int16x8x2_t [[TMP10]]
+// CHECK-LABEL: define dso_local %struct.int16x8x2_t @test_vzipq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <8 x i16>] poison, <8 x i16> [[VZIP_I]], 0
+// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <8 x i16>] [[TMP0]], <8 x i16> [[VZIP1_I]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_INT16X8X2_T:%.*]] poison, [2 x <8 x i16>] [[DOTUNPACK8]], 0
+// CHECK-NEXT: ret [[STRUCT_INT16X8X2_T]] [[TMP1]]
+//
int16x8x2_t test_vzipq_s16(int16x8_t a, int16x8_t b) {
return vzipq_s16(a, b);
}
-// CHECK-LABEL: @test_vzipq_s32(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int32x4x2_t, align 16
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x2_t, align 16
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
-// CHECK: store <4 x i32> [[VZIP_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
-// CHECK: store <4 x i32> [[VZIP1_I]], ptr [[TMP4]]
-// CHECK: [[TMP7:%.*]] = load %struct.int32x4x2_t, ptr [[RETVAL_I]], align 16
-// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.int32x4x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP9:%.*]] = extractvalue %struct.int32x4x2_t [[TMP7]], 0
-// CHECK: store [2 x <4 x i32>] [[TMP9]], ptr [[TMP8]], align 16
-// CHECK: [[TMP10:%.*]] = load %struct.int32x4x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.int32x4x2_t [[TMP10]]
+// CHECK-LABEL: define dso_local %struct.int32x4x2_t @test_vzipq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <4 x i32>] poison, <4 x i32> [[VZIP_I]], 0
+// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <4 x i32>] [[TMP0]], <4 x i32> [[VZIP1_I]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_INT32X4X2_T:%.*]] poison, [2 x <4 x i32>] [[DOTUNPACK8]], 0
+// CHECK-NEXT: ret [[STRUCT_INT32X4X2_T]] [[TMP1]]
+//
int32x4x2_t test_vzipq_s32(int32x4_t a, int32x4_t b) {
return vzipq_s32(a, b);
}
-// CHECK-LABEL: @test_vzipq_u8(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x16x2_t, align 16
-// CHECK: [[VZIP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
-// CHECK: store <16 x i8> [[VZIP_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VZIP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
-// CHECK: store <16 x i8> [[VZIP1_I]], ptr [[TMP2]]
-// CHECK: [[TMP5:%.*]] = load %struct.uint8x16x2_t, ptr [[RETVAL_I]], align 16
-// CHECK: [[TMP6:%.*]] = getelementptr inbounds nuw %struct.uint8x16x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP7:%.*]] = extractvalue %struct.uint8x16x2_t [[TMP5]], 0
-// CHECK: store [2 x <16 x i8>] [[TMP7]], ptr [[TMP6]], align 16
-// CHECK: [[TMP8:%.*]] = load %struct.uint8x16x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.uint8x16x2_t [[TMP8]]
+// CHECK-LABEL: define dso_local %struct.uint8x16x2_t @test_vzipq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <16 x i8>] poison, <16 x i8> [[VZIP_I]], 0
+// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <16 x i8>] [[TMP0]], <16 x i8> [[VZIP1_I]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_UINT8X16X2_T:%.*]] poison, [2 x <16 x i8>] [[DOTUNPACK8]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT8X16X2_T]] [[TMP1]]
+//
uint8x16x2_t test_vzipq_u8(uint8x16_t a, uint8x16_t b) {
return vzipq_u8(a, b);
}
-// CHECK-LABEL: @test_vzipq_u16(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
-// CHECK: store <8 x i16> [[VZIP_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-// CHECK: store <8 x i16> [[VZIP1_I]], ptr [[TMP4]]
-// CHECK: [[TMP7:%.*]] = load %struct.uint16x8x2_t, ptr [[RETVAL_I]], align 16
-// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.uint16x8x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP9:%.*]] = extractvalue %struct.uint16x8x2_t [[TMP7]], 0
-// CHECK: store [2 x <8 x i16>] [[TMP9]], ptr [[TMP8]], align 16
-// CHECK: [[TMP10:%.*]] = load %struct.uint16x8x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.uint16x8x2_t [[TMP10]]
+// CHECK-LABEL: define dso_local %struct.uint16x8x2_t @test_vzipq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <8 x i16>] poison, <8 x i16> [[VZIP_I]], 0
+// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <8 x i16>] [[TMP0]], <8 x i16> [[VZIP1_I]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_UINT16X8X2_T:%.*]] poison, [2 x <8 x i16>] [[DOTUNPACK8]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT16X8X2_T]] [[TMP1]]
+//
uint16x8x2_t test_vzipq_u16(uint16x8_t a, uint16x8_t b) {
return vzipq_u16(a, b);
}
-// CHECK-LABEL: @test_vzipq_u32(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
-// CHECK: store <4 x i32> [[VZIP_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
-// CHECK: store <4 x i32> [[VZIP1_I]], ptr [[TMP4]]
-// CHECK: [[TMP7:%.*]] = load %struct.uint32x4x2_t, ptr [[RETVAL_I]], align 16
-// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.uint32x4x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP9:%.*]] = extractvalue %struct.uint32x4x2_t [[TMP7]], 0
-// CHECK: store [2 x <4 x i32>] [[TMP9]], ptr [[TMP8]], align 16
-// CHECK: [[TMP10:%.*]] = load %struct.uint32x4x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.uint32x4x2_t [[TMP10]]
+// CHECK-LABEL: define dso_local %struct.uint32x4x2_t @test_vzipq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <4 x i32>] poison, <4 x i32> [[VZIP_I]], 0
+// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <4 x i32>] [[TMP0]], <4 x i32> [[VZIP1_I]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_UINT32X4X2_T:%.*]] poison, [2 x <4 x i32>] [[DOTUNPACK8]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT32X4X2_T]] [[TMP1]]
+//
uint32x4x2_t test_vzipq_u32(uint32x4_t a, uint32x4_t b) {
return vzipq_u32(a, b);
}
-// CHECK-LABEL: @test_vzipq_f32(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.float32x4x2_t, align 16
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x2_t, align 16
-// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %b to <16 x i8>
-// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
-// CHECK: store <4 x float> [[VZIP_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x float>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
-// CHECK: store <4 x float> [[VZIP1_I]], ptr [[TMP4]]
-// CHECK: [[TMP7:%.*]] = load %struct.float32x4x2_t, ptr [[RETVAL_I]], align 16
-// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.float32x4x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP9:%.*]] = extractvalue %struct.float32x4x2_t [[TMP7]], 0
-// CHECK: store [2 x <4 x float>] [[TMP9]], ptr [[TMP8]], align 16
-// CHECK: [[TMP10:%.*]] = load %struct.float32x4x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.float32x4x2_t [[TMP10]]
+// CHECK-LABEL: define dso_local %struct.float32x4x2_t @test_vzipq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <4 x float>] poison, <4 x float> [[VZIP_I]], 0
+// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <4 x float>] [[TMP0]], <4 x float> [[VZIP1_I]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_FLOAT32X4X2_T:%.*]] poison, [2 x <4 x float>] [[DOTUNPACK8]], 0
+// CHECK-NEXT: ret [[STRUCT_FLOAT32X4X2_T]] [[TMP1]]
+//
float32x4x2_t test_vzipq_f32(float32x4_t a, float32x4_t b) {
return vzipq_f32(a, b);
}
-// CHECK-LABEL: @test_vzipq_p8(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly8x16x2_t, align 16
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x2_t, align 16
-// CHECK: [[VZIP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
-// CHECK: store <16 x i8> [[VZIP_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VZIP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
-// CHECK: store <16 x i8> [[VZIP1_I]], ptr [[TMP2]]
-// CHECK: [[TMP5:%.*]] = load %struct.poly8x16x2_t, ptr [[RETVAL_I]], align 16
-// CHECK: [[TMP6:%.*]] = getelementptr inbounds nuw %struct.poly8x16x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP7:%.*]] = extractvalue %struct.poly8x16x2_t [[TMP5]], 0
-// CHECK: store [2 x <16 x i8>] [[TMP7]], ptr [[TMP6]], align 16
-// CHECK: [[TMP8:%.*]] = load %struct.poly8x16x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.poly8x16x2_t [[TMP8]]
+// CHECK-LABEL: define dso_local %struct.poly8x16x2_t @test_vzipq_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <16 x i8>] poison, <16 x i8> [[VZIP_I]], 0
+// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <16 x i8>] [[TMP0]], <16 x i8> [[VZIP1_I]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_POLY8X16X2_T:%.*]] poison, [2 x <16 x i8>] [[DOTUNPACK8]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY8X16X2_T]] [[TMP1]]
+//
poly8x16x2_t test_vzipq_p8(poly8x16_t a, poly8x16_t b) {
return vzipq_p8(a, b);
}
-// CHECK-LABEL: @test_vzipq_p16(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly16x8x2_t, align 16
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x2_t, align 16
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
-// CHECK: store <8 x i16> [[VZIP_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-// CHECK: store <8 x i16> [[VZIP1_I]], ptr [[TMP4]]
-// CHECK: [[TMP7:%.*]] = load %struct.poly16x8x2_t, ptr [[RETVAL_I]], align 16
-// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.poly16x8x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP9:%.*]] = extractvalue %struct.poly16x8x2_t [[TMP7]], 0
-// CHECK: store [2 x <8 x i16>] [[TMP9]], ptr [[TMP8]], align 16
-// CHECK: [[TMP10:%.*]] = load %struct.poly16x8x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.poly16x8x2_t [[TMP10]]
+// CHECK-LABEL: define dso_local %struct.poly16x8x2_t @test_vzipq_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <8 x i16>] poison, <8 x i16> [[VZIP_I]], 0
+// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <8 x i16>] [[TMP0]], <8 x i16> [[VZIP1_I]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_POLY16X8X2_T:%.*]] poison, [2 x <8 x i16>] [[DOTUNPACK8]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY16X8X2_T]] [[TMP1]]
+//
poly16x8x2_t test_vzipq_p16(poly16x8_t a, poly16x8_t b) {
return vzipq_p16(a, b);
}
-// CHECK-LABEL: @test_vtrn_s8(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-// CHECK: store <8 x i8> [[VTRN_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-// CHECK: store <8 x i8> [[VTRN1_I]], ptr [[TMP2]]
-// CHECK: [[TMP5:%.*]] = load %struct.int8x8x2_t, ptr [[RETVAL_I]], align 8
-// CHECK: [[TMP6:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP7:%.*]] = extractvalue %struct.int8x8x2_t [[TMP5]], 0
-// CHECK: store [2 x <8 x i8>] [[TMP7]], ptr [[TMP6]], align 8
-// CHECK: [[TMP8:%.*]] = load %struct.int8x8x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.int8x8x2_t [[TMP8]]
+// CHECK-LABEL: define dso_local %struct.int8x8x2_t @test_vtrn_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <8 x i8>] poison, <8 x i8> [[VTRN_I]], 0
+// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <8 x i8>] [[TMP0]], <8 x i8> [[VTRN1_I]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_INT8X8X2_T:%.*]] poison, [2 x <8 x i8>] [[DOTUNPACK8]], 0
+// CHECK-NEXT: ret [[STRUCT_INT8X8X2_T]] [[TMP1]]
+//
int8x8x2_t test_vtrn_s8(int8x8_t a, int8x8_t b) {
return vtrn_s8(a, b);
}
-// CHECK-LABEL: @test_vtrn_s16(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int16x4x2_t, align 8
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x2_t, align 8
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-// CHECK: store <4 x i16> [[VTRN_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-// CHECK: store <4 x i16> [[VTRN1_I]], ptr [[TMP4]]
-// CHECK: [[TMP7:%.*]] = load %struct.int16x4x2_t, ptr [[RETVAL_I]], align 8
-// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.int16x4x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP9:%.*]] = extractvalue %struct.int16x4x2_t [[TMP7]], 0
-// CHECK: store [2 x <4 x i16>] [[TMP9]], ptr [[TMP8]], align 8
-// CHECK: [[TMP10:%.*]] = load %struct.int16x4x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.int16x4x2_t [[TMP10]]
+// CHECK-LABEL: define dso_local %struct.int16x4x2_t @test_vtrn_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <4 x i16>] poison, <4 x i16> [[VTRN_I]], 0
+// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <4 x i16>] [[TMP0]], <4 x i16> [[VTRN1_I]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_INT16X4X2_T:%.*]] poison, [2 x <4 x i16>] [[DOTUNPACK8]], 0
+// CHECK-NEXT: ret [[STRUCT_INT16X4X2_T]] [[TMP1]]
+//
int16x4x2_t test_vtrn_s16(int16x4_t a, int16x4_t b) {
return vtrn_s16(a, b);
}
-// CHECK-LABEL: @test_vtrn_s32(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int32x2x2_t, align 8
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x2_t, align 8
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VTRN_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
-// CHECK: store <2 x i32> [[VTRN_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VTRN1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
-// CHECK: store <2 x i32> [[VTRN1_I]], ptr [[TMP4]]
-// CHECK: [[TMP7:%.*]] = load %struct.int32x2x2_t, ptr [[RETVAL_I]], align 8
-// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.int32x2x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP9:%.*]] = extractvalue %struct.int32x2x2_t [[TMP7]], 0
-// CHECK: store [2 x <2 x i32>] [[TMP9]], ptr [[TMP8]], align 8
-// CHECK: [[TMP10:%.*]] = load %struct.int32x2x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.int32x2x2_t [[TMP10]]
+// CHECK-LABEL: define dso_local %struct.int32x2x2_t @test_vtrn_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> <i32 0, i32 2>
+// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> <i32 1, i32 3>
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <2 x i32>] poison, <2 x i32> [[VTRN_I]], 0
+// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <2 x i32>] [[TMP0]], <2 x i32> [[VTRN1_I]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_INT32X2X2_T:%.*]] poison, [2 x <2 x i32>] [[DOTUNPACK8]], 0
+// CHECK-NEXT: ret [[STRUCT_INT32X2X2_T]] [[TMP1]]
+//
int32x2x2_t test_vtrn_s32(int32x2_t a, int32x2_t b) {
return vtrn_s32(a, b);
}
-// CHECK-LABEL: @test_vtrn_u8(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-// CHECK: store <8 x i8> [[VTRN_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-// CHECK: store <8 x i8> [[VTRN1_I]], ptr [[TMP2]]
-// CHECK: [[TMP5:%.*]] = load %struct.uint8x8x2_t, ptr [[RETVAL_I]], align 8
-// CHECK: [[TMP6:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP7:%.*]] = extractvalue %struct.uint8x8x2_t [[TMP5]], 0
-// CHECK: store [2 x <8 x i8>] [[TMP7]], ptr [[TMP6]], align 8
-// CHECK: [[TMP8:%.*]] = load %struct.uint8x8x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.uint8x8x2_t [[TMP8]]
+// CHECK-LABEL: define dso_local %struct.uint8x8x2_t @test_vtrn_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <8 x i8>] poison, <8 x i8> [[VTRN_I]], 0
+// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <8 x i8>] [[TMP0]], <8 x i8> [[VTRN1_I]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_UINT8X8X2_T:%.*]] poison, [2 x <8 x i8>] [[DOTUNPACK8]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT8X8X2_T]] [[TMP1]]
+//
uint8x8x2_t test_vtrn_u8(uint8x8_t a, uint8x8_t b) {
return vtrn_u8(a, b);
}
-// CHECK-LABEL: @test_vtrn_u16(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint16x4x2_t, align 8
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x2_t, align 8
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-// CHECK: store <4 x i16> [[VTRN_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-// CHECK: store <4 x i16> [[VTRN1_I]], ptr [[TMP4]]
-// CHECK: [[TMP7:%.*]] = load %struct.uint16x4x2_t, ptr [[RETVAL_I]], align 8
-// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.uint16x4x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP9:%.*]] = extractvalue %struct.uint16x4x2_t [[TMP7]], 0
-// CHECK: store [2 x <4 x i16>] [[TMP9]], ptr [[TMP8]], align 8
-// CHECK: [[TMP10:%.*]] = load %struct.uint16x4x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.uint16x4x2_t [[TMP10]]
+// CHECK-LABEL: define dso_local %struct.uint16x4x2_t @test_vtrn_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <4 x i16>] poison, <4 x i16> [[VTRN_I]], 0
+// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <4 x i16>] [[TMP0]], <4 x i16> [[VTRN1_I]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_UINT16X4X2_T:%.*]] poison, [2 x <4 x i16>] [[DOTUNPACK8]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT16X4X2_T]] [[TMP1]]
+//
uint16x4x2_t test_vtrn_u16(uint16x4_t a, uint16x4_t b) {
return vtrn_u16(a, b);
}
-// CHECK-LABEL: @test_vtrn_u32(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint32x2x2_t, align 8
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x2_t, align 8
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VTRN_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
-// CHECK: store <2 x i32> [[VTRN_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VTRN1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
-// CHECK: store <2 x i32> [[VTRN1_I]], ptr [[TMP4]]
-// CHECK: [[TMP7:%.*]] = load %struct.uint32x2x2_t, ptr [[RETVAL_I]], align 8
-// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.uint32x2x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP9:%.*]] = extractvalue %struct.uint32x2x2_t [[TMP7]], 0
-// CHECK: store [2 x <2 x i32>] [[TMP9]], ptr [[TMP8]], align 8
-// CHECK: [[TMP10:%.*]] = load %struct.uint32x2x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.uint32x2x2_t [[TMP10]]
+// CHECK-LABEL: define dso_local %struct.uint32x2x2_t @test_vtrn_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> <i32 0, i32 2>
+// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> <i32 1, i32 3>
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <2 x i32>] poison, <2 x i32> [[VTRN_I]], 0
+// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <2 x i32>] [[TMP0]], <2 x i32> [[VTRN1_I]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_UINT32X2X2_T:%.*]] poison, [2 x <2 x i32>] [[DOTUNPACK8]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT32X2X2_T]] [[TMP1]]
+//
uint32x2x2_t test_vtrn_u32(uint32x2_t a, uint32x2_t b) {
return vtrn_u32(a, b);
}
-// CHECK-LABEL: @test_vtrn_f32(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.float32x2x2_t, align 8
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x2_t, align 8
-// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %b to <8 x i8>
-// CHECK: [[VTRN_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 2>
-// CHECK: store <2 x float> [[VTRN_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x float>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VTRN1_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3>
-// CHECK: store <2 x float> [[VTRN1_I]], ptr [[TMP4]]
-// CHECK: [[TMP7:%.*]] = load %struct.float32x2x2_t, ptr [[RETVAL_I]], align 8
-// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.float32x2x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP9:%.*]] = extractvalue %struct.float32x2x2_t [[TMP7]], 0
-// CHECK: store [2 x <2 x float>] [[TMP9]], ptr [[TMP8]], align 8
-// CHECK: [[TMP10:%.*]] = load %struct.float32x2x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.float32x2x2_t [[TMP10]]
+// CHECK-LABEL: define dso_local %struct.float32x2x2_t @test_vtrn_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[B]], <2 x i32> <i32 0, i32 2>
+// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[B]], <2 x i32> <i32 1, i32 3>
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <2 x float>] poison, <2 x float> [[VTRN_I]], 0
+// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <2 x float>] [[TMP0]], <2 x float> [[VTRN1_I]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_FLOAT32X2X2_T:%.*]] poison, [2 x <2 x float>] [[DOTUNPACK8]], 0
+// CHECK-NEXT: ret [[STRUCT_FLOAT32X2X2_T]] [[TMP1]]
+//
float32x2x2_t test_vtrn_f32(float32x2_t a, float32x2_t b) {
return vtrn_f32(a, b);
}
-// CHECK-LABEL: @test_vtrn_p8(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-// CHECK: store <8 x i8> [[VTRN_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-// CHECK: store <8 x i8> [[VTRN1_I]], ptr [[TMP2]]
-// CHECK: [[TMP5:%.*]] = load %struct.poly8x8x2_t, ptr [[RETVAL_I]], align 8
-// CHECK: [[TMP6:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP7:%.*]] = extractvalue %struct.poly8x8x2_t [[TMP5]], 0
-// CHECK: store [2 x <8 x i8>] [[TMP7]], ptr [[TMP6]], align 8
-// CHECK: [[TMP8:%.*]] = load %struct.poly8x8x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.poly8x8x2_t [[TMP8]]
+// CHECK-LABEL: define dso_local %struct.poly8x8x2_t @test_vtrn_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <8 x i8>] poison, <8 x i8> [[VTRN_I]], 0
+// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <8 x i8>] [[TMP0]], <8 x i8> [[VTRN1_I]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_POLY8X8X2_T:%.*]] poison, [2 x <8 x i8>] [[DOTUNPACK8]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY8X8X2_T]] [[TMP1]]
+//
poly8x8x2_t test_vtrn_p8(poly8x8_t a, poly8x8_t b) {
return vtrn_p8(a, b);
}
-// CHECK-LABEL: @test_vtrn_p16(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly16x4x2_t, align 8
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x2_t, align 8
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-// CHECK: store <4 x i16> [[VTRN_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-// CHECK: store <4 x i16> [[VTRN1_I]], ptr [[TMP4]]
-// CHECK: [[TMP7:%.*]] = load %struct.poly16x4x2_t, ptr [[RETVAL_I]], align 8
-// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.poly16x4x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP9:%.*]] = extractvalue %struct.poly16x4x2_t [[TMP7]], 0
-// CHECK: store [2 x <4 x i16>] [[TMP9]], ptr [[TMP8]], align 8
-// CHECK: [[TMP10:%.*]] = load %struct.poly16x4x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.poly16x4x2_t [[TMP10]]
+// CHECK-LABEL: define dso_local %struct.poly16x4x2_t @test_vtrn_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <4 x i16>] poison, <4 x i16> [[VTRN_I]], 0
+// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <4 x i16>] [[TMP0]], <4 x i16> [[VTRN1_I]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_POLY16X4X2_T:%.*]] poison, [2 x <4 x i16>] [[DOTUNPACK8]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY16X4X2_T]] [[TMP1]]
+//
poly16x4x2_t test_vtrn_p16(poly16x4_t a, poly16x4_t b) {
return vtrn_p16(a, b);
}
-// CHECK-LABEL: @test_vtrnq_s8(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int8x16x2_t, align 16
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x2_t, align 16
-// CHECK: [[VTRN_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
-// CHECK: store <16 x i8> [[VTRN_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VTRN1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
-// CHECK: store <16 x i8> [[VTRN1_I]], ptr [[TMP2]]
-// CHECK: [[TMP5:%.*]] = load %struct.int8x16x2_t, ptr [[RETVAL_I]], align 16
-// CHECK: [[TMP6:%.*]] = getelementptr inbounds nuw %struct.int8x16x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP7:%.*]] = extractvalue %struct.int8x16x2_t [[TMP5]], 0
-// CHECK: store [2 x <16 x i8>] [[TMP7]], ptr [[TMP6]], align 16
-// CHECK: [[TMP8:%.*]] = load %struct.int8x16x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.int8x16x2_t [[TMP8]]
+// CHECK-LABEL: define dso_local %struct.int8x16x2_t @test_vtrnq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <16 x i8>] poison, <16 x i8> [[VTRN_I]], 0
+// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <16 x i8>] [[TMP0]], <16 x i8> [[VTRN1_I]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_INT8X16X2_T:%.*]] poison, [2 x <16 x i8>] [[DOTUNPACK8]], 0
+// CHECK-NEXT: ret [[STRUCT_INT8X16X2_T]] [[TMP1]]
+//
int8x16x2_t test_vtrnq_s8(int8x16_t a, int8x16_t b) {
return vtrnq_s8(a, b);
}
-// CHECK-LABEL: @test_vtrnq_s16(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int16x8x2_t, align 16
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x2_t, align 16
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-// CHECK: store <8 x i16> [[VTRN_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-// CHECK: store <8 x i16> [[VTRN1_I]], ptr [[TMP4]]
-// CHECK: [[TMP7:%.*]] = load %struct.int16x8x2_t, ptr [[RETVAL_I]], align 16
-// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.int16x8x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP9:%.*]] = extractvalue %struct.int16x8x2_t [[TMP7]], 0
-// CHECK: store [2 x <8 x i16>] [[TMP9]], ptr [[TMP8]], align 16
-// CHECK: [[TMP10:%.*]] = load %struct.int16x8x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.int16x8x2_t [[TMP10]]
+// CHECK-LABEL: define dso_local %struct.int16x8x2_t @test_vtrnq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <8 x i16>] poison, <8 x i16> [[VTRN_I]], 0
+// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <8 x i16>] [[TMP0]], <8 x i16> [[VTRN1_I]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_INT16X8X2_T:%.*]] poison, [2 x <8 x i16>] [[DOTUNPACK8]], 0
+// CHECK-NEXT: ret [[STRUCT_INT16X8X2_T]] [[TMP1]]
+//
int16x8x2_t test_vtrnq_s16(int16x8_t a, int16x8_t b) {
return vtrnq_s16(a, b);
}
-// CHECK-LABEL: @test_vtrnq_s32(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int32x4x2_t, align 16
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x2_t, align 16
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-// CHECK: store <4 x i32> [[VTRN_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-// CHECK: store <4 x i32> [[VTRN1_I]], ptr [[TMP4]]
-// CHECK: [[TMP7:%.*]] = load %struct.int32x4x2_t, ptr [[RETVAL_I]], align 16
-// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.int32x4x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP9:%.*]] = extractvalue %struct.int32x4x2_t [[TMP7]], 0
-// CHECK: store [2 x <4 x i32>] [[TMP9]], ptr [[TMP8]], align 16
-// CHECK: [[TMP10:%.*]] = load %struct.int32x4x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.int32x4x2_t [[TMP10]]
+// CHECK-LABEL: define dso_local %struct.int32x4x2_t @test_vtrnq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <4 x i32>] poison, <4 x i32> [[VTRN_I]], 0
+// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <4 x i32>] [[TMP0]], <4 x i32> [[VTRN1_I]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_INT32X4X2_T:%.*]] poison, [2 x <4 x i32>] [[DOTUNPACK8]], 0
+// CHECK-NEXT: ret [[STRUCT_INT32X4X2_T]] [[TMP1]]
+//
int32x4x2_t test_vtrnq_s32(int32x4_t a, int32x4_t b) {
return vtrnq_s32(a, b);
}
-// CHECK-LABEL: @test_vtrnq_u8(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x16x2_t, align 16
-// CHECK: [[VTRN_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
-// CHECK: store <16 x i8> [[VTRN_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VTRN1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
-// CHECK: store <16 x i8> [[VTRN1_I]], ptr [[TMP2]]
-// CHECK: [[TMP5:%.*]] = load %struct.uint8x16x2_t, ptr [[RETVAL_I]], align 16
-// CHECK: [[TMP6:%.*]] = getelementptr inbounds nuw %struct.uint8x16x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP7:%.*]] = extractvalue %struct.uint8x16x2_t [[TMP5]], 0
-// CHECK: store [2 x <16 x i8>] [[TMP7]], ptr [[TMP6]], align 16
-// CHECK: [[TMP8:%.*]] = load %struct.uint8x16x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.uint8x16x2_t [[TMP8]]
+// CHECK-LABEL: define dso_local %struct.uint8x16x2_t @test_vtrnq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <16 x i8>] poison, <16 x i8> [[VTRN_I]], 0
+// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <16 x i8>] [[TMP0]], <16 x i8> [[VTRN1_I]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_UINT8X16X2_T:%.*]] poison, [2 x <16 x i8>] [[DOTUNPACK8]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT8X16X2_T]] [[TMP1]]
+//
uint8x16x2_t test_vtrnq_u8(uint8x16_t a, uint8x16_t b) {
return vtrnq_u8(a, b);
}
-// CHECK-LABEL: @test_vtrnq_u16(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-// CHECK: store <8 x i16> [[VTRN_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-// CHECK: store <8 x i16> [[VTRN1_I]], ptr [[TMP4]]
-// CHECK: [[TMP7:%.*]] = load %struct.uint16x8x2_t, ptr [[RETVAL_I]], align 16
-// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.uint16x8x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP9:%.*]] = extractvalue %struct.uint16x8x2_t [[TMP7]], 0
-// CHECK: store [2 x <8 x i16>] [[TMP9]], ptr [[TMP8]], align 16
-// CHECK: [[TMP10:%.*]] = load %struct.uint16x8x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.uint16x8x2_t [[TMP10]]
+// CHECK-LABEL: define dso_local %struct.uint16x8x2_t @test_vtrnq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <8 x i16>] poison, <8 x i16> [[VTRN_I]], 0
+// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <8 x i16>] [[TMP0]], <8 x i16> [[VTRN1_I]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_UINT16X8X2_T:%.*]] poison, [2 x <8 x i16>] [[DOTUNPACK8]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT16X8X2_T]] [[TMP1]]
+//
uint16x8x2_t test_vtrnq_u16(uint16x8_t a, uint16x8_t b) {
return vtrnq_u16(a, b);
}
-// CHECK-LABEL: @test_vtrnq_u32(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-// CHECK: store <4 x i32> [[VTRN_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-// CHECK: store <4 x i32> [[VTRN1_I]], ptr [[TMP4]]
-// CHECK: [[TMP7:%.*]] = load %struct.uint32x4x2_t, ptr [[RETVAL_I]], align 16
-// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.uint32x4x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP9:%.*]] = extractvalue %struct.uint32x4x2_t [[TMP7]], 0
-// CHECK: store [2 x <4 x i32>] [[TMP9]], ptr [[TMP8]], align 16
-// CHECK: [[TMP10:%.*]] = load %struct.uint32x4x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.uint32x4x2_t [[TMP10]]
+// CHECK-LABEL: define dso_local %struct.uint32x4x2_t @test_vtrnq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <4 x i32>] poison, <4 x i32> [[VTRN_I]], 0
+// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <4 x i32>] [[TMP0]], <4 x i32> [[VTRN1_I]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_UINT32X4X2_T:%.*]] poison, [2 x <4 x i32>] [[DOTUNPACK8]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT32X4X2_T]] [[TMP1]]
+//
uint32x4x2_t test_vtrnq_u32(uint32x4_t a, uint32x4_t b) {
return vtrnq_u32(a, b);
}
-// CHECK-LABEL: @test_vtrnq_f32(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.float32x4x2_t, align 16
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x2_t, align 16
-// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %b to <16 x i8>
-// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-// CHECK: store <4 x float> [[VTRN_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x float>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-// CHECK: store <4 x float> [[VTRN1_I]], ptr [[TMP4]]
-// CHECK: [[TMP7:%.*]] = load %struct.float32x4x2_t, ptr [[RETVAL_I]], align 16
-// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.float32x4x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP9:%.*]] = extractvalue %struct.float32x4x2_t [[TMP7]], 0
-// CHECK: store [2 x <4 x float>] [[TMP9]], ptr [[TMP8]], align 16
-// CHECK: [[TMP10:%.*]] = load %struct.float32x4x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.float32x4x2_t [[TMP10]]
+// CHECK-LABEL: define dso_local %struct.float32x4x2_t @test_vtrnq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <4 x float>] poison, <4 x float> [[VTRN_I]], 0
+// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <4 x float>] [[TMP0]], <4 x float> [[VTRN1_I]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_FLOAT32X4X2_T:%.*]] poison, [2 x <4 x float>] [[DOTUNPACK8]], 0
+// CHECK-NEXT: ret [[STRUCT_FLOAT32X4X2_T]] [[TMP1]]
+//
float32x4x2_t test_vtrnq_f32(float32x4_t a, float32x4_t b) {
return vtrnq_f32(a, b);
}
-// CHECK-LABEL: @test_vtrnq_p8(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly8x16x2_t, align 16
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x2_t, align 16
-// CHECK: [[VTRN_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
-// CHECK: store <16 x i8> [[VTRN_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VTRN1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
-// CHECK: store <16 x i8> [[VTRN1_I]], ptr [[TMP2]]
-// CHECK: [[TMP5:%.*]] = load %struct.poly8x16x2_t, ptr [[RETVAL_I]], align 16
-// CHECK: [[TMP6:%.*]] = getelementptr inbounds nuw %struct.poly8x16x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP7:%.*]] = extractvalue %struct.poly8x16x2_t [[TMP5]], 0
-// CHECK: store [2 x <16 x i8>] [[TMP7]], ptr [[TMP6]], align 16
-// CHECK: [[TMP8:%.*]] = load %struct.poly8x16x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.poly8x16x2_t [[TMP8]]
+// CHECK-LABEL: define dso_local %struct.poly8x16x2_t @test_vtrnq_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <16 x i8>] poison, <16 x i8> [[VTRN_I]], 0
+// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <16 x i8>] [[TMP0]], <16 x i8> [[VTRN1_I]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_POLY8X16X2_T:%.*]] poison, [2 x <16 x i8>] [[DOTUNPACK8]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY8X16X2_T]] [[TMP1]]
+//
poly8x16x2_t test_vtrnq_p8(poly8x16_t a, poly8x16_t b) {
return vtrnq_p8(a, b);
}
-// CHECK-LABEL: @test_vtrnq_p16(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly16x8x2_t, align 16
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x2_t, align 16
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-// CHECK: store <8 x i16> [[VTRN_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-// CHECK: store <8 x i16> [[VTRN1_I]], ptr [[TMP4]]
-// CHECK: [[TMP7:%.*]] = load %struct.poly16x8x2_t, ptr [[RETVAL_I]], align 16
-// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.poly16x8x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP9:%.*]] = extractvalue %struct.poly16x8x2_t [[TMP7]], 0
-// CHECK: store [2 x <8 x i16>] [[TMP9]], ptr [[TMP8]], align 16
-// CHECK: [[TMP10:%.*]] = load %struct.poly16x8x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.poly16x8x2_t [[TMP10]]
+// CHECK-LABEL: define dso_local %struct.poly16x8x2_t @test_vtrnq_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <8 x i16>] poison, <8 x i16> [[VTRN_I]], 0
+// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <8 x i16>] [[TMP0]], <8 x i16> [[VTRN1_I]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_POLY16X8X2_T:%.*]] poison, [2 x <8 x i16>] [[DOTUNPACK8]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY16X8X2_T]] [[TMP1]]
+//
poly16x8x2_t test_vtrnq_p16(poly16x8_t a, poly16x8_t b) {
return vtrnq_p16(a, b);
}
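
A minimal standalone sketch (not part of the patch; assumes any target providing <arm_neon.h>) showing the element order vtrn_s16 produces, which matches the <i32 0, i32 4, i32 2, i32 6> and <i32 1, i32 5, i32 3, i32 7> shufflevector masks checked above:

#include <arm_neon.h>
#include <stdio.h>

int main(void) {
  int16_t av[4] = {0, 1, 2, 3};
  int16_t bv[4] = {4, 5, 6, 7};
  // vtrn interleaves 2x2 blocks: val[0] takes the even lanes of a and b,
  // val[1] the odd lanes, exactly the two masks checked in the test.
  int16x4x2_t r = vtrn_s16(vld1_s16(av), vld1_s16(bv));
  int16_t out[8];
  vst1_s16(out, r.val[0]);     // {0, 4, 2, 6}
  vst1_s16(out + 4, r.val[1]); // {1, 5, 3, 7}
  for (int i = 0; i < 8; ++i)
    printf("%d ", out[i]);
  printf("\n");
  return 0;
}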
diff --git a/clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem-constrained.c b/clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem-constrained.c
index 1d0db697e4fddd..38efeb2559fd4b 100644
--- a/clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem-constrained.c
+++ b/clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem-constrained.c
@@ -1,17 +1,11 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-cpu cyclone \
-// RUN: -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg \
-// RUN: | FileCheck --check-prefix=COMMON --check-prefix=COMMONIR --check-prefix=UNCONSTRAINED %s
+// RUN: -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine \
+// RUN: | FileCheck --check-prefix=UNCONSTRAINED %s
// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-cpu cyclone \
// RUN: -ffp-exception-behavior=strict \
-// RUN: -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg \
-// RUN: | FileCheck --check-prefix=COMMON --check-prefix=COMMONIR --check-prefix=CONSTRAINED %s
-// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-cpu cyclone \
-// RUN: -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | llc -o=- - \
-// RUN: | FileCheck --check-prefix=COMMON --check-prefix=CHECK-ASM %s
-// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-cpu cyclone \
-// RUN: -ffp-exception-behavior=strict \
-// RUN: -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | llc -o=- - \
-// RUN: | FileCheck --check-prefix=COMMON --check-prefix=CHECK-ASM %s
+// RUN: -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine \
+// RUN: | FileCheck --check-prefix=CONSTRAINED %s
// REQUIRES: aarch64-registered-target
@@ -19,113 +13,162 @@
#include <arm_neon.h>
-// COMMON-LABEL: test_vfmas_lane_f32
-// COMMONIR: [[EXTRACT:%.*]] = extractelement <2 x float> %c, i32 1
-// UNCONSTRAINED: [[TMP2:%.*]] = call float @llvm.fma.f32(float %b, float [[EXTRACT]], float %a)
-// CONSTRAINED: [[TMP2:%.*]] = call float @llvm.experimental.constrained.fma.f32(float %b, float [[EXTRACT]], float %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK-ASM: fmla s{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}.s[{{[0-9]+}}]
-// COMMONIR: ret float [[TMP2]]
+// UNCONSTRAINED-LABEL: define dso_local float @test_vfmas_lane_f32(
+// UNCONSTRAINED-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]], <2 x float> noundef [[C:%.*]]) #[[ATTR0:[0-9]+]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[EXTRACT:%.*]] = extractelement <2 x float> [[C]], i64 1
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = call float @llvm.fma.f32(float [[B]], float [[EXTRACT]], float [[A]])
+// UNCONSTRAINED-NEXT: ret float [[TMP0]]
+//
+// CONSTRAINED-LABEL: define dso_local float @test_vfmas_lane_f32(
+// CONSTRAINED-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]], <2 x float> noundef [[C:%.*]]) #[[ATTR0:[0-9]+]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[EXTRACT:%.*]] = extractelement <2 x float> [[C]], i64 1
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call float @llvm.experimental.constrained.fma.f32(float [[B]], float [[EXTRACT]], float [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR2:[0-9]+]]
+// CONSTRAINED-NEXT: ret float [[TMP0]]
+//
float32_t test_vfmas_lane_f32(float32_t a, float32_t b, float32x2_t c) {
return vfmas_lane_f32(a, b, c, 1);
}
-// COMMON-LABEL: test_vfmad_lane_f64
-// COMMONIR: [[EXTRACT:%.*]] = extractelement <1 x double> %c, i32 0
-// UNCONSTRAINED: [[TMP2:%.*]] = call double @llvm.fma.f64(double %b, double [[EXTRACT]], double %a)
-// CONSTRAINED: [[TMP2:%.*]] = call double @llvm.experimental.constrained.fma.f64(double %b, double [[EXTRACT]], double %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK-ASM: fmadd d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
-// COMMONIR: ret double [[TMP2]]
+// UNCONSTRAINED-LABEL: define dso_local double @test_vfmad_lane_f64(
+// UNCONSTRAINED-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]], <1 x double> noundef [[C:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[EXTRACT:%.*]] = extractelement <1 x double> [[C]], i64 0
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = call double @llvm.fma.f64(double [[B]], double [[EXTRACT]], double [[A]])
+// UNCONSTRAINED-NEXT: ret double [[TMP0]]
+//
+// CONSTRAINED-LABEL: define dso_local double @test_vfmad_lane_f64(
+// CONSTRAINED-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]], <1 x double> noundef [[C:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[EXTRACT:%.*]] = extractelement <1 x double> [[C]], i64 0
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call double @llvm.experimental.constrained.fma.f64(double [[B]], double [[EXTRACT]], double [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR2]]
+// CONSTRAINED-NEXT: ret double [[TMP0]]
+//
float64_t test_vfmad_lane_f64(float64_t a, float64_t b, float64x1_t c) {
return vfmad_lane_f64(a, b, c, 0);
}
-// COMMON-LABEL: test_vfmad_laneq_f64
-// COMMONIR: [[EXTRACT:%.*]] = extractelement <2 x double> %c, i32 1
-// UNCONSTRAINED: [[TMP2:%.*]] = call double @llvm.fma.f64(double %b, double [[EXTRACT]], double %a)
-// CONSTRAINED: [[TMP2:%.*]] = call double @llvm.experimental.constrained.fma.f64(double %b, double [[EXTRACT]], double %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK-ASM: fmla d{{[0-9]+}}, d{{[0-9]+}}, v{{[0-9]+}}.d[{{[0-9]+}}]
-// COMMONIR: ret double [[TMP2]]
+// UNCONSTRAINED-LABEL: define dso_local double @test_vfmad_laneq_f64(
+// UNCONSTRAINED-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]], <2 x double> noundef [[C:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[EXTRACT:%.*]] = extractelement <2 x double> [[C]], i64 1
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = call double @llvm.fma.f64(double [[B]], double [[EXTRACT]], double [[A]])
+// UNCONSTRAINED-NEXT: ret double [[TMP0]]
+//
+// CONSTRAINED-LABEL: define dso_local double @test_vfmad_laneq_f64(
+// CONSTRAINED-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]], <2 x double> noundef [[C:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[EXTRACT:%.*]] = extractelement <2 x double> [[C]], i64 1
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call double @llvm.experimental.constrained.fma.f64(double [[B]], double [[EXTRACT]], double [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR2]]
+// CONSTRAINED-NEXT: ret double [[TMP0]]
+//
float64_t test_vfmad_laneq_f64(float64_t a, float64_t b, float64x2_t c) {
return vfmad_laneq_f64(a, b, c, 1);
}
-// COMMON-LABEL: test_vfmss_lane_f32
-// COMMONIR: [[SUB:%.*]] = fneg float %b
-// COMMONIR: [[EXTRACT:%.*]] = extractelement <2 x float> %c, i32 1
-// UNCONSTRAINED: [[TMP2:%.*]] = call float @llvm.fma.f32(float [[SUB]], float [[EXTRACT]], float %a)
-// CONSTRAINED: [[TMP2:%.*]] = call float @llvm.experimental.constrained.fma.f32(float [[SUB]], float [[EXTRACT]], float %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK-ASM: fmls s{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}.s[{{[0-9]+}}]
-// COMMONIR: ret float [[TMP2]]
+// UNCONSTRAINED-LABEL: define dso_local float @test_vfmss_lane_f32(
+// UNCONSTRAINED-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]], <2 x float> noundef [[C:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[FNEG:%.*]] = fneg float [[B]]
+// UNCONSTRAINED-NEXT: [[EXTRACT:%.*]] = extractelement <2 x float> [[C]], i64 1
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = call float @llvm.fma.f32(float [[FNEG]], float [[EXTRACT]], float [[A]])
+// UNCONSTRAINED-NEXT: ret float [[TMP0]]
+//
+// CONSTRAINED-LABEL: define dso_local float @test_vfmss_lane_f32(
+// CONSTRAINED-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]], <2 x float> noundef [[C:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[FNEG:%.*]] = fneg float [[B]]
+// CONSTRAINED-NEXT: [[EXTRACT:%.*]] = extractelement <2 x float> [[C]], i64 1
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call float @llvm.experimental.constrained.fma.f32(float [[FNEG]], float [[EXTRACT]], float [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR2]]
+// CONSTRAINED-NEXT: ret float [[TMP0]]
+//
float32_t test_vfmss_lane_f32(float32_t a, float32_t b, float32x2_t c) {
return vfmss_lane_f32(a, b, c, 1);
}
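
A minimal sketch (not part of the patch; assumes an AArch64 target) of what the fneg plus llvm.fma sequence above computes: a single-rounding a - b*c[lane], equivalent to fmaf(-b, c[1], a):

#include <arm_neon.h>
#include <math.h>
#include <stdio.h>

int main(void) {
  float cv[2] = {10.0f, 100.0f};
  float32x2_t c = vld1_f32(cv);
  // Fused multiply-subtract by lane: 1.0 - 2.0 * c[1] = -199.0.
  float r = vfmss_lane_f32(1.0f, 2.0f, c, 1);
  printf("%f == %f\n", r, fmaf(-2.0f, 100.0f, 1.0f));
  return 0;
}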
-// COMMON-LABEL: test_vfma_lane_f64
-// COMMONIR: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// COMMONIR: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
-// COMMONIR: [[TMP2:%.*]] = bitcast <1 x double> %v to <8 x i8>
-// COMMONIR: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
-// COMMONIR: [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <1 x i32> zeroinitializer
-// COMMONIR: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
-// COMMONIR: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
-// UNCONSTRAINED: [[FMLA2:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[FMLA]], <1 x double> [[LANE]], <1 x double> [[FMLA1]])
-// CONSTRAINED: [[FMLA2:%.*]] = call <1 x double> @llvm.experimental.constrained.fma.v1f64(<1 x double> [[FMLA]], <1 x double> [[LANE]], <1 x double> [[FMLA1]], metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK-ASM: fmadd d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
-// COMMONIR: ret <1 x double> [[FMLA2]]
+// UNCONSTRAINED-LABEL: define dso_local <1 x double> @test_vfma_lane_f64(
+// UNCONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <1 x double> noundef [[V:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[FMLA2:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[B]], <1 x double> [[V]], <1 x double> [[A]])
+// UNCONSTRAINED-NEXT: ret <1 x double> [[FMLA2]]
+//
+// CONSTRAINED-LABEL: define dso_local <1 x double> @test_vfma_lane_f64(
+// CONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <1 x double> noundef [[V:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[FMLA2:%.*]] = call <1 x double> @llvm.experimental.constrained.fma.v1f64(<1 x double> [[B]], <1 x double> [[V]], <1 x double> [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR2]]
+// CONSTRAINED-NEXT: ret <1 x double> [[FMLA2]]
+//
float64x1_t test_vfma_lane_f64(float64x1_t a, float64x1_t b, float64x1_t v) {
return vfma_lane_f64(a, b, v, 0);
}
-// COMMON-LABEL: test_vfms_lane_f64
-// COMMONIR: [[SUB:%.*]] = fneg <1 x double> %b
-// COMMONIR: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// COMMONIR: [[TMP1:%.*]] = bitcast <1 x double> [[SUB]] to <8 x i8>
-// COMMONIR: [[TMP2:%.*]] = bitcast <1 x double> %v to <8 x i8>
-// COMMONIR: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
-// COMMONIR: [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <1 x i32> zeroinitializer
-// COMMONIR: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
-// COMMONIR: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
-// UNCONSTRAINED: [[FMLA2:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[FMLA]], <1 x double> [[LANE]], <1 x double> [[FMLA1]])
-// CONSTRAINED: [[FMLA2:%.*]] = call <1 x double> @llvm.experimental.constrained.fma.v1f64(<1 x double> [[FMLA]], <1 x double> [[LANE]], <1 x double> [[FMLA1]], metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK-ASM: fmsub d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
-// COMMONIR: ret <1 x double> [[FMLA2]]
+
+// UNCONSTRAINED-LABEL: define dso_local <1 x double> @test_vfms_lane_f64(
+// UNCONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <1 x double> noundef [[V:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[FNEG:%.*]] = fneg <1 x double> [[B]]
+// UNCONSTRAINED-NEXT: [[FMLA2:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[FNEG]], <1 x double> [[V]], <1 x double> [[A]])
+// UNCONSTRAINED-NEXT: ret <1 x double> [[FMLA2]]
+//
+// CONSTRAINED-LABEL: define dso_local <1 x double> @test_vfms_lane_f64(
+// CONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <1 x double> noundef [[V:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[FNEG:%.*]] = fneg <1 x double> [[B]]
+// CONSTRAINED-NEXT: [[FMLA2:%.*]] = call <1 x double> @llvm.experimental.constrained.fma.v1f64(<1 x double> [[FNEG]], <1 x double> [[V]], <1 x double> [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR2]]
+// CONSTRAINED-NEXT: ret <1 x double> [[FMLA2]]
+//
float64x1_t test_vfms_lane_f64(float64x1_t a, float64x1_t b, float64x1_t v) {
return vfms_lane_f64(a, b, v, 0);
}
-// COMMON-LABEL: test_vfma_laneq_f64
-// COMMONIR: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// COMMONIR: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
-// COMMONIR: [[TMP2:%.*]] = bitcast <2 x double> %v to <16 x i8>
-// COMMONIR: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to double
-// COMMONIR: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to double
-// COMMONIR: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
-// COMMONIR: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
-// UNCONSTRAINED: [[TMP6:%.*]] = call double @llvm.fma.f64(double [[TMP4]], double [[EXTRACT]], double [[TMP3]])
-// CONSTRAINED: [[TMP6:%.*]] = call double @llvm.experimental.constrained.fma.f64(double [[TMP4]], double [[EXTRACT]], double [[TMP3]], metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK-ASM: fmadd d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
-// COMMONIR: [[TMP7:%.*]] = bitcast double [[TMP6]] to <1 x double>
-// COMMONIR: ret <1 x double> [[TMP7]]
+
+// UNCONSTRAINED-LABEL: define dso_local <1 x double> @test_vfma_laneq_f64(
+// UNCONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <2 x double> noundef [[V:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = extractelement <1 x double> [[A]], i64 0
+// UNCONSTRAINED-NEXT: [[TMP1:%.*]] = extractelement <1 x double> [[B]], i64 0
+// UNCONSTRAINED-NEXT: [[EXTRACT:%.*]] = extractelement <2 x double> [[V]], i64 0
+// UNCONSTRAINED-NEXT: [[TMP2:%.*]] = call double @llvm.fma.f64(double [[TMP1]], double [[EXTRACT]], double [[TMP0]])
+// UNCONSTRAINED-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to <1 x double>
+// UNCONSTRAINED-NEXT: ret <1 x double> [[TMP3]]
+//
+// CONSTRAINED-LABEL: define dso_local <1 x double> @test_vfma_laneq_f64(
+// CONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <2 x double> noundef [[V:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = extractelement <1 x double> [[A]], i64 0
+// CONSTRAINED-NEXT: [[TMP1:%.*]] = extractelement <1 x double> [[B]], i64 0
+// CONSTRAINED-NEXT: [[EXTRACT:%.*]] = extractelement <2 x double> [[V]], i64 0
+// CONSTRAINED-NEXT: [[TMP2:%.*]] = call double @llvm.experimental.constrained.fma.f64(double [[TMP1]], double [[EXTRACT]], double [[TMP0]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR2]]
+// CONSTRAINED-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to <1 x double>
+// CONSTRAINED-NEXT: ret <1 x double> [[TMP3]]
+//
float64x1_t test_vfma_laneq_f64(float64x1_t a, float64x1_t b, float64x2_t v) {
return vfma_laneq_f64(a, b, v, 0);
}
-// COMMON-LABEL: test_vfms_laneq_f64
-// COMMONIR: [[SUB:%.*]] = fneg <1 x double> %b
-// CHECK-ASM: fneg d{{[0-9]+}}, d{{[0-9]+}}
-// COMMONIR: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// COMMONIR: [[TMP1:%.*]] = bitcast <1 x double> [[SUB]] to <8 x i8>
-// COMMONIR: [[TMP2:%.*]] = bitcast <2 x double> %v to <16 x i8>
-// COMMONIR: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to double
-// COMMONIR: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to double
-// COMMONIR: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
-// COMMONIR: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
-// UNCONSTRAINED: [[TMP6:%.*]] = call double @llvm.fma.f64(double [[TMP4]], double [[EXTRACT]], double [[TMP3]])
-// CONSTRAINED: [[TMP6:%.*]] = call double @llvm.experimental.constrained.fma.f64(double [[TMP4]], double [[EXTRACT]], double [[TMP3]], metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK-ASM: fmadd d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
-// COMMONIR: [[TMP7:%.*]] = bitcast double [[TMP6]] to <1 x double>
-// COMMONIR: ret <1 x double> [[TMP7]]
+// UNCONSTRAINED-LABEL: define dso_local <1 x double> @test_vfms_laneq_f64(
+// UNCONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <2 x double> noundef [[V:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = extractelement <1 x double> [[A]], i64 0
+// UNCONSTRAINED-NEXT: [[TMP1:%.*]] = extractelement <1 x double> [[B]], i64 0
+// UNCONSTRAINED-NEXT: [[TMP2:%.*]] = fneg double [[TMP1]]
+// UNCONSTRAINED-NEXT: [[EXTRACT:%.*]] = extractelement <2 x double> [[V]], i64 0
+// UNCONSTRAINED-NEXT: [[TMP3:%.*]] = call double @llvm.fma.f64(double [[TMP2]], double [[EXTRACT]], double [[TMP0]])
+// UNCONSTRAINED-NEXT: [[TMP4:%.*]] = bitcast double [[TMP3]] to <1 x double>
+// UNCONSTRAINED-NEXT: ret <1 x double> [[TMP4]]
+//
+// CONSTRAINED-LABEL: define dso_local <1 x double> @test_vfms_laneq_f64(
+// CONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <2 x double> noundef [[V:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = extractelement <1 x double> [[A]], i64 0
+// CONSTRAINED-NEXT: [[TMP1:%.*]] = extractelement <1 x double> [[B]], i64 0
+// CONSTRAINED-NEXT: [[TMP2:%.*]] = fneg double [[TMP1]]
+// CONSTRAINED-NEXT: [[EXTRACT:%.*]] = extractelement <2 x double> [[V]], i64 0
+// CONSTRAINED-NEXT: [[TMP3:%.*]] = call double @llvm.experimental.constrained.fma.f64(double [[TMP2]], double [[EXTRACT]], double [[TMP0]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR2]]
+// CONSTRAINED-NEXT: [[TMP4:%.*]] = bitcast double [[TMP3]] to <1 x double>
+// CONSTRAINED-NEXT: ret <1 x double> [[TMP4]]
+//
float64x1_t test_vfms_laneq_f64(float64x1_t a, float64x1_t b, float64x2_t v) {
return vfms_laneq_f64(a, b, v, 0);
}
-
diff --git a/clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem.c b/clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem.c
index 8b7b976ab5e5a0..1d6bc1eda9ad9a 100644
--- a/clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem.c
+++ b/clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem.c
@@ -1,419 +1,507 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-cpu cyclone \
-// RUN: -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine | FileCheck %s
// REQUIRES: aarch64-registered-target || arm-registered-target
#include <arm_neon.h>
-// CHECK-LABEL: define{{.*}} float @test_vmuls_lane_f32(float noundef %a, <2 x float> noundef %b) #0 {
-// CHECK: [[VGET_LANE:%.*]] = extractelement <2 x float> %b, i32 1
-// CHECK: [[MUL:%.*]] = fmul float %a, [[VGET_LANE]]
-// CHECK: ret float [[MUL]]
+// CHECK-LABEL: define dso_local float @test_vmuls_lane_f32(
+// CHECK-SAME: float noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x float> [[B]], i64 1
+// CHECK-NEXT: [[MUL:%.*]] = fmul float [[A]], [[VGET_LANE]]
+// CHECK-NEXT: ret float [[MUL]]
+//
float32_t test_vmuls_lane_f32(float32_t a, float32x2_t b) {
return vmuls_lane_f32(a, b, 1);
}
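
A minimal sketch (not part of the patch; assumes an AArch64 target): adding instcombine to the opt pipeline only canonicalizes the extractelement index from i32 to i64, and the semantics of the lane multiply are unchanged, so vmuls_lane_f32(a, b, 1) is a * vget_lane_f32(b, 1):

#include <arm_neon.h>
#include <stdio.h>

int main(void) {
  float bv[2] = {3.0f, 5.0f};
  float32x2_t b = vld1_f32(bv);
  // Multiply the scalar by lane 1 of b: 2.0 * 5.0 = 10.0.
  printf("%f\n", vmuls_lane_f32(2.0f, b, 1));
  printf("%f\n", 2.0f * vget_lane_f32(b, 1)); // same result
  return 0;
}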
-// CHECK-LABEL: define{{.*}} double @test_vmuld_lane_f64(double noundef %a, <1 x double> noundef %b) #0 {
-// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x double> %b, i32 0
-// CHECK: [[MUL:%.*]] = fmul double %a, [[VGET_LANE]]
-// CHECK: ret double [[MUL]]
+// CHECK-LABEL: define dso_local double @test_vmuld_lane_f64(
+// CHECK-SAME: double noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <1 x double> [[B]], i64 0
+// CHECK-NEXT: [[MUL:%.*]] = fmul double [[A]], [[VGET_LANE]]
+// CHECK-NEXT: ret double [[MUL]]
+//
float64_t test_vmuld_lane_f64(float64_t a, float64x1_t b) {
return vmuld_lane_f64(a, b, 0);
}
-// CHECK-LABEL: define{{.*}} float @test_vmuls_laneq_f32(float noundef %a, <4 x float> noundef %b) #0 {
-// CHECK: [[VGETQ_LANE:%.*]] = extractelement <4 x float> %b, i32 3
-// CHECK: [[MUL:%.*]] = fmul float %a, [[VGETQ_LANE]]
-// CHECK: ret float [[MUL]]
+// CHECK-LABEL: define dso_local float @test_vmuls_laneq_f32(
+// CHECK-SAME: float noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <4 x float> [[B]], i64 3
+// CHECK-NEXT: [[MUL:%.*]] = fmul float [[A]], [[VGETQ_LANE]]
+// CHECK-NEXT: ret float [[MUL]]
+//
float32_t test_vmuls_laneq_f32(float32_t a, float32x4_t b) {
return vmuls_laneq_f32(a, b, 3);
}
-// CHECK-LABEL: define{{.*}} double @test_vmuld_laneq_f64(double noundef %a, <2 x double> noundef %b) #0 {
-// CHECK: [[VGETQ_LANE:%.*]] = extractelement <2 x double> %b, i32 1
-// CHECK: [[MUL:%.*]] = fmul double %a, [[VGETQ_LANE]]
-// CHECK: ret double [[MUL]]
+// CHECK-LABEL: define dso_local double @test_vmuld_laneq_f64(
+// CHECK-SAME: double noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <2 x double> [[B]], i64 1
+// CHECK-NEXT: [[MUL:%.*]] = fmul double [[A]], [[VGETQ_LANE]]
+// CHECK-NEXT: ret double [[MUL]]
+//
float64_t test_vmuld_laneq_f64(float64_t a, float64x2_t b) {
return vmuld_laneq_f64(a, b, 1);
}
-// CHECK-LABEL: define{{.*}} <1 x double> @test_vmul_n_f64(<1 x double> noundef %a, double noundef %b) #0 {
-// CHECK: [[TMP2:%.*]] = bitcast <1 x double> %a to double
-// CHECK: [[TMP3:%.*]] = fmul double [[TMP2]], %b
-// CHECK: [[TMP4:%.*]] = bitcast double [[TMP3]] to <1 x double>
-// CHECK: ret <1 x double> [[TMP4]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vmul_n_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]], double noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x double> [[A]], i64 0
+// CHECK-NEXT: [[TMP1:%.*]] = fmul double [[TMP0]], [[B]]
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast double [[TMP1]] to <1 x double>
+// CHECK-NEXT: ret <1 x double> [[TMP2]]
+//
float64x1_t test_vmul_n_f64(float64x1_t a, float64_t b) {
return vmul_n_f64(a, b);
}
-// CHECK-LABEL: define{{.*}} float @test_vmulxs_lane_f32(float noundef %a, <2 x float> noundef %b) #0 {
-// CHECK: [[VGET_LANE:%.*]] = extractelement <2 x float> %b, i32 1
-// CHECK: [[VMULXS_F32_I:%.*]] = call float @llvm.aarch64.neon.fmulx.f32(float %a, float [[VGET_LANE]])
-// CHECK: ret float [[VMULXS_F32_I]]
+// CHECK-LABEL: define dso_local float @test_vmulxs_lane_f32(
+// CHECK-SAME: float noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x float> [[B]], i64 1
+// CHECK-NEXT: [[VMULXS_F32_I:%.*]] = call float @llvm.aarch64.neon.fmulx.f32(float [[A]], float [[VGET_LANE]])
+// CHECK-NEXT: ret float [[VMULXS_F32_I]]
+//
float32_t test_vmulxs_lane_f32(float32_t a, float32x2_t b) {
return vmulxs_lane_f32(a, b, 1);
}
-// CHECK-LABEL: define{{.*}} float @test_vmulxs_laneq_f32(float noundef %a, <4 x float> noundef %b) #0 {
-// CHECK: [[VGETQ_LANE:%.*]] = extractelement <4 x float> %b, i32 3
-// CHECK: [[VMULXS_F32_I:%.*]] = call float @llvm.aarch64.neon.fmulx.f32(float %a, float [[VGETQ_LANE]])
-// CHECK: ret float [[VMULXS_F32_I]]
+// CHECK-LABEL: define dso_local float @test_vmulxs_laneq_f32(
+// CHECK-SAME: float noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <4 x float> [[B]], i64 3
+// CHECK-NEXT: [[VMULXS_F32_I:%.*]] = call float @llvm.aarch64.neon.fmulx.f32(float [[A]], float [[VGETQ_LANE]])
+// CHECK-NEXT: ret float [[VMULXS_F32_I]]
+//
float32_t test_vmulxs_laneq_f32(float32_t a, float32x4_t b) {
return vmulxs_laneq_f32(a, b, 3);
}
-// CHECK-LABEL: define{{.*}} double @test_vmulxd_lane_f64(double noundef %a, <1 x double> noundef %b) #0 {
-// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x double> %b, i32 0
-// CHECK: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double %a, double [[VGET_LANE]])
-// CHECK: ret double [[VMULXD_F64_I]]
+// CHECK-LABEL: define dso_local double @test_vmulxd_lane_f64(
+// CHECK-SAME: double noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <1 x double> [[B]], i64 0
+// CHECK-NEXT: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[A]], double [[VGET_LANE]])
+// CHECK-NEXT: ret double [[VMULXD_F64_I]]
+//
float64_t test_vmulxd_lane_f64(float64_t a, float64x1_t b) {
return vmulxd_lane_f64(a, b, 0);
}
-// CHECK-LABEL: define{{.*}} double @test_vmulxd_laneq_f64(double noundef %a, <2 x double> noundef %b) #0 {
-// CHECK: [[VGETQ_LANE:%.*]] = extractelement <2 x double> %b, i32 1
-// CHECK: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double %a, double [[VGETQ_LANE]])
-// CHECK: ret double [[VMULXD_F64_I]]
+// CHECK-LABEL: define dso_local double @test_vmulxd_laneq_f64(
+// CHECK-SAME: double noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <2 x double> [[B]], i64 1
+// CHECK-NEXT: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[A]], double [[VGETQ_LANE]])
+// CHECK-NEXT: ret double [[VMULXD_F64_I]]
+//
float64_t test_vmulxd_laneq_f64(float64_t a, float64x2_t b) {
return vmulxd_laneq_f64(a, b, 1);
}
-// CHECK-LABEL: define{{.*}} <1 x double> @test_vmulx_lane_f64(<1 x double> noundef %a, <1 x double> noundef %b) #0 {
-// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x double> %a, i32 0
-// CHECK: [[VGET_LANE6:%.*]] = extractelement <1 x double> %b, i32 0
-// CHECK: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGET_LANE6]])
-// CHECK: [[VSET_LANE:%.*]] = insertelement <1 x double> %a, double [[VMULXD_F64_I]], i32 0
-// CHECK: ret <1 x double> [[VSET_LANE]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vmulx_lane_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <1 x double> [[A]], i64 0
+// CHECK-NEXT: [[VGET_LANE3:%.*]] = extractelement <1 x double> [[B]], i64 0
+// CHECK-NEXT: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGET_LANE3]])
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <1 x double> poison, double [[VMULXD_F64_I]], i64 0
+// CHECK-NEXT: ret <1 x double> [[VSET_LANE]]
+//
float64x1_t test_vmulx_lane_f64(float64x1_t a, float64x1_t b) {
return vmulx_lane_f64(a, b, 0);
}
-// CHECK-LABEL: define{{.*}} <1 x double> @test_vmulx_laneq_f64_0(<1 x double> noundef %a, <2 x double> noundef %b) #0 {
-// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x double> %a, i32 0
-// CHECK: [[VGETQ_LANE:%.*]] = extractelement <2 x double> %b, i32 0
-// CHECK: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGETQ_LANE]])
-// CHECK: [[VSET_LANE:%.*]] = insertelement <1 x double> %a, double [[VMULXD_F64_I]], i32 0
-// CHECK: ret <1 x double> [[VSET_LANE]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vmulx_laneq_f64_0(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <1 x double> [[A]], i64 0
+// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <2 x double> [[B]], i64 0
+// CHECK-NEXT: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGETQ_LANE]])
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <1 x double> poison, double [[VMULXD_F64_I]], i64 0
+// CHECK-NEXT: ret <1 x double> [[VSET_LANE]]
+//
float64x1_t test_vmulx_laneq_f64_0(float64x1_t a, float64x2_t b) {
return vmulx_laneq_f64(a, b, 0);
}
-// CHECK-LABEL: define{{.*}} <1 x double> @test_vmulx_laneq_f64_1(<1 x double> noundef %a, <2 x double> noundef %b) #0 {
-// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x double> %a, i32 0
-// CHECK: [[VGETQ_LANE:%.*]] = extractelement <2 x double> %b, i32 1
-// CHECK: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGETQ_LANE]])
-// CHECK: [[VSET_LANE:%.*]] = insertelement <1 x double> %a, double [[VMULXD_F64_I]], i32 0
-// CHECK: ret <1 x double> [[VSET_LANE]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vmulx_laneq_f64_1(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <1 x double> [[A]], i64 0
+// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <2 x double> [[B]], i64 1
+// CHECK-NEXT: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGETQ_LANE]])
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <1 x double> poison, double [[VMULXD_F64_I]], i64 0
+// CHECK-NEXT: ret <1 x double> [[VSET_LANE]]
+//
float64x1_t test_vmulx_laneq_f64_1(float64x1_t a, float64x2_t b) {
return vmulx_laneq_f64(a, b, 1);
}
-// CHECK-LABEL: define{{.*}} float @test_vfmas_lane_f32(float noundef %a, float noundef %b, <2 x float> noundef %c) #0 {
-// CHECK: [[EXTRACT:%.*]] = extractelement <2 x float> %c, i32 1
-// CHECK: [[TMP2:%.*]] = call float @llvm.fma.f32(float %b, float [[EXTRACT]], float %a)
-// CHECK: ret float [[TMP2]]
+// CHECK-LABEL: define dso_local float @test_vfmas_lane_f32(
+// CHECK-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]], <2 x float> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <2 x float> [[C]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = call float @llvm.fma.f32(float [[B]], float [[EXTRACT]], float [[A]])
+// CHECK-NEXT: ret float [[TMP0]]
+//
float32_t test_vfmas_lane_f32(float32_t a, float32_t b, float32x2_t c) {
return vfmas_lane_f32(a, b, c, 1);
}
-// CHECK-LABEL: define{{.*}} double @test_vfmad_lane_f64(double noundef %a, double noundef %b, <1 x double> noundef %c) #0 {
-// CHECK: [[EXTRACT:%.*]] = extractelement <1 x double> %c, i32 0
-// CHECK: [[TMP2:%.*]] = call double @llvm.fma.f64(double %b, double [[EXTRACT]], double %a)
-// CHECK: ret double [[TMP2]]
+// CHECK-LABEL: define dso_local double @test_vfmad_lane_f64(
+// CHECK-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]], <1 x double> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <1 x double> [[C]], i64 0
+// CHECK-NEXT: [[TMP0:%.*]] = call double @llvm.fma.f64(double [[B]], double [[EXTRACT]], double [[A]])
+// CHECK-NEXT: ret double [[TMP0]]
+//
float64_t test_vfmad_lane_f64(float64_t a, float64_t b, float64x1_t c) {
return vfmad_lane_f64(a, b, c, 0);
}
-// CHECK-LABEL: define{{.*}} double @test_vfmad_laneq_f64(double noundef %a, double noundef %b, <2 x double> noundef %c) #0 {
-// CHECK: [[EXTRACT:%.*]] = extractelement <2 x double> %c, i32 1
-// CHECK: [[TMP2:%.*]] = call double @llvm.fma.f64(double %b, double [[EXTRACT]], double %a)
-// CHECK: ret double [[TMP2]]
+// CHECK-LABEL: define dso_local double @test_vfmad_laneq_f64(
+// CHECK-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]], <2 x double> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <2 x double> [[C]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = call double @llvm.fma.f64(double [[B]], double [[EXTRACT]], double [[A]])
+// CHECK-NEXT: ret double [[TMP0]]
+//
float64_t test_vfmad_laneq_f64(float64_t a, float64_t b, float64x2_t c) {
return vfmad_laneq_f64(a, b, c, 1);
}
-// CHECK-LABEL: define{{.*}} float @test_vfmss_lane_f32(float noundef %a, float noundef %b, <2 x float> noundef %c) #0 {
-// CHECK: [[SUB:%.*]] = fneg float %b
-// CHECK: [[EXTRACT:%.*]] = extractelement <2 x float> %c, i32 1
-// CHECK: [[TMP2:%.*]] = call float @llvm.fma.f32(float [[SUB]], float [[EXTRACT]], float %a)
-// CHECK: ret float [[TMP2]]
+// CHECK-LABEL: define dso_local float @test_vfmss_lane_f32(
+// CHECK-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]], <2 x float> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[FNEG:%.*]] = fneg float [[B]]
+// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <2 x float> [[C]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = call float @llvm.fma.f32(float [[FNEG]], float [[EXTRACT]], float [[A]])
+// CHECK-NEXT: ret float [[TMP0]]
+//
float32_t test_vfmss_lane_f32(float32_t a, float32_t b, float32x2_t c) {
return vfmss_lane_f32(a, b, c, 1);
}
-// CHECK-LABEL: define{{.*}} <1 x double> @test_vfma_lane_f64(<1 x double> noundef %a, <1 x double> noundef %b, <1 x double> noundef %v) #0 {
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <1 x double> %v to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
-// CHECK: [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <1 x i32> zeroinitializer
-// CHECK: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
-// CHECK: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
-// CHECK: [[FMLA2:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[FMLA]], <1 x double> [[LANE]], <1 x double> [[FMLA1]])
-// CHECK: ret <1 x double> [[FMLA2]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vfma_lane_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <1 x double> noundef [[V:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[FMLA2:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[B]], <1 x double> [[V]], <1 x double> [[A]])
+// CHECK-NEXT: ret <1 x double> [[FMLA2]]
+//
float64x1_t test_vfma_lane_f64(float64x1_t a, float64x1_t b, float64x1_t v) {
return vfma_lane_f64(a, b, v, 0);
}
-// CHECK-LABEL: define{{.*}} <1 x double> @test_vfms_lane_f64(<1 x double> noundef %a, <1 x double> noundef %b, <1 x double> noundef %v) #0 {
-// CHECK: [[SUB:%.*]] = fneg <1 x double> %b
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x double> [[SUB]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <1 x double> %v to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
-// CHECK: [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <1 x i32> zeroinitializer
-// CHECK: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
-// CHECK: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
-// CHECK: [[FMLA2:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[FMLA]], <1 x double> [[LANE]], <1 x double> [[FMLA1]])
-// CHECK: ret <1 x double> [[FMLA2]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vfms_lane_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <1 x double> noundef [[V:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[FNEG:%.*]] = fneg <1 x double> [[B]]
+// CHECK-NEXT: [[FMLA2:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[FNEG]], <1 x double> [[V]], <1 x double> [[A]])
+// CHECK-NEXT: ret <1 x double> [[FMLA2]]
+//
float64x1_t test_vfms_lane_f64(float64x1_t a, float64x1_t b, float64x1_t v) {
return vfms_lane_f64(a, b, v, 0);
}
-// CHECK-LABEL: define{{.*}} <1 x double> @test_vfma_laneq_f64(<1 x double> noundef %a, <1 x double> noundef %b, <2 x double> noundef %v) #0 {
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x double> %v to <16 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to double
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to double
-// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
-// CHECK: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
-// CHECK: [[TMP6:%.*]] = call double @llvm.fma.f64(double [[TMP4]], double [[EXTRACT]], double [[TMP3]])
-// CHECK: [[TMP7:%.*]] = bitcast double [[TMP6]] to <1 x double>
-// CHECK: ret <1 x double> [[TMP7]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vfma_laneq_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <2 x double> noundef [[V:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x double> [[A]], i64 0
+// CHECK-NEXT: [[TMP1:%.*]] = extractelement <1 x double> [[B]], i64 0
+// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <2 x double> [[V]], i64 0
+// CHECK-NEXT: [[TMP2:%.*]] = call double @llvm.fma.f64(double [[TMP1]], double [[EXTRACT]], double [[TMP0]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to <1 x double>
+// CHECK-NEXT: ret <1 x double> [[TMP3]]
+//
float64x1_t test_vfma_laneq_f64(float64x1_t a, float64x1_t b, float64x2_t v) {
return vfma_laneq_f64(a, b, v, 0);
}
-// CHECK-LABEL: define{{.*}} <1 x double> @test_vfms_laneq_f64(<1 x double> noundef %a, <1 x double> noundef %b, <2 x double> noundef %v) #0 {
-// CHECK: [[SUB:%.*]] = fneg <1 x double> %b
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x double> [[SUB]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x double> %v to <16 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to double
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to double
-// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
-// CHECK: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
-// CHECK: [[TMP6:%.*]] = call double @llvm.fma.f64(double [[TMP4]], double [[EXTRACT]], double [[TMP3]])
-// CHECK: [[TMP7:%.*]] = bitcast double [[TMP6]] to <1 x double>
-// CHECK: ret <1 x double> [[TMP7]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vfms_laneq_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <2 x double> noundef [[V:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x double> [[A]], i64 0
+// CHECK-NEXT: [[TMP1:%.*]] = extractelement <1 x double> [[B]], i64 0
+// CHECK-NEXT: [[TMP2:%.*]] = fneg double [[TMP1]]
+// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <2 x double> [[V]], i64 0
+// CHECK-NEXT: [[TMP3:%.*]] = call double @llvm.fma.f64(double [[TMP2]], double [[EXTRACT]], double [[TMP0]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast double [[TMP3]] to <1 x double>
+// CHECK-NEXT: ret <1 x double> [[TMP4]]
+//
float64x1_t test_vfms_laneq_f64(float64x1_t a, float64x1_t b, float64x2_t v) {
return vfms_laneq_f64(a, b, v, 0);
}
-// CHECK-LABEL: define{{.*}} i32 @test_vqdmullh_lane_s16(i16 noundef %a, <4 x i16> noundef %b) #0 {
-// CHECK: [[VGET_LANE:%.*]] = extractelement <4 x i16> %b, i32 3
-// CHECK: [[TMP2:%.*]] = insertelement <4 x i16> poison, i16 %a, i64 0
-// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[VGET_LANE]], i64 0
-// CHECK: [[VQDMULLH_S16_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]])
-// CHECK: [[TMP4:%.*]] = extractelement <4 x i32> [[VQDMULLH_S16_I]], i64 0
-// CHECK: ret i32 [[TMP4]]
+// CHECK-LABEL: define dso_local i32 @test_vqdmullh_lane_s16(
+// CHECK-SAME: i16 noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[B]], <4 x i16> poison, <4 x i32> <i32 3, i32 poison, i32 poison, i32 poison>
+// CHECK-NEXT: [[VQDMULLH_S16_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[VQDMULLH_S16_I]], i64 0
+// CHECK-NEXT: ret i32 [[TMP2]]
+//
int32_t test_vqdmullh_lane_s16(int16_t a, int16x4_t b) {
return vqdmullh_lane_s16(a, b, 3);
}
-// CHECK-LABEL: define{{.*}} i64 @test_vqdmulls_lane_s32(i32 noundef %a, <2 x i32> noundef %b) #0 {
-// CHECK: [[VGET_LANE:%.*]] = extractelement <2 x i32> %b, i32 1
-// CHECK: [[VQDMULLS_S32_I:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %a, i32 [[VGET_LANE]])
-// CHECK: ret i64 [[VQDMULLS_S32_I]]
+// CHECK-LABEL: define dso_local i64 @test_vqdmulls_lane_s32(
+// CHECK-SAME: i32 noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[B]], i64 1
+// CHECK-NEXT: [[VQDMULLS_S32_I:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 [[A]], i32 [[VGET_LANE]])
+// CHECK-NEXT: ret i64 [[VQDMULLS_S32_I]]
+//
int64_t test_vqdmulls_lane_s32(int32_t a, int32x2_t b) {
return vqdmulls_lane_s32(a, b, 1);
}
-// CHECK-LABEL: define{{.*}} i32 @test_vqdmullh_laneq_s16(i16 noundef %a, <8 x i16> noundef %b) #0 {
-// CHECK: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> %b, i32 7
-// CHECK: [[TMP2:%.*]] = insertelement <4 x i16> poison, i16 %a, i64 0
-// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[VGETQ_LANE]], i64 0
-// CHECK: [[VQDMULLH_S16_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]])
-// CHECK: [[TMP4:%.*]] = extractelement <4 x i32> [[VQDMULLH_S16_I]], i64 0
-// CHECK: ret i32 [[TMP4]]
+// CHECK-LABEL: define dso_local i32 @test_vqdmullh_laneq_s16(
+// CHECK-SAME: i16 noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <4 x i32> <i32 7, i32 poison, i32 poison, i32 poison>
+// CHECK-NEXT: [[VQDMULLH_S16_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[VQDMULLH_S16_I]], i64 0
+// CHECK-NEXT: ret i32 [[TMP2]]
+//
int32_t test_vqdmullh_laneq_s16(int16_t a, int16x8_t b) {
return vqdmullh_laneq_s16(a, b, 7);
}
-// CHECK-LABEL: define{{.*}} i64 @test_vqdmulls_laneq_s32(i32 noundef %a, <4 x i32> noundef %b) #0 {
-// CHECK: [[VGETQ_LANE:%.*]] = extractelement <4 x i32> %b, i32 3
-// CHECK: [[VQDMULLS_S32_I:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %a, i32 [[VGETQ_LANE]])
-// CHECK: ret i64 [[VQDMULLS_S32_I]]
+// CHECK-LABEL: define dso_local i64 @test_vqdmulls_laneq_s32(
+// CHECK-SAME: i32 noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[B]], i64 3
+// CHECK-NEXT: [[VQDMULLS_S32_I:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 [[A]], i32 [[VGETQ_LANE]])
+// CHECK-NEXT: ret i64 [[VQDMULLS_S32_I]]
+//
int64_t test_vqdmulls_laneq_s32(int32_t a, int32x4_t b) {
return vqdmulls_laneq_s32(a, b, 3);
}
-// CHECK-LABEL: define{{.*}} i16 @test_vqdmulhh_lane_s16(i16 noundef %a, <4 x i16> noundef %b) #0 {
-// CHECK: [[VGET_LANE:%.*]] = extractelement <4 x i16> %b, i32 3
-// CHECK: [[TMP2:%.*]] = insertelement <4 x i16> poison, i16 %a, i64 0
-// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[VGET_LANE]], i64 0
-// CHECK: [[VQDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]])
-// CHECK: [[TMP4:%.*]] = extractelement <4 x i16> [[VQDMULHH_S16_I]], i64 0
-// CHECK: ret i16 [[TMP4]]
+// CHECK-LABEL: define dso_local i16 @test_vqdmulhh_lane_s16(
+// CHECK-SAME: i16 noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[B]], <4 x i16> poison, <4 x i32> <i32 3, i32 poison, i32 poison, i32 poison>
+// CHECK-NEXT: [[VQDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[VQDMULHH_S16_I]], i64 0
+// CHECK-NEXT: ret i16 [[TMP2]]
+//
int16_t test_vqdmulhh_lane_s16(int16_t a, int16x4_t b) {
return vqdmulhh_lane_s16(a, b, 3);
}
-// CHECK-LABEL: define{{.*}} i32 @test_vqdmulhs_lane_s32(i32 noundef %a, <2 x i32> noundef %b) #0 {
-// CHECK: [[VGET_LANE:%.*]] = extractelement <2 x i32> %b, i32 1
-// CHECK: [[VQDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqdmulh.i32(i32 %a, i32 [[VGET_LANE]])
-// CHECK: ret i32 [[VQDMULHS_S32_I]]
+// CHECK-LABEL: define dso_local i32 @test_vqdmulhs_lane_s32(
+// CHECK-SAME: i32 noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[B]], i64 1
+// CHECK-NEXT: [[VQDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqdmulh.i32(i32 [[A]], i32 [[VGET_LANE]])
+// CHECK-NEXT: ret i32 [[VQDMULHS_S32_I]]
+//
int32_t test_vqdmulhs_lane_s32(int32_t a, int32x2_t b) {
return vqdmulhs_lane_s32(a, b, 1);
}
-// CHECK-LABEL: define{{.*}} i16 @test_vqdmulhh_laneq_s16(i16 noundef %a, <8 x i16> noundef %b) #0 {
-// CHECK: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> %b, i32 7
-// CHECK: [[TMP2:%.*]] = insertelement <4 x i16> poison, i16 %a, i64 0
-// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[VGETQ_LANE]], i64 0
-// CHECK: [[VQDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]])
-// CHECK: [[TMP4:%.*]] = extractelement <4 x i16> [[VQDMULHH_S16_I]], i64 0
-// CHECK: ret i16 [[TMP4]]
+// CHECK-LABEL: define dso_local i16 @test_vqdmulhh_laneq_s16(
+// CHECK-SAME: i16 noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <4 x i32> <i32 7, i32 poison, i32 poison, i32 poison>
+// CHECK-NEXT: [[VQDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[VQDMULHH_S16_I]], i64 0
+// CHECK-NEXT: ret i16 [[TMP2]]
+//
int16_t test_vqdmulhh_laneq_s16(int16_t a, int16x8_t b) {
return vqdmulhh_laneq_s16(a, b, 7);
}
-// CHECK-LABEL: define{{.*}} i32 @test_vqdmulhs_laneq_s32(i32 noundef %a, <4 x i32> noundef %b) #0 {
-// CHECK: [[VGETQ_LANE:%.*]] = extractelement <4 x i32> %b, i32 3
-// CHECK: [[VQDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqdmulh.i32(i32 %a, i32 [[VGETQ_LANE]])
-// CHECK: ret i32 [[VQDMULHS_S32_I]]
+// CHECK-LABEL: define dso_local i32 @test_vqdmulhs_laneq_s32(
+// CHECK-SAME: i32 noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[B]], i64 3
+// CHECK-NEXT: [[VQDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqdmulh.i32(i32 [[A]], i32 [[VGETQ_LANE]])
+// CHECK-NEXT: ret i32 [[VQDMULHS_S32_I]]
+//
int32_t test_vqdmulhs_laneq_s32(int32_t a, int32x4_t b) {
return vqdmulhs_laneq_s32(a, b, 3);
}
-// CHECK-LABEL: define{{.*}} i16 @test_vqrdmulhh_lane_s16(i16 noundef %a, <4 x i16> noundef %b) #0 {
-// CHECK: [[VGET_LANE:%.*]] = extractelement <4 x i16> %b, i32 3
-// CHECK: [[TMP2:%.*]] = insertelement <4 x i16> poison, i16 %a, i64 0
-// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[VGET_LANE]], i64 0
-// CHECK: [[VQRDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]])
-// CHECK: [[TMP4:%.*]] = extractelement <4 x i16> [[VQRDMULHH_S16_I]], i64 0
-// CHECK: ret i16 [[TMP4]]
+// CHECK-LABEL: define dso_local i16 @test_vqrdmulhh_lane_s16(
+// CHECK-SAME: i16 noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[B]], <4 x i16> poison, <4 x i32> <i32 3, i32 poison, i32 poison, i32 poison>
+// CHECK-NEXT: [[VQRDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[VQRDMULHH_S16_I]], i64 0
+// CHECK-NEXT: ret i16 [[TMP2]]
+//
int16_t test_vqrdmulhh_lane_s16(int16_t a, int16x4_t b) {
return vqrdmulhh_lane_s16(a, b, 3);
}
-// CHECK-LABEL: define{{.*}} i32 @test_vqrdmulhs_lane_s32(i32 noundef %a, <2 x i32> noundef %b) #0 {
-// CHECK: [[VGET_LANE:%.*]] = extractelement <2 x i32> %b, i32 1
-// CHECK: [[VQRDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %a, i32 [[VGET_LANE]])
-// CHECK: ret i32 [[VQRDMULHS_S32_I]]
+// CHECK-LABEL: define dso_local i32 @test_vqrdmulhs_lane_s32(
+// CHECK-SAME: i32 noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[B]], i64 1
+// CHECK-NEXT: [[VQRDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 [[A]], i32 [[VGET_LANE]])
+// CHECK-NEXT: ret i32 [[VQRDMULHS_S32_I]]
+//
int32_t test_vqrdmulhs_lane_s32(int32_t a, int32x2_t b) {
return vqrdmulhs_lane_s32(a, b, 1);
}
-// CHECK-LABEL: define{{.*}} i16 @test_vqrdmulhh_laneq_s16(i16 noundef %a, <8 x i16> noundef %b) #0 {
-// CHECK: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> %b, i32 7
-// CHECK: [[TMP2:%.*]] = insertelement <4 x i16> poison, i16 %a, i64 0
-// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[VGETQ_LANE]], i64 0
-// CHECK: [[VQRDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]])
-// CHECK: [[TMP4:%.*]] = extractelement <4 x i16> [[VQRDMULHH_S16_I]], i64 0
-// CHECK: ret i16 [[TMP4]]
+// CHECK-LABEL: define dso_local i16 @test_vqrdmulhh_laneq_s16(
+// CHECK-SAME: i16 noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <4 x i32> <i32 7, i32 poison, i32 poison, i32 poison>
+// CHECK-NEXT: [[VQRDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[VQRDMULHH_S16_I]], i64 0
+// CHECK-NEXT: ret i16 [[TMP2]]
+//
int16_t test_vqrdmulhh_laneq_s16(int16_t a, int16x8_t b) {
return vqrdmulhh_laneq_s16(a, b, 7);
}
-// CHECK-LABEL: define{{.*}} i32 @test_vqrdmulhs_laneq_s32(i32 noundef %a, <4 x i32> noundef %b) #0 {
-// CHECK: [[VGETQ_LANE:%.*]] = extractelement <4 x i32> %b, i32 3
-// CHECK: [[VQRDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %a, i32 [[VGETQ_LANE]])
-// CHECK: ret i32 [[VQRDMULHS_S32_I]]
+// CHECK-LABEL: define dso_local i32 @test_vqrdmulhs_laneq_s32(
+// CHECK-SAME: i32 noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[B]], i64 3
+// CHECK-NEXT: [[VQRDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 [[A]], i32 [[VGETQ_LANE]])
+// CHECK-NEXT: ret i32 [[VQRDMULHS_S32_I]]
+//
int32_t test_vqrdmulhs_laneq_s32(int32_t a, int32x4_t b) {
return vqrdmulhs_laneq_s32(a, b, 3);
}
-// CHECK-LABEL: define{{.*}} i32 @test_vqdmlalh_lane_s16(i32 noundef %a, i16 noundef %b, <4 x i16> noundef %c) #0 {
-// CHECK: [[LANE:%.*]] = extractelement <4 x i16> %c, i32 3
-// CHECK: [[TMP2:%.*]] = insertelement <4 x i16> poison, i16 %b, i64 0
-// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[LANE]], i64 0
-// CHECK: [[VQDMLXL:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]])
-// CHECK: [[LANE0:%.*]] = extractelement <4 x i32> [[VQDMLXL]], i64 0
-// CHECK: [[VQDMLXL1:%.*]] = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %a, i32 [[LANE0]])
-// CHECK: ret i32 [[VQDMLXL1]]
+// CHECK-LABEL: define dso_local i32 @test_vqdmlalh_lane_s16(
+// CHECK-SAME: i32 noundef [[A:%.*]], i16 noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i64 0
+// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[C]], <4 x i16> poison, <4 x i32> <i32 3, i32 poison, i32 poison, i32 poison>
+// CHECK-NEXT: [[VQDMLXL:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// CHECK-NEXT: [[LANE0:%.*]] = extractelement <4 x i32> [[VQDMLXL]], i64 0
+// CHECK-NEXT: [[VQDMLXL1:%.*]] = call i32 @llvm.aarch64.neon.sqadd.i32(i32 [[A]], i32 [[LANE0]])
+// CHECK-NEXT: ret i32 [[VQDMLXL1]]
+//
int32_t test_vqdmlalh_lane_s16(int32_t a, int16_t b, int16x4_t c) {
return vqdmlalh_lane_s16(a, b, c, 3);
}
-// CHECK-LABEL: define{{.*}} i64 @test_vqdmlals_lane_s32(i64 noundef %a, i32 noundef %b, <2 x i32> noundef %c) #0 {
-// CHECK: [[LANE:%.*]] = extractelement <2 x i32> %c, i32 1
-// CHECK: [[VQDMLXL:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %b, i32 [[LANE]])
-// CHECK: [[VQDMLXL1:%.*]] = call i64 @llvm.aarch64.neon.sqadd.i64(i64 %a, i64 [[VQDMLXL]])
-// CHECK: ret i64 [[VQDMLXL1]]
+// CHECK-LABEL: define dso_local i64 @test_vqdmlals_lane_s32(
+// CHECK-SAME: i64 noundef [[A:%.*]], i32 noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[LANE:%.*]] = extractelement <2 x i32> [[C]], i64 1
+// CHECK-NEXT: [[VQDMLXL:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 [[B]], i32 [[LANE]])
+// CHECK-NEXT: [[VQDMLXL1:%.*]] = call i64 @llvm.aarch64.neon.sqadd.i64(i64 [[A]], i64 [[VQDMLXL]])
+// CHECK-NEXT: ret i64 [[VQDMLXL1]]
+//
int64_t test_vqdmlals_lane_s32(int64_t a, int32_t b, int32x2_t c) {
return vqdmlals_lane_s32(a, b, c, 1);
}
-// CHECK-LABEL: define{{.*}} i32 @test_vqdmlalh_laneq_s16(i32 noundef %a, i16 noundef %b, <8 x i16> noundef %c) #0 {
-// CHECK: [[LANE:%.*]] = extractelement <8 x i16> %c, i32 7
-// CHECK: [[TMP2:%.*]] = insertelement <4 x i16> poison, i16 %b, i64 0
-// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[LANE]], i64 0
-// CHECK: [[VQDMLXL:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]])
-// CHECK: [[LANE0:%.*]] = extractelement <4 x i32> [[VQDMLXL]], i64 0
-// CHECK: [[VQDMLXL1:%.*]] = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %a, i32 [[LANE0]])
-// CHECK: ret i32 [[VQDMLXL1]]
+// CHECK-LABEL: define dso_local i32 @test_vqdmlalh_laneq_s16(
+// CHECK-SAME: i32 noundef [[A:%.*]], i16 noundef [[B:%.*]], <8 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i64 0
+// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[C]], <8 x i16> poison, <4 x i32> <i32 7, i32 poison, i32 poison, i32 poison>
+// CHECK-NEXT: [[VQDMLXL:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// CHECK-NEXT: [[LANE0:%.*]] = extractelement <4 x i32> [[VQDMLXL]], i64 0
+// CHECK-NEXT: [[VQDMLXL1:%.*]] = call i32 @llvm.aarch64.neon.sqadd.i32(i32 [[A]], i32 [[LANE0]])
+// CHECK-NEXT: ret i32 [[VQDMLXL1]]
+//
int32_t test_vqdmlalh_laneq_s16(int32_t a, int16_t b, int16x8_t c) {
return vqdmlalh_laneq_s16(a, b, c, 7);
}
-// CHECK-LABEL: define{{.*}} i64 @test_vqdmlals_laneq_s32(i64 noundef %a, i32 noundef %b, <4 x i32> noundef %c) #0 {
-// CHECK: [[LANE:%.*]] = extractelement <4 x i32> %c, i32 3
-// CHECK: [[VQDMLXL:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %b, i32 [[LANE]])
-// CHECK: [[VQDMLXL1:%.*]] = call i64 @llvm.aarch64.neon.sqadd.i64(i64 %a, i64 [[VQDMLXL]])
-// CHECK: ret i64 [[VQDMLXL1]]
+// CHECK-LABEL: define dso_local i64 @test_vqdmlals_laneq_s32(
+// CHECK-SAME: i64 noundef [[A:%.*]], i32 noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[LANE:%.*]] = extractelement <4 x i32> [[C]], i64 3
+// CHECK-NEXT: [[VQDMLXL:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 [[B]], i32 [[LANE]])
+// CHECK-NEXT: [[VQDMLXL1:%.*]] = call i64 @llvm.aarch64.neon.sqadd.i64(i64 [[A]], i64 [[VQDMLXL]])
+// CHECK-NEXT: ret i64 [[VQDMLXL1]]
+//
int64_t test_vqdmlals_laneq_s32(int64_t a, int32_t b, int32x4_t c) {
return vqdmlals_laneq_s32(a, b, c, 3);
}
-// CHECK-LABEL: define{{.*}} i32 @test_vqdmlslh_lane_s16(i32 noundef %a, i16 noundef %b, <4 x i16> noundef %c) #0 {
-// CHECK: [[LANE:%.*]] = extractelement <4 x i16> %c, i32 3
-// CHECK: [[TMP2:%.*]] = insertelement <4 x i16> poison, i16 %b, i64 0
-// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[LANE]], i64 0
-// CHECK: [[VQDMLXL:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]])
-// CHECK: [[LANE0:%.*]] = extractelement <4 x i32> [[VQDMLXL]], i64 0
-// CHECK: [[VQDMLXL1:%.*]] = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %a, i32 [[LANE0]])
-// CHECK: ret i32 [[VQDMLXL1]]
+// CHECK-LABEL: define dso_local i32 @test_vqdmlslh_lane_s16(
+// CHECK-SAME: i32 noundef [[A:%.*]], i16 noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i64 0
+// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[C]], <4 x i16> poison, <4 x i32> <i32 3, i32 poison, i32 poison, i32 poison>
+// CHECK-NEXT: [[VQDMLXL:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// CHECK-NEXT: [[LANE0:%.*]] = extractelement <4 x i32> [[VQDMLXL]], i64 0
+// CHECK-NEXT: [[VQDMLXL1:%.*]] = call i32 @llvm.aarch64.neon.sqsub.i32(i32 [[A]], i32 [[LANE0]])
+// CHECK-NEXT: ret i32 [[VQDMLXL1]]
+//
int32_t test_vqdmlslh_lane_s16(int32_t a, int16_t b, int16x4_t c) {
return vqdmlslh_lane_s16(a, b, c, 3);
}
-// CHECK-LABEL: define{{.*}} i64 @test_vqdmlsls_lane_s32(i64 noundef %a, i32 noundef %b, <2 x i32> noundef %c) #0 {
-// CHECK: [[LANE:%.*]] = extractelement <2 x i32> %c, i32 1
-// CHECK: [[VQDMLXL:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %b, i32 [[LANE]])
-// CHECK: [[VQDMLXL1:%.*]] = call i64 @llvm.aarch64.neon.sqsub.i64(i64 %a, i64 [[VQDMLXL]])
-// CHECK: ret i64 [[VQDMLXL1]]
+// CHECK-LABEL: define dso_local i64 @test_vqdmlsls_lane_s32(
+// CHECK-SAME: i64 noundef [[A:%.*]], i32 noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[LANE:%.*]] = extractelement <2 x i32> [[C]], i64 1
+// CHECK-NEXT: [[VQDMLXL:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 [[B]], i32 [[LANE]])
+// CHECK-NEXT: [[VQDMLXL1:%.*]] = call i64 @llvm.aarch64.neon.sqsub.i64(i64 [[A]], i64 [[VQDMLXL]])
+// CHECK-NEXT: ret i64 [[VQDMLXL1]]
+//
int64_t test_vqdmlsls_lane_s32(int64_t a, int32_t b, int32x2_t c) {
return vqdmlsls_lane_s32(a, b, c, 1);
}
-// CHECK-LABEL: define{{.*}} i32 @test_vqdmlslh_laneq_s16(i32 noundef %a, i16 noundef %b, <8 x i16> noundef %c) #0 {
-// CHECK: [[LANE:%.*]] = extractelement <8 x i16> %c, i32 7
-// CHECK: [[TMP2:%.*]] = insertelement <4 x i16> poison, i16 %b, i64 0
-// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[LANE]], i64 0
-// CHECK: [[VQDMLXL:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]])
-// CHECK: [[LANE0:%.*]] = extractelement <4 x i32> [[VQDMLXL]], i64 0
-// CHECK: [[VQDMLXL1:%.*]] = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %a, i32 [[LANE0]])
-// CHECK: ret i32 [[VQDMLXL1]]
+// CHECK-LABEL: define dso_local i32 @test_vqdmlslh_laneq_s16(
+// CHECK-SAME: i32 noundef [[A:%.*]], i16 noundef [[B:%.*]], <8 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i64 0
+// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[C]], <8 x i16> poison, <4 x i32> <i32 7, i32 poison, i32 poison, i32 poison>
+// CHECK-NEXT: [[VQDMLXL:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// CHECK-NEXT: [[LANE0:%.*]] = extractelement <4 x i32> [[VQDMLXL]], i64 0
+// CHECK-NEXT: [[VQDMLXL1:%.*]] = call i32 @llvm.aarch64.neon.sqsub.i32(i32 [[A]], i32 [[LANE0]])
+// CHECK-NEXT: ret i32 [[VQDMLXL1]]
+//
int32_t test_vqdmlslh_laneq_s16(int32_t a, int16_t b, int16x8_t c) {
return vqdmlslh_laneq_s16(a, b, c, 7);
}
-// CHECK-LABEL: define{{.*}} i64 @test_vqdmlsls_laneq_s32(i64 noundef %a, i32 noundef %b, <4 x i32> noundef %c) #0 {
-// CHECK: [[LANE:%.*]] = extractelement <4 x i32> %c, i32 3
-// CHECK: [[VQDMLXL:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %b, i32 [[LANE]])
-// CHECK: [[VQDMLXL1:%.*]] = call i64 @llvm.aarch64.neon.sqsub.i64(i64 %a, i64 [[VQDMLXL]])
-// CHECK: ret i64 [[VQDMLXL1]]
+// CHECK-LABEL: define dso_local i64 @test_vqdmlsls_laneq_s32(
+// CHECK-SAME: i64 noundef [[A:%.*]], i32 noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[LANE:%.*]] = extractelement <4 x i32> [[C]], i64 3
+// CHECK-NEXT: [[VQDMLXL:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 [[B]], i32 [[LANE]])
+// CHECK-NEXT: [[VQDMLXL1:%.*]] = call i64 @llvm.aarch64.neon.sqsub.i64(i64 [[A]], i64 [[VQDMLXL]])
+// CHECK-NEXT: ret i64 [[VQDMLXL1]]
+//
int64_t test_vqdmlsls_laneq_s32(int64_t a, int32_t b, int32x4_t c) {
return vqdmlsls_laneq_s32(a, b, c, 3);
}
-// CHECK-LABEL: define{{.*}} <1 x double> @test_vmulx_lane_f64_0() #0 {
-// CHECK: [[TMP0:%.*]] = bitcast i64 4599917171378402754 to <1 x double>
-// CHECK: [[TMP1:%.*]] = bitcast i64 4606655882138939123 to <1 x double>
-// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP0]], i32 0
-// CHECK: [[VGET_LANE7:%.*]] = extractelement <1 x double> [[TMP1]], i32 0
-// CHECK: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGET_LANE7]])
-// CHECK: [[VSET_LANE:%.*]] = insertelement <1 x double> [[TMP0]], double [[VMULXD_F64_I]], i32 0
-// CHECK: ret <1 x double> [[VSET_LANE]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vmulx_lane_f64_0(
+// CHECK-SAME: ) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double 0x3FD6304BC43AB5C2, double 0x3FEE211E215AEEF3)
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <1 x double> poison, double [[VMULXD_F64_I]], i64 0
+// CHECK-NEXT: ret <1 x double> [[VSET_LANE]]
+//
float64x1_t test_vmulx_lane_f64_0() {
float64x1_t arg1;
float64x1_t arg2;
@@ -425,15 +513,13 @@ float64x1_t test_vmulx_lane_f64_0() {
return result;
}
-// CHECK-LABEL: define{{.*}} <1 x double> @test_vmulx_laneq_f64_2() #0 {
-// CHECK: [[TMP0:%.*]] = bitcast i64 4599917171378402754 to <1 x double>
-// CHECK: [[TMP1:%.*]] = bitcast i64 4606655882138939123 to <1 x double>
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <1 x double> [[TMP0]], <1 x double> [[TMP1]], <2 x i32> <i32 0, i32 1>
-// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP0]], i32 0
-// CHECK: [[VGETQ_LANE:%.*]] = extractelement <2 x double> [[SHUFFLE_I]], i32 1
-// CHECK: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGETQ_LANE]])
-// CHECK: [[VSET_LANE:%.*]] = insertelement <1 x double> [[TMP0]], double [[VMULXD_F64_I]], i32 0
-// CHECK: ret <1 x double> [[VSET_LANE]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vmulx_laneq_f64_2(
+// CHECK-SAME: ) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double 0x3FD6304BC43AB5C2, double 0x3FEE211E215AEEF3)
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <1 x double> poison, double [[VMULXD_F64_I]], i64 0
+// CHECK-NEXT: ret <1 x double> [[VSET_LANE]]
+//
float64x1_t test_vmulx_laneq_f64_2() {
float64x1_t arg1;
float64x1_t arg2;
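
As a minimal sketch of what the updated checks in this file verify (the helper
name below is mine, not part of the patch): with instcombine now in the test
pipeline, a lane-indexed scalar intrinsic collapses to a single extractelement
feeding the scalar operation, which is why the checks above match `i64` lane
indices and direct argument uses instead of bitcast chains.

#include <arm_neon.h>

// Expected lowering after mem2reg,instcombine: fneg of x, then a call to
// llvm.fma.f32 taking element 1 of v as the multiplier.
float32_t fms_lane1(float32_t acc, float32_t x, float32x2_t v) {
  return vfmss_lane_f32(acc, x, v, 1);
}
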
diff --git a/clang/test/CodeGen/AArch64/poly-add.c b/clang/test/CodeGen/AArch64/poly-add.c
index 0795aecac433f2..d408e919639b28 100644
--- a/clang/test/CodeGen/AArch64/poly-add.c
+++ b/clang/test/CodeGen/AArch64/poly-add.c
@@ -1,6 +1,6 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
// RUN: %clang_cc1 -triple aarch64-linux-gnu -target-feature +neon \
-// RUN: -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg \
+// RUN: -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine \
// RUN: | FileCheck %s
// REQUIRES: aarch64-registered-target
@@ -18,11 +18,8 @@ poly8x8_t test_vadd_p8(poly8x8_t a, poly8x8_t b) {
// CHECK-LABEL: @test_vadd_p16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = xor <8 x i8> [[TMP0]], [[TMP1]]
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
-// CHECK-NEXT: ret <4 x i16> [[TMP3]]
+// CHECK-NEXT: [[TMP0:%.*]] = xor <4 x i16> [[A:%.*]], [[B:%.*]]
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
//
poly16x4_t test_vadd_p16(poly16x4_t a, poly16x4_t b) {
return vadd_p16 (a, b);
@@ -30,11 +27,8 @@ poly16x4_t test_vadd_p16(poly16x4_t a, poly16x4_t b) {
// CHECK-LABEL: @test_vadd_p64(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = xor <8 x i8> [[TMP0]], [[TMP1]]
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x i64>
-// CHECK-NEXT: ret <1 x i64> [[TMP3]]
+// CHECK-NEXT: [[TMP0:%.*]] = xor <1 x i64> [[A:%.*]], [[B:%.*]]
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
//
poly64x1_t test_vadd_p64(poly64x1_t a, poly64x1_t b) {
return vadd_p64(a, b);
@@ -51,11 +45,8 @@ poly8x16_t test_vaddq_p8(poly8x16_t a, poly8x16_t b){
// CHECK-LABEL: @test_vaddq_p16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = xor <16 x i8> [[TMP0]], [[TMP1]]
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
-// CHECK-NEXT: ret <8 x i16> [[TMP3]]
+// CHECK-NEXT: [[TMP0:%.*]] = xor <8 x i16> [[A:%.*]], [[B:%.*]]
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
//
poly16x8_t test_vaddq_p16(poly16x8_t a, poly16x8_t b){
return vaddq_p16(a, b);
@@ -63,11 +54,8 @@ poly16x8_t test_vaddq_p16(poly16x8_t a, poly16x8_t b){
// CHECK-LABEL: @test_vaddq_p64(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = xor <16 x i8> [[TMP0]], [[TMP1]]
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64>
-// CHECK-NEXT: ret <2 x i64> [[TMP3]]
+// CHECK-NEXT: [[TMP0:%.*]] = xor <2 x i64> [[A:%.*]], [[B:%.*]]
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
//
poly64x2_t test_vaddq_p64(poly64x2_t a, poly64x2_t b){
return vaddq_p64(a, b);
@@ -75,11 +63,8 @@ poly64x2_t test_vaddq_p64(poly64x2_t a, poly64x2_t b){
// CHECK-LABEL: @test_vaddq_p128(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[B:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = xor <16 x i8> [[TMP0]], [[TMP1]]
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
-// CHECK-NEXT: ret i128 [[TMP3]]
+// CHECK-NEXT: [[TMP0:%.*]] = xor i128 [[A:%.*]], [[B:%.*]]
+// CHECK-NEXT: ret i128 [[TMP0]]
//
poly128_t test_vaddq_p128 (poly128_t a, poly128_t b){
return vaddq_p128(a, b);
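
A short aside on why these checks reduce to a bare xor (the helper name below
is mine, not from the patch): polynomial addition over GF(2) has no carries,
so every vadd_p* variant is a bitwise exclusive-or, and instcombine lets the
test match that xor directly on the intrinsic's own vector type instead of
through <8 x i8>/<16 x i8> bitcasts.

#include <arm_neon.h>

// Checked above to lower to a single: xor <4 x i16> a, b
poly16x4_t poly_sum(poly16x4_t a, poly16x4_t b) {
  return vadd_p16(a, b);
}
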
diff --git a/clang/test/CodeGen/AArch64/poly128.c b/clang/test/CodeGen/AArch64/poly128.c
index f188632468fc83..be683b17e0d0b8 100644
--- a/clang/test/CodeGen/AArch64/poly128.c
+++ b/clang/test/CodeGen/AArch64/poly128.c
@@ -1,6 +1,6 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature
// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
-// RUN: -disable-O0-optnone -ffp-contract=fast -emit-llvm -o - %s | opt -S -passes=mem2reg \
+// RUN: -disable-O0-optnone -ffp-contract=fast -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine \
// RUN: | FileCheck %s
// REQUIRES: aarch64-registered-target
@@ -40,7 +40,7 @@ poly128_t test_vldrq_p128(poly128_t * ptr) {
// CHECK-SAME: (ptr noundef [[PTR:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr [[PTR]], align 16
-// CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i128, ptr [[PTR]], i64 1
+// CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[PTR]], i64 16
// CHECK-NEXT: store i128 [[TMP0]], ptr [[ADD_PTR]], align 16
// CHECK-NEXT: ret void
//
@@ -61,13 +61,11 @@ __attribute__((target("aes"))) poly128_t test_vmull_p64(poly64_t a, poly64_t b)
}
// CHECK-LABEL: define {{[^@]+}}@test_vmull_high_p64
-// CHECK-SAME: (<2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR2:[0-9]+]] {
+// CHECK-SAME: (<2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I5:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[A]], <1 x i32> <i32 1>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[SHUFFLE_I5]] to i64
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[B]], <2 x i64> [[B]], <1 x i32> <i32 1>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[SHUFFLE_I]] to i64
-// CHECK-NEXT: [[VMULL_P64_I_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 [[TMP0]], i64 [[TMP1]])
+// CHECK-NEXT: [[SHUFFLE_I5_EXTRACT:%.*]] = extractelement <2 x i64> [[A]], i64 1
+// CHECK-NEXT: [[SHUFFLE_I_EXTRACT:%.*]] = extractelement <2 x i64> [[B]], i64 1
+// CHECK-NEXT: [[VMULL_P64_I_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 [[SHUFFLE_I5_EXTRACT]], i64 [[SHUFFLE_I_EXTRACT]])
// CHECK-NEXT: [[VMULL_P641_I_I:%.*]] = bitcast <16 x i8> [[VMULL_P64_I_I]] to i128
// CHECK-NEXT: ret i128 [[VMULL_P641_I_I]]
//
@@ -76,7 +74,7 @@ __attribute__((target("aes"))) poly128_t test_vmull_high_p64(poly64x2_t a, poly6
}
// CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_p128_s8
-// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR3:[0-9]+]] {
+// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to i128
// CHECK-NEXT: ret i128 [[TMP0]]
@@ -86,7 +84,7 @@ poly128_t test_vreinterpretq_p128_s8(int8x16_t a) {
}
// CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_p128_s16
-// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to i128
// CHECK-NEXT: ret i128 [[TMP0]]
@@ -96,7 +94,7 @@ poly128_t test_vreinterpretq_p128_s16(int16x8_t a) {
}
// CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_p128_s32
-// CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to i128
// CHECK-NEXT: ret i128 [[TMP0]]
@@ -106,7 +104,7 @@ poly128_t test_vreinterpretq_p128_s32(int32x4_t a) {
}
// CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_p128_s64
-// CHECK-SAME: (<2 x i64> noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-SAME: (<2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to i128
// CHECK-NEXT: ret i128 [[TMP0]]
@@ -116,7 +114,7 @@ poly128_t test_vreinterpretq_p128_s64(int64x2_t a) {
}
// CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_p128_u8
-// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to i128
// CHECK-NEXT: ret i128 [[TMP0]]
@@ -126,7 +124,7 @@ poly128_t test_vreinterpretq_p128_u8(uint8x16_t a) {
}
// CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_p128_u16
-// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to i128
// CHECK-NEXT: ret i128 [[TMP0]]
@@ -136,7 +134,7 @@ poly128_t test_vreinterpretq_p128_u16(uint16x8_t a) {
}
// CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_p128_u32
-// CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to i128
// CHECK-NEXT: ret i128 [[TMP0]]
@@ -146,7 +144,7 @@ poly128_t test_vreinterpretq_p128_u32(uint32x4_t a) {
}
// CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_p128_u64
-// CHECK-SAME: (<2 x i64> noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-SAME: (<2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to i128
// CHECK-NEXT: ret i128 [[TMP0]]
@@ -156,7 +154,7 @@ poly128_t test_vreinterpretq_p128_u64(uint64x2_t a) {
}
// CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_p128_f32
-// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to i128
// CHECK-NEXT: ret i128 [[TMP0]]
@@ -166,7 +164,7 @@ poly128_t test_vreinterpretq_p128_f32(float32x4_t a) {
}
// CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_p128_f64
-// CHECK-SAME: (<2 x double> noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-SAME: (<2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to i128
// CHECK-NEXT: ret i128 [[TMP0]]
@@ -176,7 +174,7 @@ poly128_t test_vreinterpretq_p128_f64(float64x2_t a) {
}
// CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_p128_p8
-// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to i128
// CHECK-NEXT: ret i128 [[TMP0]]
@@ -186,7 +184,7 @@ poly128_t test_vreinterpretq_p128_p8(poly8x16_t a) {
}
// CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_p128_p16
-// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to i128
// CHECK-NEXT: ret i128 [[TMP0]]
@@ -196,7 +194,7 @@ poly128_t test_vreinterpretq_p128_p16(poly16x8_t a) {
}
// CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_p128_p64
-// CHECK-SAME: (<2 x i64> noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-SAME: (<2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to i128
// CHECK-NEXT: ret i128 [[TMP0]]
@@ -206,7 +204,7 @@ poly128_t test_vreinterpretq_p128_p64(poly64x2_t a) {
}
// CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_s8_p128
-// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[A]] to <16 x i8>
// CHECK-NEXT: ret <16 x i8> [[TMP0]]
@@ -216,7 +214,7 @@ int8x16_t test_vreinterpretq_s8_p128(poly128_t a) {
}
// CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_s16_p128
-// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[A]] to <8 x i16>
// CHECK-NEXT: ret <8 x i16> [[TMP0]]
@@ -226,7 +224,7 @@ int16x8_t test_vreinterpretq_s16_p128(poly128_t a) {
}
// CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_s32_p128
-// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[A]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP0]]
@@ -236,7 +234,7 @@ int32x4_t test_vreinterpretq_s32_p128(poly128_t a) {
}
// CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_s64_p128
-// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[A]] to <2 x i64>
// CHECK-NEXT: ret <2 x i64> [[TMP0]]
@@ -246,7 +244,7 @@ int64x2_t test_vreinterpretq_s64_p128(poly128_t a) {
}
// CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_u8_p128
-// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[A]] to <16 x i8>
// CHECK-NEXT: ret <16 x i8> [[TMP0]]
@@ -256,7 +254,7 @@ uint8x16_t test_vreinterpretq_u8_p128(poly128_t a) {
}
// CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_u16_p128
-// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[A]] to <8 x i16>
// CHECK-NEXT: ret <8 x i16> [[TMP0]]
@@ -266,7 +264,7 @@ uint16x8_t test_vreinterpretq_u16_p128(poly128_t a) {
}
// CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_u32_p128
-// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[A]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP0]]
@@ -276,7 +274,7 @@ uint32x4_t test_vreinterpretq_u32_p128(poly128_t a) {
}
// CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_u64_p128
-// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[A]] to <2 x i64>
// CHECK-NEXT: ret <2 x i64> [[TMP0]]
@@ -286,7 +284,7 @@ uint64x2_t test_vreinterpretq_u64_p128(poly128_t a) {
}
// CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_f32_p128
-// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[A]] to <4 x float>
// CHECK-NEXT: ret <4 x float> [[TMP0]]
@@ -296,7 +294,7 @@ float32x4_t test_vreinterpretq_f32_p128(poly128_t a) {
}
// CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_f64_p128
-// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[A]] to <2 x double>
// CHECK-NEXT: ret <2 x double> [[TMP0]]
@@ -306,7 +304,7 @@ float64x2_t test_vreinterpretq_f64_p128(poly128_t a) {
}
// CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_p8_p128
-// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[A]] to <16 x i8>
// CHECK-NEXT: ret <16 x i8> [[TMP0]]
@@ -316,7 +314,7 @@ poly8x16_t test_vreinterpretq_p8_p128(poly128_t a) {
}
// CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_p16_p128
-// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[A]] to <8 x i16>
// CHECK-NEXT: ret <8 x i16> [[TMP0]]
@@ -326,7 +324,7 @@ poly16x8_t test_vreinterpretq_p16_p128(poly128_t a) {
}
// CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_p64_p128
-// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[A]] to <2 x i64>
// CHECK-NEXT: ret <2 x i64> [[TMP0]]
diff --git a/clang/test/CodeGen/AArch64/poly64.c b/clang/test/CodeGen/AArch64/poly64.c
index f3c057ecf48c17..558833469d3867 100644
--- a/clang/test/CodeGen/AArch64/poly64.c
+++ b/clang/test/CodeGen/AArch64/poly64.c
@@ -1,537 +1,717 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
-// RUN: -ffp-contract=fast -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg \
+// RUN: -ffp-contract=fast -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine \
// RUN: | FileCheck %s
// REQUIRES: aarch64-registered-target || arm-registered-target
#include <arm_neon.h>
-// CHECK-LABEL: define{{.*}} <1 x i64> @test_vceq_p64(<1 x i64> noundef %a, <1 x i64> noundef %b) #0 {
-// CHECK: [[CMP_I:%.*]] = icmp eq <1 x i64> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
-// CHECK: ret <1 x i64> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vceq_p64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <1 x i64> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[SEXT_I]]
+//
uint64x1_t test_vceq_p64(poly64x1_t a, poly64x1_t b) {
return vceq_p64(a, b);
}
-// CHECK-LABEL: define{{.*}} <2 x i64> @test_vceqq_p64(<2 x i64> noundef %a, <2 x i64> noundef %b) #0 {
-// CHECK: [[CMP_I:%.*]] = icmp eq <2 x i64> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
-// CHECK: ret <2 x i64> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vceqq_p64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <2 x i64> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[SEXT_I]]
+//
uint64x2_t test_vceqq_p64(poly64x2_t a, poly64x2_t b) {
return vceqq_p64(a, b);
}
-// CHECK-LABEL: define{{.*}} <1 x i64> @test_vtst_p64(<1 x i64> noundef %a, <1 x i64> noundef %b) #0 {
-// CHECK: [[TMP4:%.*]] = and <1 x i64> %a, %b
-// CHECK: [[TMP5:%.*]] = icmp ne <1 x i64> [[TMP4]], zeroinitializer
-// CHECK: [[VTST_I:%.*]] = sext <1 x i1> [[TMP5]] to <1 x i64>
-// CHECK: ret <1 x i64> [[VTST_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vtst_p64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = and <1 x i64> [[A]], [[B]]
+// CHECK-NEXT: [[TMP1:%.*]] = icmp ne <1 x i64> [[TMP0]], zeroinitializer
+// CHECK-NEXT: [[VTST_I:%.*]] = sext <1 x i1> [[TMP1]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[VTST_I]]
+//
uint64x1_t test_vtst_p64(poly64x1_t a, poly64x1_t b) {
return vtst_p64(a, b);
}
-// CHECK-LABEL: define{{.*}} <2 x i64> @test_vtstq_p64(<2 x i64> noundef %a, <2 x i64> noundef %b) #0 {
-// CHECK: [[TMP4:%.*]] = and <2 x i64> %a, %b
-// CHECK: [[TMP5:%.*]] = icmp ne <2 x i64> [[TMP4]], zeroinitializer
-// CHECK: [[VTST_I:%.*]] = sext <2 x i1> [[TMP5]] to <2 x i64>
-// CHECK: ret <2 x i64> [[VTST_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vtstq_p64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = and <2 x i64> [[A]], [[B]]
+// CHECK-NEXT: [[TMP1:%.*]] = icmp ne <2 x i64> [[TMP0]], zeroinitializer
+// CHECK-NEXT: [[VTST_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[VTST_I]]
+//
uint64x2_t test_vtstq_p64(poly64x2_t a, poly64x2_t b) {
return vtstq_p64(a, b);
}
-// CHECK-LABEL: define{{.*}} <1 x i64> @test_vbsl_p64(<1 x i64> noundef %a, <1 x i64> noundef %b, <1 x i64> noundef %c) #0 {
-// CHECK: [[VBSL3_I:%.*]] = and <1 x i64> %a, %b
-// CHECK: [[TMP3:%.*]] = xor <1 x i64> %a, splat (i64 -1)
-// CHECK: [[VBSL4_I:%.*]] = and <1 x i64> [[TMP3]], %c
-// CHECK: [[VBSL5_I:%.*]] = or <1 x i64> [[VBSL3_I]], [[VBSL4_I]]
-// CHECK: ret <1 x i64> [[VBSL5_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vbsl_p64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]], <1 x i64> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VBSL3_I:%.*]] = and <1 x i64> [[A]], [[B]]
+// CHECK-NEXT: [[TMP0:%.*]] = xor <1 x i64> [[A]], splat (i64 -1)
+// CHECK-NEXT: [[VBSL4_I:%.*]] = and <1 x i64> [[C]], [[TMP0]]
+// CHECK-NEXT: [[VBSL5_I:%.*]] = or disjoint <1 x i64> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK-NEXT: ret <1 x i64> [[VBSL5_I]]
+//
poly64x1_t test_vbsl_p64(poly64x1_t a, poly64x1_t b, poly64x1_t c) {
return vbsl_p64(a, b, c);
}
-// CHECK-LABEL: define{{.*}} <2 x i64> @test_vbslq_p64(<2 x i64> noundef %a, <2 x i64> noundef %b, <2 x i64> noundef %c) #0 {
-// CHECK: [[VBSL3_I:%.*]] = and <2 x i64> %a, %b
-// CHECK: [[TMP3:%.*]] = xor <2 x i64> %a, splat (i64 -1)
-// CHECK: [[VBSL4_I:%.*]] = and <2 x i64> [[TMP3]], %c
-// CHECK: [[VBSL5_I:%.*]] = or <2 x i64> [[VBSL3_I]], [[VBSL4_I]]
-// CHECK: ret <2 x i64> [[VBSL5_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vbslq_p64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]], <2 x i64> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VBSL3_I:%.*]] = and <2 x i64> [[A]], [[B]]
+// CHECK-NEXT: [[TMP0:%.*]] = xor <2 x i64> [[A]], splat (i64 -1)
+// CHECK-NEXT: [[VBSL4_I:%.*]] = and <2 x i64> [[C]], [[TMP0]]
+// CHECK-NEXT: [[VBSL5_I:%.*]] = or disjoint <2 x i64> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK-NEXT: ret <2 x i64> [[VBSL5_I]]
+//
poly64x2_t test_vbslq_p64(poly64x2_t a, poly64x2_t b, poly64x2_t c) {
return vbslq_p64(a, b, c);
}
-// CHECK-LABEL: define{{.*}} i64 @test_vget_lane_p64(<1 x i64> noundef %v) #0 {
-// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x i64> %v, i32 0
-// CHECK: ret i64 [[VGET_LANE]]
+// CHECK-LABEL: define dso_local i64 @test_vget_lane_p64(
+// CHECK-SAME: <1 x i64> noundef [[V:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <1 x i64> [[V]], i64 0
+// CHECK-NEXT: ret i64 [[VGET_LANE]]
+//
poly64_t test_vget_lane_p64(poly64x1_t v) {
return vget_lane_p64(v, 0);
}
-// CHECK-LABEL: define{{.*}} i64 @test_vgetq_lane_p64(<2 x i64> noundef %v) #0 {
-// CHECK: [[VGETQ_LANE:%.*]] = extractelement <2 x i64> %v, i32 1
-// CHECK: ret i64 [[VGETQ_LANE]]
+// CHECK-LABEL: define dso_local i64 @test_vgetq_lane_p64(
+// CHECK-SAME: <2 x i64> noundef [[V:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <2 x i64> [[V]], i64 1
+// CHECK-NEXT: ret i64 [[VGETQ_LANE]]
+//
poly64_t test_vgetq_lane_p64(poly64x2_t v) {
return vgetq_lane_p64(v, 1);
}
-// CHECK-LABEL: define{{.*}} <1 x i64> @test_vset_lane_p64(i64 noundef %a, <1 x i64> noundef %v) #0 {
-// CHECK: [[VSET_LANE:%.*]] = insertelement <1 x i64> %v, i64 %a, i32 0
-// CHECK: ret <1 x i64> [[VSET_LANE]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vset_lane_p64(
+// CHECK-SAME: i64 noundef [[A:%.*]], <1 x i64> noundef [[V:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <1 x i64> poison, i64 [[A]], i64 0
+// CHECK-NEXT: ret <1 x i64> [[VSET_LANE]]
+//
poly64x1_t test_vset_lane_p64(poly64_t a, poly64x1_t v) {
return vset_lane_p64(a, v, 0);
}
-// CHECK-LABEL: define{{.*}} <2 x i64> @test_vsetq_lane_p64(i64 noundef %a, <2 x i64> noundef %v) #0 {
-// CHECK: [[VSET_LANE:%.*]] = insertelement <2 x i64> %v, i64 %a, i32 1
-// CHECK: ret <2 x i64> [[VSET_LANE]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vsetq_lane_p64(
+// CHECK-SAME: i64 noundef [[A:%.*]], <2 x i64> noundef [[V:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <2 x i64> [[V]], i64 [[A]], i64 1
+// CHECK-NEXT: ret <2 x i64> [[VSET_LANE]]
+//
poly64x2_t test_vsetq_lane_p64(poly64_t a, poly64x2_t v) {
return vsetq_lane_p64(a, v, 1);
}
-// CHECK-LABEL: define{{.*}} <1 x i64> @test_vcopy_lane_p64(<1 x i64> noundef %a, <1 x i64> noundef %b) #0 {
-// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x i64> %b, i32 0
-// CHECK: [[VSET_LANE:%.*]] = insertelement <1 x i64> %a, i64 [[VGET_LANE]], i32 0
-// CHECK: ret <1 x i64> [[VSET_LANE]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vcopy_lane_p64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <1 x i64> [[B]]
+//
poly64x1_t test_vcopy_lane_p64(poly64x1_t a, poly64x1_t b) {
return vcopy_lane_p64(a, 0, b, 0);
}
-// CHECK-LABEL: define{{.*}} <2 x i64> @test_vcopyq_lane_p64(<2 x i64> noundef %a, <1 x i64> noundef %b) #0 {
-// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x i64> %b, i32 0
-// CHECK: [[VSET_LANE:%.*]] = insertelement <2 x i64> %a, i64 [[VGET_LANE]], i32 1
-// CHECK: ret <2 x i64> [[VSET_LANE]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vcopyq_lane_p64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <1 x i64> [[B]], <1 x i64> poison, <2 x i32> <i32 0, i32 poison>
+// CHECK-NEXT: [[VSET_LANE:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[TMP0]], <2 x i32> <i32 0, i32 2>
+// CHECK-NEXT: ret <2 x i64> [[VSET_LANE]]
+//
poly64x2_t test_vcopyq_lane_p64(poly64x2_t a, poly64x1_t b) {
return vcopyq_lane_p64(a, 1, b, 0);
}
-// CHECK-LABEL: define{{.*}} <2 x i64> @test_vcopyq_laneq_p64(<2 x i64> noundef %a, <2 x i64> noundef %b) #0 {
-// CHECK: [[VGETQ_LANE:%.*]] = extractelement <2 x i64> %b, i32 1
-// CHECK: [[VSET_LANE:%.*]] = insertelement <2 x i64> %a, i64 [[VGETQ_LANE]], i32 1
-// CHECK: ret <2 x i64> [[VSET_LANE]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vcopyq_laneq_p64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSET_LANE:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> <i32 0, i32 3>
+// CHECK-NEXT: ret <2 x i64> [[VSET_LANE]]
+//
poly64x2_t test_vcopyq_laneq_p64(poly64x2_t a, poly64x2_t b) {
return vcopyq_laneq_p64(a, 1, b, 1);
}
-// CHECK-LABEL: define{{.*}} <1 x i64> @test_vcreate_p64(i64 noundef %a) #0 {
-// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vcreate_p64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <1 x i64> poison, i64 [[A]], i64 0
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
poly64x1_t test_vcreate_p64(uint64_t a) {
return vcreate_p64(a);
}
-// CHECK-LABEL: define{{.*}} <1 x i64> @test_vdup_n_p64(i64 noundef %a) #0 {
-// CHECK: [[VECINIT_I:%.*]] = insertelement <1 x i64> poison, i64 %a, i32 0
-// CHECK: ret <1 x i64> [[VECINIT_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vdup_n_p64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <1 x i64> poison, i64 [[A]], i64 0
+// CHECK-NEXT: ret <1 x i64> [[VECINIT_I]]
+//
poly64x1_t test_vdup_n_p64(poly64_t a) {
return vdup_n_p64(a);
}
-// CHECK-LABEL: define{{.*}} <2 x i64> @test_vdupq_n_p64(i64 noundef %a) #0 {
-// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i64> poison, i64 %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 %a, i32 1
-// CHECK: ret <2 x i64> [[VECINIT1_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vdupq_n_p64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i64> poison, i64 [[A]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i64> [[VECINIT_I]], <2 x i64> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: ret <2 x i64> [[VECINIT1_I]]
+//
poly64x2_t test_vdupq_n_p64(poly64_t a) {
return vdupq_n_p64(a);
}
-// CHECK-LABEL: define{{.*}} <1 x i64> @test_vmov_n_p64(i64 noundef %a) #0 {
-// CHECK: [[VECINIT_I:%.*]] = insertelement <1 x i64> poison, i64 %a, i32 0
-// CHECK: ret <1 x i64> [[VECINIT_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vmov_n_p64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <1 x i64> poison, i64 [[A]], i64 0
+// CHECK-NEXT: ret <1 x i64> [[VECINIT_I]]
+//
poly64x1_t test_vmov_n_p64(poly64_t a) {
return vmov_n_p64(a);
}
-// CHECK-LABEL: define{{.*}} <2 x i64> @test_vmovq_n_p64(i64 noundef %a) #0 {
-// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i64> poison, i64 %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 %a, i32 1
-// CHECK: ret <2 x i64> [[VECINIT1_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vmovq_n_p64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i64> poison, i64 [[A]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i64> [[VECINIT_I]], <2 x i64> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: ret <2 x i64> [[VECINIT1_I]]
+//
poly64x2_t test_vmovq_n_p64(poly64_t a) {
return vmovq_n_p64(a);
}
-// CHECK-LABEL: define{{.*}} <1 x i64> @test_vdup_lane_p64(<1 x i64> noundef %vec) #0 {
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> [[VEC:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP1]], <1 x i64> [[TMP1]], <1 x i32> zeroinitializer
-// CHECK: ret <1 x i64> [[LANE]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vdup_lane_p64(
+// CHECK-SAME: <1 x i64> noundef [[VEC:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <1 x i64> [[VEC]]
+//
poly64x1_t test_vdup_lane_p64(poly64x1_t vec) {
return vdup_lane_p64(vec, 0);
}
-// CHECK-LABEL: define{{.*}} <2 x i64> @test_vdupq_lane_p64(<1 x i64> noundef %vec) #0 {
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> [[VEC:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP1]], <1 x i64> [[TMP1]], <2 x i32> zeroinitializer
-// CHECK: ret <2 x i64> [[LANE]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vdupq_lane_p64(
+// CHECK-SAME: <1 x i64> noundef [[VEC:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x i64> [[VEC]], <1 x i64> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: ret <2 x i64> [[LANE]]
+//
poly64x2_t test_vdupq_lane_p64(poly64x1_t vec) {
return vdupq_lane_p64(vec, 0);
}
-// CHECK-LABEL: define{{.*}} <2 x i64> @test_vdupq_laneq_p64(<2 x i64> noundef %vec) #0 {
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> [[VEC:%.*]] to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[LANE:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK: ret <2 x i64> [[LANE]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vdupq_laneq_p64(
+// CHECK-SAME: <2 x i64> noundef [[VEC:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i64> [[VEC]], <2 x i64> poison, <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: ret <2 x i64> [[LANE]]
+//
poly64x2_t test_vdupq_laneq_p64(poly64x2_t vec) {
return vdupq_laneq_p64(vec, 1);
}
-// CHECK-LABEL: define{{.*}} <2 x i64> @test_vcombine_p64(<1 x i64> noundef %low, <1 x i64> noundef %high) #0 {
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <1 x i64> %low, <1 x i64> %high, <2 x i32> <i32 0, i32 1>
-// CHECK: ret <2 x i64> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vcombine_p64(
+// CHECK-SAME: <1 x i64> noundef [[LOW:%.*]], <1 x i64> noundef [[HIGH:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <1 x i64> [[LOW]], <1 x i64> [[HIGH]], <2 x i32> <i32 0, i32 1>
+// CHECK-NEXT: ret <2 x i64> [[SHUFFLE_I]]
+//
poly64x2_t test_vcombine_p64(poly64x1_t low, poly64x1_t high) {
return vcombine_p64(low, high);
}
-// CHECK-LABEL: define{{.*}} <1 x i64> @test_vld1_p64(ptr noundef %ptr) #0 {
-// CHECK: [[TMP2:%.*]] = load <1 x i64>, ptr %ptr
-// CHECK: ret <1 x i64> [[TMP2]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vld1_p64(
+// CHECK-SAME: ptr noundef [[PTR:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[PTR]], align 8
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
poly64x1_t test_vld1_p64(poly64_t const * ptr) {
return vld1_p64(ptr);
}
-// CHECK-LABEL: define{{.*}} <2 x i64> @test_vld1q_p64(ptr noundef %ptr) #0 {
-// CHECK: [[TMP2:%.*]] = load <2 x i64>, ptr %ptr
-// CHECK: ret <2 x i64> [[TMP2]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vld1q_p64(
+// CHECK-SAME: ptr noundef [[PTR:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[PTR]], align 8
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
poly64x2_t test_vld1q_p64(poly64_t const * ptr) {
return vld1q_p64(ptr);
}
-// CHECK-LABEL: define{{.*}} void @test_vst1_p64(ptr noundef %ptr, <1 x i64> noundef %val) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %val to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: store <1 x i64> [[TMP3]], ptr %ptr
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1_p64(
+// CHECK-SAME: ptr noundef [[PTR:%.*]], <1 x i64> noundef [[VAL:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: store <1 x i64> [[VAL]], ptr [[PTR]], align 8
+// CHECK-NEXT: ret void
+//
void test_vst1_p64(poly64_t * ptr, poly64x1_t val) {
return vst1_p64(ptr, val);
}
-// CHECK-LABEL: define{{.*}} void @test_vst1q_p64(ptr noundef %ptr, <2 x i64> noundef %val) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %val to <16 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: store <2 x i64> [[TMP3]], ptr %ptr
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1q_p64(
+// CHECK-SAME: ptr noundef [[PTR:%.*]], <2 x i64> noundef [[VAL:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: store <2 x i64> [[VAL]], ptr [[PTR]], align 8
+// CHECK-NEXT: ret void
+//
void test_vst1q_p64(poly64_t * ptr, poly64x2_t val) {
return vst1q_p64(ptr, val);
}
-// CHECK-LABEL: define{{.*}} %struct.poly64x1x2_t @test_vld2_p64(ptr noundef %ptr) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x1x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.poly64x1x2_t, align 8
-// CHECK: [[VLD2:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2.v1i64.p0(ptr %ptr)
-// CHECK: store { <1 x i64>, <1 x i64> } [[VLD2]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.poly64x1x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.poly64x1x2_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.poly64x1x2_t @test_vld2_p64(
+// CHECK-SAME: ptr noundef [[PTR:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY64X1X2_T:%.*]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY64X1X2_T]], align 8
+// CHECK-NEXT: [[VLD2:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2.v1i64.p0(ptr [[PTR]])
+// CHECK-NEXT: [[VLD2_ELT:%.*]] = extractvalue { <1 x i64>, <1 x i64> } [[VLD2]], 0
+// CHECK-NEXT: store <1 x i64> [[VLD2_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD2_ELT2:%.*]] = extractvalue { <1 x i64>, <1 x i64> } [[VLD2]], 1
+// CHECK-NEXT: store <1 x i64> [[VLD2_ELT2]], ptr [[__RET_REPACK1]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(16) [[__RET]], i64 16, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <1 x i64>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <1 x i64>] poison, <1 x i64> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT3:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK4:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT3]], align 8
+// CHECK-NEXT: [[DOTUNPACK5:%.*]] = insertvalue [2 x <1 x i64>] [[TMP0]], <1 x i64> [[DOTUNPACK_UNPACK4]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_POLY64X1X2_T]] poison, [2 x <1 x i64>] [[DOTUNPACK5]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY64X1X2_T]] [[TMP1]]
+//
poly64x1x2_t test_vld2_p64(poly64_t const * ptr) {
return vld2_p64(ptr);
}
-// CHECK-LABEL: define{{.*}} %struct.poly64x2x2_t @test_vld2q_p64(ptr noundef %ptr) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x2x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.poly64x2x2_t, align 16
-// CHECK: [[VLD2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2.v2i64.p0(ptr %ptr)
-// CHECK: store { <2 x i64>, <2 x i64> } [[VLD2]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.poly64x2x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.poly64x2x2_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.poly64x2x2_t @test_vld2q_p64(
+// CHECK-SAME: ptr noundef [[PTR:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY64X2X2_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY64X2X2_T]], align 16
+// CHECK-NEXT: [[VLD2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2.v2i64.p0(ptr [[PTR]])
+// CHECK-NEXT: [[VLD2_ELT:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[VLD2]], 0
+// CHECK-NEXT: store <2 x i64> [[VLD2_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD2_ELT2:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[VLD2]], 1
+// CHECK-NEXT: store <2 x i64> [[VLD2_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <2 x i64>] poison, <2 x i64> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT3:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK4:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT3]], align 16
+// CHECK-NEXT: [[DOTUNPACK5:%.*]] = insertvalue [2 x <2 x i64>] [[TMP0]], <2 x i64> [[DOTUNPACK_UNPACK4]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_POLY64X2X2_T]] poison, [2 x <2 x i64>] [[DOTUNPACK5]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY64X2X2_T]] [[TMP1]]
+//
poly64x2x2_t test_vld2q_p64(poly64_t const * ptr) {
return vld2q_p64(ptr);
}
-// CHECK-LABEL: define{{.*}} %struct.poly64x1x3_t @test_vld3_p64(ptr noundef %ptr) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x1x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.poly64x1x3_t, align 8
-// CHECK: [[VLD3:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3.v1i64.p0(ptr %ptr)
-// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.poly64x1x3_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.poly64x1x3_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.poly64x1x3_t @test_vld3_p64(
+// CHECK-SAME: ptr noundef [[PTR:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY64X1X3_T:%.*]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY64X1X3_T]], align 8
+// CHECK-NEXT: [[VLD3:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3.v1i64.p0(ptr [[PTR]])
+// CHECK-NEXT: [[VLD3_ELT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], 0
+// CHECK-NEXT: store <1 x i64> [[VLD3_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD3_ELT2:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], 1
+// CHECK-NEXT: store <1 x i64> [[VLD3_ELT2]], ptr [[__RET_REPACK1]], align 8
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_ELT4:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], 2
+// CHECK-NEXT: store <1 x i64> [[VLD3_ELT4]], ptr [[__RET_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(24) [[__RET]], i64 24, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <1 x i64>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [3 x <1 x i64>] poison, <1 x i64> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT5]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [3 x <1 x i64>] [[TMP0]], <1 x i64> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT7]], align 8
+// CHECK-NEXT: [[DOTUNPACK9:%.*]] = insertvalue [3 x <1 x i64>] [[TMP1]], <1 x i64> [[DOTUNPACK_UNPACK8]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[STRUCT_POLY64X1X3_T]] poison, [3 x <1 x i64>] [[DOTUNPACK9]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY64X1X3_T]] [[TMP2]]
+//
poly64x1x3_t test_vld3_p64(poly64_t const * ptr) {
return vld3_p64(ptr);
}
-// CHECK-LABEL: define{{.*}} %struct.poly64x2x3_t @test_vld3q_p64(ptr noundef %ptr) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x2x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.poly64x2x3_t, align 16
-// CHECK: [[VLD3:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3.v2i64.p0(ptr %ptr)
-// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.poly64x2x3_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.poly64x2x3_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.poly64x2x3_t @test_vld3q_p64(
+// CHECK-SAME: ptr noundef [[PTR:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY64X2X3_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY64X2X3_T]], align 16
+// CHECK-NEXT: [[VLD3:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3.v2i64.p0(ptr [[PTR]])
+// CHECK-NEXT: [[VLD3_ELT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], 0
+// CHECK-NEXT: store <2 x i64> [[VLD3_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_ELT2:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], 1
+// CHECK-NEXT: store <2 x i64> [[VLD3_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD3_ELT4:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], 2
+// CHECK-NEXT: store <2 x i64> [[VLD3_ELT4]], ptr [[__RET_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(48) [[__RET]], i64 48, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [3 x <2 x i64>] poison, <2 x i64> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT5]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [3 x <2 x i64>] [[TMP0]], <2 x i64> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT7]], align 16
+// CHECK-NEXT: [[DOTUNPACK9:%.*]] = insertvalue [3 x <2 x i64>] [[TMP1]], <2 x i64> [[DOTUNPACK_UNPACK8]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[STRUCT_POLY64X2X3_T]] poison, [3 x <2 x i64>] [[DOTUNPACK9]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY64X2X3_T]] [[TMP2]]
+//
poly64x2x3_t test_vld3q_p64(poly64_t const * ptr) {
return vld3q_p64(ptr);
}
-// CHECK-LABEL: define{{.*}} %struct.poly64x1x4_t @test_vld4_p64(ptr noundef %ptr) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x1x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.poly64x1x4_t, align 8
-// CHECK: [[VLD4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4.v1i64.p0(ptr %ptr)
-// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.poly64x1x4_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.poly64x1x4_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.poly64x1x4_t @test_vld4_p64(
+// CHECK-SAME: ptr noundef [[PTR:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY64X1X4_T:%.*]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY64X1X4_T]], align 8
+// CHECK-NEXT: [[VLD4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4.v1i64.p0(ptr [[PTR]])
+// CHECK-NEXT: [[VLD4_ELT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], 0
+// CHECK-NEXT: store <1 x i64> [[VLD4_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD4_ELT2:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], 1
+// CHECK-NEXT: store <1 x i64> [[VLD4_ELT2]], ptr [[__RET_REPACK1]], align 8
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_ELT4:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], 2
+// CHECK-NEXT: store <1 x i64> [[VLD4_ELT4]], ptr [[__RET_REPACK3]], align 8
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 24
+// CHECK-NEXT: [[VLD4_ELT6:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], 3
+// CHECK-NEXT: store <1 x i64> [[VLD4_ELT6]], ptr [[__RET_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <1 x i64>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [4 x <1 x i64>] poison, <1 x i64> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT7]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [4 x <1 x i64>] [[TMP0]], <1 x i64> [[DOTUNPACK_UNPACK8]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT9]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [4 x <1 x i64>] [[TMP1]], <1 x i64> [[DOTUNPACK_UNPACK10]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 24
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT11]], align 8
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [4 x <1 x i64>] [[TMP2]], <1 x i64> [[DOTUNPACK_UNPACK12]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_POLY64X1X4_T]] poison, [4 x <1 x i64>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY64X1X4_T]] [[TMP3]]
+//
poly64x1x4_t test_vld4_p64(poly64_t const * ptr) {
return vld4_p64(ptr);
}
-// CHECK-LABEL: define{{.*}} %struct.poly64x2x4_t @test_vld4q_p64(ptr noundef %ptr) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x2x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.poly64x2x4_t, align 16
-// CHECK: [[VLD4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4.v2i64.p0(ptr %ptr)
-// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.poly64x2x4_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.poly64x2x4_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.poly64x2x4_t @test_vld4q_p64(
+// CHECK-SAME: ptr noundef [[PTR:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY64X2X4_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY64X2X4_T]], align 16
+// CHECK-NEXT: [[VLD4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4.v2i64.p0(ptr [[PTR]])
+// CHECK-NEXT: [[VLD4_ELT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], 0
+// CHECK-NEXT: store <2 x i64> [[VLD4_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_ELT2:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], 1
+// CHECK-NEXT: store <2 x i64> [[VLD4_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD4_ELT4:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], 2
+// CHECK-NEXT: store <2 x i64> [[VLD4_ELT4]], ptr [[__RET_REPACK3]], align 16
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 48
+// CHECK-NEXT: [[VLD4_ELT6:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], 3
+// CHECK-NEXT: store <2 x i64> [[VLD4_ELT6]], ptr [[__RET_REPACK5]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(64) [[__RET]], i64 64, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [4 x <2 x i64>] poison, <2 x i64> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT7]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [4 x <2 x i64>] [[TMP0]], <2 x i64> [[DOTUNPACK_UNPACK8]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT9]], align 16
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [4 x <2 x i64>] [[TMP1]], <2 x i64> [[DOTUNPACK_UNPACK10]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 48
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT11]], align 16
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [4 x <2 x i64>] [[TMP2]], <2 x i64> [[DOTUNPACK_UNPACK12]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_POLY64X2X4_T]] poison, [4 x <2 x i64>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY64X2X4_T]] [[TMP3]]
+//
poly64x2x4_t test_vld4q_p64(poly64_t const * ptr) {
return vld4q_p64(ptr);
}
-// CHECK-LABEL: define{{.*}} void @test_vst2_p64(ptr noundef %ptr, [2 x <1 x i64>] alignstack(8) %val.coerce) #0 {
-// CHECK: [[VAL:%.*]] = alloca %struct.poly64x1x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly64x1x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x1x2_t, ptr [[VAL]], i32 0, i32 0
-// CHECK: store [2 x <1 x i64>] [[VAL]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[VAL]], i64 16, i1 false)
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x1x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL1]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL2:%.*]] = getelementptr inbounds nuw %struct.poly64x1x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX3:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL2]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX3]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
-// CHECK: call void @llvm.aarch64.neon.st2.v1i64.p0(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], ptr %ptr)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2_p64(
+// CHECK-SAME: ptr noundef [[PTR:%.*]], [2 x <1 x i64>] alignstack(8) [[VAL_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VAL:%.*]] = alloca [[STRUCT_POLY64X1X2_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY64X1X2_T]], align 8
+// CHECK-NEXT: [[VAL_COERCE_ELT:%.*]] = extractvalue [2 x <1 x i64>] [[VAL_COERCE]], 0
+// CHECK-NEXT: store <1 x i64> [[VAL_COERCE_ELT]], ptr [[VAL]], align 8
+// CHECK-NEXT: [[VAL_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[VAL]], i64 8
+// CHECK-NEXT: [[VAL_COERCE_ELT2:%.*]] = extractvalue [2 x <1 x i64>] [[VAL_COERCE]], 1
+// CHECK-NEXT: store <1 x i64> [[VAL_COERCE_ELT2]], ptr [[VAL_REPACK1]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[__S1]], ptr noundef nonnull align 8 dereferenceable(16) [[VAL]], i64 16, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr [[ARRAYIDX3]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2.v1i64.p0(<1 x i64> [[TMP0]], <1 x i64> [[TMP1]], ptr [[PTR]])
+// CHECK-NEXT: ret void
+//
void test_vst2_p64(poly64_t * ptr, poly64x1x2_t val) {
return vst2_p64(ptr, val);
}
-// CHECK-LABEL: define{{.*}} void @test_vst2q_p64(ptr noundef %ptr, [2 x <2 x i64>] alignstack(16) %val.coerce) #0 {
-// CHECK: [[VAL:%.*]] = alloca %struct.poly64x2x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly64x2x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x2x2_t, ptr [[VAL]], i32 0, i32 0
-// CHECK: store [2 x <2 x i64>] [[VAL]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[VAL]], i64 32, i1 false)
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL1]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL2:%.*]] = getelementptr inbounds nuw %struct.poly64x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX3:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL2]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX3]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
-// CHECK: call void @llvm.aarch64.neon.st2.v2i64.p0(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], ptr %ptr)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2q_p64(
+// CHECK-SAME: ptr noundef [[PTR:%.*]], [2 x <2 x i64>] alignstack(16) [[VAL_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VAL:%.*]] = alloca [[STRUCT_POLY64X2X2_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY64X2X2_T]], align 16
+// CHECK-NEXT: [[VAL_COERCE_ELT:%.*]] = extractvalue [2 x <2 x i64>] [[VAL_COERCE]], 0
+// CHECK-NEXT: store <2 x i64> [[VAL_COERCE_ELT]], ptr [[VAL]], align 16
+// CHECK-NEXT: [[VAL_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[VAL]], i64 16
+// CHECK-NEXT: [[VAL_COERCE_ELT2:%.*]] = extractvalue [2 x <2 x i64>] [[VAL_COERCE]], 1
+// CHECK-NEXT: store <2 x i64> [[VAL_COERCE_ELT2]], ptr [[VAL_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[__S1]], ptr noundef nonnull align 16 dereferenceable(32) [[VAL]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[ARRAYIDX3]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2.v2i64.p0(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], ptr [[PTR]])
+// CHECK-NEXT: ret void
+//
void test_vst2q_p64(poly64_t * ptr, poly64x2x2_t val) {
return vst2q_p64(ptr, val);
}
-// CHECK-LABEL: define{{.*}} void @test_vst3_p64(ptr noundef %ptr, [3 x <1 x i64>] alignstack(8) %val.coerce) #0 {
-// CHECK: [[VAL:%.*]] = alloca %struct.poly64x1x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly64x1x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x1x3_t, ptr [[VAL]], i32 0, i32 0
-// CHECK: store [3 x <1 x i64>] [[VAL]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[VAL]], i64 24, i1 false)
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x1x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL1]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL2:%.*]] = getelementptr inbounds nuw %struct.poly64x1x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX3:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL2]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX3]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL4:%.*]] = getelementptr inbounds nuw %struct.poly64x1x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX5:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL4]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <1 x i64>, ptr [[ARRAYIDX5]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
-// CHECK: call void @llvm.aarch64.neon.st3.v1i64.p0(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], ptr %ptr)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3_p64(
+// CHECK-SAME: ptr noundef [[PTR:%.*]], [3 x <1 x i64>] alignstack(8) [[VAL_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VAL:%.*]] = alloca [[STRUCT_POLY64X1X3_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY64X1X3_T]], align 8
+// CHECK-NEXT: [[VAL_COERCE_ELT:%.*]] = extractvalue [3 x <1 x i64>] [[VAL_COERCE]], 0
+// CHECK-NEXT: store <1 x i64> [[VAL_COERCE_ELT]], ptr [[VAL]], align 8
+// CHECK-NEXT: [[VAL_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[VAL]], i64 8
+// CHECK-NEXT: [[VAL_COERCE_ELT2:%.*]] = extractvalue [3 x <1 x i64>] [[VAL_COERCE]], 1
+// CHECK-NEXT: store <1 x i64> [[VAL_COERCE_ELT2]], ptr [[VAL_REPACK1]], align 8
+// CHECK-NEXT: [[VAL_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[VAL]], i64 16
+// CHECK-NEXT: [[VAL_COERCE_ELT4:%.*]] = extractvalue [3 x <1 x i64>] [[VAL_COERCE]], 2
+// CHECK-NEXT: store <1 x i64> [[VAL_COERCE_ELT4]], ptr [[VAL_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[__S1]], ptr noundef nonnull align 8 dereferenceable(24) [[VAL]], i64 24, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr [[ARRAYIDX3]], align 8
+// CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr [[ARRAYIDX5]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3.v1i64.p0(<1 x i64> [[TMP0]], <1 x i64> [[TMP1]], <1 x i64> [[TMP2]], ptr [[PTR]])
+// CHECK-NEXT: ret void
+//
void test_vst3_p64(poly64_t * ptr, poly64x1x3_t val) {
return vst3_p64(ptr, val);
}
-// CHECK-LABEL: define{{.*}} void @test_vst3q_p64(ptr noundef %ptr, [3 x <2 x i64>] alignstack(16) %val.coerce) #0 {
-// CHECK: [[VAL:%.*]] = alloca %struct.poly64x2x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly64x2x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x2x3_t, ptr [[VAL]], i32 0, i32 0
-// CHECK: store [3 x <2 x i64>] [[VAL]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[VAL]], i64 48, i1 false)
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL1]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL2:%.*]] = getelementptr inbounds nuw %struct.poly64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX3:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL2]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX3]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL4:%.*]] = getelementptr inbounds nuw %struct.poly64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX5:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL4]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <2 x i64>, ptr [[ARRAYIDX5]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
-// CHECK: call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], ptr %ptr)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3q_p64(
+// CHECK-SAME: ptr noundef [[PTR:%.*]], [3 x <2 x i64>] alignstack(16) [[VAL_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VAL:%.*]] = alloca [[STRUCT_POLY64X2X3_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY64X2X3_T]], align 16
+// CHECK-NEXT: [[VAL_COERCE_ELT:%.*]] = extractvalue [3 x <2 x i64>] [[VAL_COERCE]], 0
+// CHECK-NEXT: store <2 x i64> [[VAL_COERCE_ELT]], ptr [[VAL]], align 16
+// CHECK-NEXT: [[VAL_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[VAL]], i64 16
+// CHECK-NEXT: [[VAL_COERCE_ELT2:%.*]] = extractvalue [3 x <2 x i64>] [[VAL_COERCE]], 1
+// CHECK-NEXT: store <2 x i64> [[VAL_COERCE_ELT2]], ptr [[VAL_REPACK1]], align 16
+// CHECK-NEXT: [[VAL_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[VAL]], i64 32
+// CHECK-NEXT: [[VAL_COERCE_ELT4:%.*]] = extractvalue [3 x <2 x i64>] [[VAL_COERCE]], 2
+// CHECK-NEXT: store <2 x i64> [[VAL_COERCE_ELT4]], ptr [[VAL_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[__S1]], ptr noundef nonnull align 16 dereferenceable(48) [[VAL]], i64 48, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[ARRAYIDX3]], align 16
+// CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[ARRAYIDX5]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], ptr [[PTR]])
+// CHECK-NEXT: ret void
+//
void test_vst3q_p64(poly64_t * ptr, poly64x2x3_t val) {
return vst3q_p64(ptr, val);
}
-// CHECK-LABEL: define{{.*}} void @test_vst4_p64(ptr noundef %ptr, [4 x <1 x i64>] alignstack(8) %val.coerce) #0 {
-// CHECK: [[VAL:%.*]] = alloca %struct.poly64x1x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly64x1x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x1x4_t, ptr [[VAL]], i32 0, i32 0
-// CHECK: store [4 x <1 x i64>] [[VAL]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[VAL]], i64 32, i1 false)
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL1]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL2:%.*]] = getelementptr inbounds nuw %struct.poly64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX3:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL2]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX3]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL4:%.*]] = getelementptr inbounds nuw %struct.poly64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX5:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL4]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <1 x i64>, ptr [[ARRAYIDX5]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL6:%.*]] = getelementptr inbounds nuw %struct.poly64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX7:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL6]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <1 x i64>, ptr [[ARRAYIDX7]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x i64>
-// CHECK: call void @llvm.aarch64.neon.st4.v1i64.p0(<1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], ptr %ptr)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4_p64(
+// CHECK-SAME: ptr noundef [[PTR:%.*]], [4 x <1 x i64>] alignstack(8) [[VAL_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VAL:%.*]] = alloca [[STRUCT_POLY64X1X4_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY64X1X4_T]], align 8
+// CHECK-NEXT: [[VAL_COERCE_ELT:%.*]] = extractvalue [4 x <1 x i64>] [[VAL_COERCE]], 0
+// CHECK-NEXT: store <1 x i64> [[VAL_COERCE_ELT]], ptr [[VAL]], align 8
+// CHECK-NEXT: [[VAL_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[VAL]], i64 8
+// CHECK-NEXT: [[VAL_COERCE_ELT2:%.*]] = extractvalue [4 x <1 x i64>] [[VAL_COERCE]], 1
+// CHECK-NEXT: store <1 x i64> [[VAL_COERCE_ELT2]], ptr [[VAL_REPACK1]], align 8
+// CHECK-NEXT: [[VAL_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[VAL]], i64 16
+// CHECK-NEXT: [[VAL_COERCE_ELT4:%.*]] = extractvalue [4 x <1 x i64>] [[VAL_COERCE]], 2
+// CHECK-NEXT: store <1 x i64> [[VAL_COERCE_ELT4]], ptr [[VAL_REPACK3]], align 8
+// CHECK-NEXT: [[VAL_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[VAL]], i64 24
+// CHECK-NEXT: [[VAL_COERCE_ELT6:%.*]] = extractvalue [4 x <1 x i64>] [[VAL_COERCE]], 3
+// CHECK-NEXT: store <1 x i64> [[VAL_COERCE_ELT6]], ptr [[VAL_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[__S1]], ptr noundef nonnull align 8 dereferenceable(32) [[VAL]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr [[ARRAYIDX3]], align 8
+// CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr [[ARRAYIDX5]], align 8
+// CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 24
+// CHECK-NEXT: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX7]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4.v1i64.p0(<1 x i64> [[TMP0]], <1 x i64> [[TMP1]], <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], ptr [[PTR]])
+// CHECK-NEXT: ret void
+//
void test_vst4_p64(poly64_t * ptr, poly64x1x4_t val) {
return vst4_p64(ptr, val);
}
-// CHECK-LABEL: define{{.*}} void @test_vst4q_p64(ptr noundef %ptr, [4 x <2 x i64>] alignstack(16) %val.coerce) #0 {
-// CHECK: [[VAL:%.*]] = alloca %struct.poly64x2x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly64x2x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x2x4_t, ptr [[VAL]], i32 0, i32 0
-// CHECK: store [4 x <2 x i64>] [[VAL]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[VAL]], i64 64, i1 false)
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL1]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL2:%.*]] = getelementptr inbounds nuw %struct.poly64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX3:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL2]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX3]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL4:%.*]] = getelementptr inbounds nuw %struct.poly64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX5:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL4]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <2 x i64>, ptr [[ARRAYIDX5]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL6:%.*]] = getelementptr inbounds nuw %struct.poly64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX7:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL6]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <2 x i64>, ptr [[ARRAYIDX7]], align 16
-// CHECK: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64>
-// CHECK: call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], ptr %ptr)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4q_p64(
+// CHECK-SAME: ptr noundef [[PTR:%.*]], [4 x <2 x i64>] alignstack(16) [[VAL_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VAL:%.*]] = alloca [[STRUCT_POLY64X2X4_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY64X2X4_T]], align 16
+// CHECK-NEXT: [[VAL_COERCE_ELT:%.*]] = extractvalue [4 x <2 x i64>] [[VAL_COERCE]], 0
+// CHECK-NEXT: store <2 x i64> [[VAL_COERCE_ELT]], ptr [[VAL]], align 16
+// CHECK-NEXT: [[VAL_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[VAL]], i64 16
+// CHECK-NEXT: [[VAL_COERCE_ELT2:%.*]] = extractvalue [4 x <2 x i64>] [[VAL_COERCE]], 1
+// CHECK-NEXT: store <2 x i64> [[VAL_COERCE_ELT2]], ptr [[VAL_REPACK1]], align 16
+// CHECK-NEXT: [[VAL_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[VAL]], i64 32
+// CHECK-NEXT: [[VAL_COERCE_ELT4:%.*]] = extractvalue [4 x <2 x i64>] [[VAL_COERCE]], 2
+// CHECK-NEXT: store <2 x i64> [[VAL_COERCE_ELT4]], ptr [[VAL_REPACK3]], align 16
+// CHECK-NEXT: [[VAL_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[VAL]], i64 48
+// CHECK-NEXT: [[VAL_COERCE_ELT6:%.*]] = extractvalue [4 x <2 x i64>] [[VAL_COERCE]], 3
+// CHECK-NEXT: store <2 x i64> [[VAL_COERCE_ELT6]], ptr [[VAL_REPACK5]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[__S1]], ptr noundef nonnull align 16 dereferenceable(64) [[VAL]], i64 64, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[ARRAYIDX3]], align 16
+// CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[ARRAYIDX5]], align 16
+// CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 48
+// CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX7]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], ptr [[PTR]])
+// CHECK-NEXT: ret void
+//
void test_vst4q_p64(poly64_t * ptr, poly64x2x4_t val) {
return vst4q_p64(ptr, val);
}
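A note on the two regenerated vst4 checks above: the coerced [4 x <1 x i64>] / [4 x <2 x i64>] argument is now repacked field by field (extractvalue plus byte-offset getelementptr) rather than stored as one aggregate, and the four vectors are loaded and handed straight to llvm.aarch64.neon.st4 with no <16 x i8> bitcast round-trip. For orientation, a minimal hypothetical sketch of the C-level usage these tests exercise; it is not part of the test file:

#include <arm_neon.h>

// vst4q_p64 stores the four source vectors interleaved:
// dst[0]=v.val[0][0], dst[1]=v.val[1][0], dst[2]=v.val[2][0],
// dst[3]=v.val[3][0], dst[4]=v.val[0][1], ... dst[7]=v.val[3][1].
void store_interleaved(poly64_t *dst, poly64x2x4_t v) {
  vst4q_p64(dst, v);
}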
-// CHECK-LABEL: define{{.*}} <1 x i64> @test_vext_p64(<1 x i64> noundef %a, <1 x i64> noundef %b) #0 {
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[VEXT:%.*]] = shufflevector <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
-// CHECK: ret <1 x i64> [[VEXT]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vext_p64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <1 x i64> [[A]]
+//
poly64x1_t test_vext_p64(poly64x1_t a, poly64x1_t b) {
return vext_u64(a, b, 0);
}
-// CHECK-LABEL: define{{.*}} <2 x i64> @test_vextq_p64(<2 x i64> noundef %a, <2 x i64> noundef %b) #0 {
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[VEXT:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i32> <i32 1, i32 2>
-// CHECK: ret <2 x i64> [[VEXT]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vextq_p64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> <i32 1, i32 2>
+// CHECK-NEXT: ret <2 x i64> [[VEXT]]
+//
poly64x2_t test_vextq_p64(poly64x2_t a, poly64x2_t b) {
return vextq_p64(a, b, 1);
}
-// CHECK-LABEL: define{{.*}} <2 x i64> @test_vzip1q_p64(<2 x i64> noundef %a, <2 x i64> noundef %b) #0 {
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
-// CHECK: ret <2 x i64> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vzip1q_p64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> <i32 0, i32 2>
+// CHECK-NEXT: ret <2 x i64> [[SHUFFLE_I]]
+//
poly64x2_t test_vzip1q_p64(poly64x2_t a, poly64x2_t b) {
return vzip1q_p64(a, b);
}
-// CHECK-LABEL: define{{.*}} <2 x i64> @test_vzip2q_p64(<2 x i64> noundef %a, <2 x i64> noundef %b) #0 {
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
-// CHECK: ret <2 x i64> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vzip2q_p64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> <i32 1, i32 3>
+// CHECK-NEXT: ret <2 x i64> [[SHUFFLE_I]]
+//
poly64x2_t test_vzip2q_p64(poly64x2_t a, poly64x2_t b) {
return vzip2q_u64(a, b);
}
-// CHECK-LABEL: define{{.*}} <2 x i64> @test_vuzp1q_p64(<2 x i64> noundef %a, <2 x i64> noundef %b) #0 {
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
-// CHECK: ret <2 x i64> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vuzp1q_p64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> <i32 0, i32 2>
+// CHECK-NEXT: ret <2 x i64> [[SHUFFLE_I]]
+//
poly64x2_t test_vuzp1q_p64(poly64x2_t a, poly64x2_t b) {
return vuzp1q_p64(a, b);
}
-// CHECK-LABEL: define{{.*}} <2 x i64> @test_vuzp2q_p64(<2 x i64> noundef %a, <2 x i64> noundef %b) #0 {
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
-// CHECK: ret <2 x i64> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vuzp2q_p64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> <i32 1, i32 3>
+// CHECK-NEXT: ret <2 x i64> [[SHUFFLE_I]]
+//
poly64x2_t test_vuzp2q_p64(poly64x2_t a, poly64x2_t b) {
return vuzp2q_u64(a, b);
}
-// CHECK-LABEL: define{{.*}} <2 x i64> @test_vtrn1q_p64(<2 x i64> noundef %a, <2 x i64> noundef %b) #0 {
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
-// CHECK: ret <2 x i64> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vtrn1q_p64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> <i32 0, i32 2>
+// CHECK-NEXT: ret <2 x i64> [[SHUFFLE_I]]
+//
poly64x2_t test_vtrn1q_p64(poly64x2_t a, poly64x2_t b) {
return vtrn1q_p64(a, b);
}
-// CHECK-LABEL: define{{.*}} <2 x i64> @test_vtrn2q_p64(<2 x i64> noundef %a, <2 x i64> noundef %b) #0 {
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
-// CHECK: ret <2 x i64> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vtrn2q_p64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> <i32 1, i32 3>
+// CHECK-NEXT: ret <2 x i64> [[SHUFFLE_I]]
+//
poly64x2_t test_vtrn2q_p64(poly64x2_t a, poly64x2_t b) {
return vtrn2q_u64(a, b);
}
-// CHECK-LABEL: define{{.*}} <1 x i64> @test_vsri_n_p64(<1 x i64> noundef %a, <1 x i64> noundef %b) #0 {
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VSRI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[VSRI_N2:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsri.v1i64(<1 x i64> [[VSRI_N]], <1 x i64> [[VSRI_N1]], i32 33)
-// CHECK: ret <1 x i64> [[VSRI_N2]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vsri_n_p64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRI_N2:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsri.v1i64(<1 x i64> [[A]], <1 x i64> [[B]], i32 33)
+// CHECK-NEXT: ret <1 x i64> [[VSRI_N2]]
+//
poly64x1_t test_vsri_n_p64(poly64x1_t a, poly64x1_t b) {
return vsri_n_p64(a, b, 33);
}
-// CHECK-LABEL: define{{.*}} <2 x i64> @test_vsriq_n_p64(<2 x i64> noundef %a, <2 x i64> noundef %b) #0 {
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VSRI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[VSRI_N2:%.*]] = call <2 x i64> @llvm.aarch64.neon.vsri.v2i64(<2 x i64> [[VSRI_N]], <2 x i64> [[VSRI_N1]], i32 64)
-// CHECK: ret <2 x i64> [[VSRI_N2]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vsriq_n_p64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRI_N2:%.*]] = call <2 x i64> @llvm.aarch64.neon.vsri.v2i64(<2 x i64> [[A]], <2 x i64> [[B]], i32 64)
+// CHECK-NEXT: ret <2 x i64> [[VSRI_N2]]
+//
poly64x2_t test_vsriq_n_p64(poly64x2_t a, poly64x2_t b) {
return vsriq_n_p64(a, b, 64);
}
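Summary of the poly64 updates above: the checks were regenerated in the UTC_ARGS --version 5 style (CHECK-SAME / CHECK-NEXT), and with instcombine in the opt pipeline the old bitcast round-trips fold away, so vext/vzip/vuzp/vtrn reduce to a single shufflevector and vsri calls the intrinsic directly on the <1 x i64> / <2 x i64> operands. A hypothetical example of the pattern test_vextq_p64 covers, assuming an AArch64 target with poly64 support:

#include <arm_neon.h>

// Swaps the two poly64 lanes: result = <a[1], a[0]>.
// After instcombine this is a single shufflevector, matching the
// regenerated check lines above.
poly64x2_t rotate_halves(poly64x2_t a) {
  return vextq_p64(a, a, 1);
}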
diff --git a/clang/test/CodeGen/AArch64/v8.1a-neon-intrinsics.c b/clang/test/CodeGen/AArch64/v8.1a-neon-intrinsics.c
index bc985efa6bc99b..d98a7da3e35874 100644
--- a/clang/test/CodeGen/AArch64/v8.1a-neon-intrinsics.c
+++ b/clang/test/CodeGen/AArch64/v8.1a-neon-intrinsics.c
@@ -1,6 +1,6 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
// RUN: %clang_cc1 -triple aarch64-linux-gnu -target-feature +neon \
-// RUN: -target-feature +v8.1a -emit-llvm -disable-O0-optnone -o - %s | opt -passes=mem2reg,dce -S | FileCheck %s
+// RUN: -target-feature +v8.1a -emit-llvm -disable-O0-optnone -o - %s | opt -passes=mem2reg,instcombine,dce -S | FileCheck %s
// REQUIRES: aarch64-registered-target
@@ -8,11 +8,9 @@
// CHECK-LABEL: @test_vqrdmlah_laneq_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
-// CHECK-NEXT: [[VQRDMLAH_V3_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlah.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[LANE]])
-// CHECK-NEXT: ret <4 x i16> [[VQRDMLAH_V3_I]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT: [[VQRDMLAH_S163_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlah.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[LANE]])
+// CHECK-NEXT: ret <4 x i16> [[VQRDMLAH_S163_I]]
//
int16x4_t test_vqrdmlah_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v) {
return vqrdmlah_laneq_s16(a, b, v, 7);
@@ -20,11 +18,9 @@ int16x4_t test_vqrdmlah_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v) {
// CHECK-LABEL: @test_vqrdmlah_laneq_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
-// CHECK-NEXT: [[VQRDMLAH_V3_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmlah.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[LANE]])
-// CHECK-NEXT: ret <2 x i32> [[VQRDMLAH_V3_I]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32> <i32 3, i32 3>
+// CHECK-NEXT: [[VQRDMLAH_S323_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmlah.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[LANE]])
+// CHECK-NEXT: ret <2 x i32> [[VQRDMLAH_S323_I]]
//
int32x2_t test_vqrdmlah_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v) {
return vqrdmlah_laneq_s32(a, b, v, 3);
@@ -32,11 +28,9 @@ int32x2_t test_vqrdmlah_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v) {
// CHECK-LABEL: @test_vqrdmlahq_laneq_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-// CHECK-NEXT: [[VQRDMLAHQ_V3_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmlah.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[LANE]])
-// CHECK-NEXT: ret <8 x i16> [[VQRDMLAHQ_V3_I]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT: [[VQRDMLAHQ_S163_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmlah.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[LANE]])
+// CHECK-NEXT: ret <8 x i16> [[VQRDMLAHQ_S163_I]]
//
int16x8_t test_vqrdmlahq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v) {
return vqrdmlahq_laneq_s16(a, b, v, 7);
@@ -44,11 +38,9 @@ int16x8_t test_vqrdmlahq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v) {
// CHECK-LABEL: @test_vqrdmlahq_laneq_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT: [[VQRDMLAHQ_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmlah.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[LANE]])
-// CHECK-NEXT: ret <4 x i32> [[VQRDMLAHQ_V3_I]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[VQRDMLAHQ_S323_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmlah.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[LANE]])
+// CHECK-NEXT: ret <4 x i32> [[VQRDMLAHQ_S323_I]]
//
int32x4_t test_vqrdmlahq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t v) {
return vqrdmlahq_laneq_s32(a, b, v, 3);
@@ -78,10 +70,9 @@ int32_t test_vqrdmlahs_s32(int32_t a, int32_t b, int32_t c) {
// CHECK-LABEL: @test_vqrdmlahh_lane_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[C:%.*]], i32 3
// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[A:%.*]], i64 0
// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[B:%.*]], i64 0
-// CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i16> poison, i16 [[VGET_LANE]], i64 0
+// CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i16> [[C:%.*]], <4 x i16> poison, <4 x i32> <i32 3, i32 poison, i32 poison, i32 poison>
// CHECK-NEXT: [[VQRDMLAHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlah.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]])
// CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i16> [[VQRDMLAHH_S16_I]], i64 0
// CHECK-NEXT: ret i16 [[TMP3]]
@@ -92,7 +83,7 @@ int16_t test_vqrdmlahh_lane_s16(int16_t a, int16_t b, int16x4_t c) {
// CHECK-LABEL: @test_vqrdmlahs_lane_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[C:%.*]], i32 1
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[C:%.*]], i64 1
// CHECK-NEXT: [[VQRDMLAHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmlah.i32(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[VGET_LANE]])
// CHECK-NEXT: ret i32 [[VQRDMLAHS_S32_I]]
//
@@ -102,10 +93,9 @@ int32_t test_vqrdmlahs_lane_s32(int32_t a, int32_t b, int32x2_t c) {
// CHECK-LABEL: @test_vqrdmlahh_laneq_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[C:%.*]], i32 7
// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[A:%.*]], i64 0
// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[B:%.*]], i64 0
-// CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i16> poison, i16 [[VGETQ_LANE]], i64 0
+// CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[C:%.*]], <8 x i16> poison, <4 x i32> <i32 7, i32 poison, i32 poison, i32 poison>
// CHECK-NEXT: [[VQRDMLAHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlah.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]])
// CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i16> [[VQRDMLAHH_S16_I]], i64 0
// CHECK-NEXT: ret i16 [[TMP3]]
@@ -116,7 +106,7 @@ int16_t test_vqrdmlahh_laneq_s16(int16_t a, int16_t b, int16x8_t c) {
// CHECK-LABEL: @test_vqrdmlahs_laneq_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 3
+// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[C:%.*]], i64 3
// CHECK-NEXT: [[VQRDMLAHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmlah.i32(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[VGETQ_LANE]])
// CHECK-NEXT: ret i32 [[VQRDMLAHS_S32_I]]
//
@@ -126,11 +116,9 @@ int32_t test_vqrdmlahs_laneq_s32(int32_t a, int32_t b, int32x4_t c) {
// CHECK-LABEL: @test_vqrdmlsh_laneq_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
-// CHECK-NEXT: [[VQRDMLSH_V3_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlsh.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[LANE]])
-// CHECK-NEXT: ret <4 x i16> [[VQRDMLSH_V3_I]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT: [[VQRDMLSH_S163_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlsh.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[LANE]])
+// CHECK-NEXT: ret <4 x i16> [[VQRDMLSH_S163_I]]
//
int16x4_t test_vqrdmlsh_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v) {
return vqrdmlsh_laneq_s16(a, b, v, 7);
@@ -138,11 +126,9 @@ int16x4_t test_vqrdmlsh_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v) {
// CHECK-LABEL: @test_vqrdmlsh_laneq_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
-// CHECK-NEXT: [[VQRDMLSH_V3_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmlsh.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[LANE]])
-// CHECK-NEXT: ret <2 x i32> [[VQRDMLSH_V3_I]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32> <i32 3, i32 3>
+// CHECK-NEXT: [[VQRDMLSH_S323_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmlsh.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[LANE]])
+// CHECK-NEXT: ret <2 x i32> [[VQRDMLSH_S323_I]]
//
int32x2_t test_vqrdmlsh_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v) {
return vqrdmlsh_laneq_s32(a, b, v, 3);
@@ -150,11 +136,9 @@ int32x2_t test_vqrdmlsh_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v) {
// CHECK-LABEL: @test_vqrdmlshq_laneq_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-// CHECK-NEXT: [[VQRDMLSHQ_V3_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmlsh.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[LANE]])
-// CHECK-NEXT: ret <8 x i16> [[VQRDMLSHQ_V3_I]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT: [[VQRDMLSHQ_S163_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmlsh.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[LANE]])
+// CHECK-NEXT: ret <8 x i16> [[VQRDMLSHQ_S163_I]]
//
int16x8_t test_vqrdmlshq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v) {
return vqrdmlshq_laneq_s16(a, b, v, 7);
@@ -162,11 +146,9 @@ int16x8_t test_vqrdmlshq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v) {
// CHECK-LABEL: @test_vqrdmlshq_laneq_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT: [[VQRDMLSHQ_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmlsh.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[LANE]])
-// CHECK-NEXT: ret <4 x i32> [[VQRDMLSHQ_V3_I]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[VQRDMLSHQ_S323_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmlsh.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[LANE]])
+// CHECK-NEXT: ret <4 x i32> [[VQRDMLSHQ_S323_I]]
//
int32x4_t test_vqrdmlshq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t v) {
return vqrdmlshq_laneq_s32(a, b, v, 3);
@@ -196,10 +178,9 @@ int32_t test_vqrdmlshs_s32(int32_t a, int32_t b, int32_t c) {
// CHECK-LABEL: @test_vqrdmlshh_lane_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[C:%.*]], i32 3
// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[A:%.*]], i64 0
// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[B:%.*]], i64 0
-// CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i16> poison, i16 [[VGET_LANE]], i64 0
+// CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i16> [[C:%.*]], <4 x i16> poison, <4 x i32> <i32 3, i32 poison, i32 poison, i32 poison>
// CHECK-NEXT: [[VQRDMLSHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlsh.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]])
// CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i16> [[VQRDMLSHH_S16_I]], i64 0
// CHECK-NEXT: ret i16 [[TMP3]]
@@ -210,7 +191,7 @@ int16_t test_vqrdmlshh_lane_s16(int16_t a, int16_t b, int16x4_t c) {
// CHECK-LABEL: @test_vqrdmlshs_lane_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[C:%.*]], i32 1
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[C:%.*]], i64 1
// CHECK-NEXT: [[VQRDMLSHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmlsh.i32(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[VGET_LANE]])
// CHECK-NEXT: ret i32 [[VQRDMLSHS_S32_I]]
//
@@ -220,10 +201,9 @@ int32_t test_vqrdmlshs_lane_s32(int32_t a, int32_t b, int32x2_t c) {
// CHECK-LABEL: @test_vqrdmlshh_laneq_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[C:%.*]], i32 7
// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[A:%.*]], i64 0
// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[B:%.*]], i64 0
-// CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i16> poison, i16 [[VGETQ_LANE]], i64 0
+// CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[C:%.*]], <8 x i16> poison, <4 x i32> <i32 7, i32 poison, i32 poison, i32 poison>
// CHECK-NEXT: [[VQRDMLSHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlsh.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]])
// CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i16> [[VQRDMLSHH_S16_I]], i64 0
// CHECK-NEXT: ret i16 [[TMP3]]
@@ -234,7 +214,7 @@ int16_t test_vqrdmlshh_laneq_s16(int16_t a, int16_t b, int16x8_t c) {
// CHECK-LABEL: @test_vqrdmlshs_laneq_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 3
+// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[C:%.*]], i64 3
// CHECK-NEXT: [[VQRDMLSHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmlsh.i32(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[VGETQ_LANE]])
// CHECK-NEXT: ret i32 [[VQRDMLSHS_S32_I]]
//
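End of the v8.1a hunks: adding instcombine to the RUN line lets the lane variants of sqrdmlah/sqrdmlsh splat directly from the source vector (shufflevector with a poison second operand) and turns scalar lane reads into extractelement with i64 indices. A short hypothetical usage sketch, assuming a target where the RDM instructions mandated by armv8.1-a are available:

#include <arm_neon.h>

// Saturating rounding doubling multiply-accumulate against lane 3 of v:
// each result lane is sat(a[i] + rdmulh(b[i], v[3])).
int32x4_t mla_by_lane3(int32x4_t a, int32x4_t b, int32x4_t v) {
  return vqrdmlahq_laneq_s32(a, b, v, 3);
}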
diff --git a/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics-constrained.c b/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics-constrained.c
index b51e6f7e6e1ac3..13456c08ce9d67 100644
--- a/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics-constrained.c
+++ b/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics-constrained.c
@@ -1,21 +1,13 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +fullfp16 -target-feature +v8.2a\
// RUN: -flax-vector-conversions=none -disable-O0-optnone -emit-llvm -o - %s \
-// RUN: | opt -S -passes=mem2reg \
-// RUN: | FileCheck --check-prefix=COMMON --check-prefix=COMMONIR --check-prefix=UNCONSTRAINED %s
+// RUN: | opt -S -passes=mem2reg,instcombine \
+// RUN: | FileCheck --check-prefix=UNCONSTRAINED %s
// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +fullfp16 -target-feature +v8.2a\
// RUN: -ffp-exception-behavior=maytrap -DEXCEPT=1 \
// RUN: -flax-vector-conversions=none -disable-O0-optnone -emit-llvm -o - %s \
-// RUN: | opt -S -passes=mem2reg \
-// RUN: | FileCheck --check-prefix=COMMON --check-prefix=COMMONIR --check-prefix=CONSTRAINED --implicit-check-not=fpexcept.maytrap %s
-// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +fullfp16 -target-feature +v8.2a\
-// RUN: -flax-vector-conversions=none -disable-O0-optnone -emit-llvm -o - %s \
-// RUN: | opt -S -passes=mem2reg | llc -o=- - \
-// RUN: | FileCheck --check-prefix=COMMON --check-prefix=CHECK-ASM %s
-// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +fullfp16 -target-feature +v8.2a\
-// RUN: -ffp-exception-behavior=maytrap -DEXCEPT=1 \
-// RUN: -flax-vector-conversions=none -disable-O0-optnone -emit-llvm -o - %s \
-// RUN: | opt -S -passes=mem2reg | llc -o=- - \
-// RUN: | FileCheck --check-prefix=COMMON --check-prefix=CHECK-ASM --implicit-check-not=fpexcept.maytrap %s
+// RUN: | opt -S -passes=mem2reg,instcombine \
+// RUN: | FileCheck --check-prefix=CONSTRAINED --implicit-check-not=fpexcept.maytrap %s
// REQUIRES: aarch64-registered-target
@@ -29,310 +21,418 @@
#include <arm_neon.h>
-// COMMON-LABEL: test_vsqrt_f16
-// UNCONSTRAINED: [[SQR:%.*]] = call <4 x half> @llvm.sqrt.v4f16(<4 x half> %a)
-// CONSTRAINED: [[SQR:%.*]] = call <4 x half> @llvm.experimental.constrained.sqrt.v4f16(<4 x half> %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK-ASM: fsqrt v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
-// COMMONIR: ret <4 x half> [[SQR]]
+// UNCONSTRAINED-LABEL: define dso_local <4 x half> @test_vsqrt_f16(
+// UNCONSTRAINED-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[VSQRT_I:%.*]] = call <4 x half> @llvm.sqrt.v4f16(<4 x half> [[A]])
+// UNCONSTRAINED-NEXT: ret <4 x half> [[VSQRT_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <4 x half> @test_vsqrt_f16(
+// CONSTRAINED-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[VSQRT_I:%.*]] = call <4 x half> @llvm.experimental.constrained.sqrt.v4f16(<4 x half> [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR2:[0-9]+]]
+// CONSTRAINED-NEXT: ret <4 x half> [[VSQRT_I]]
+//
float16x4_t test_vsqrt_f16(float16x4_t a) {
return vsqrt_f16(a);
}
-// COMMON-LABEL: test_vsqrtq_f16
-// UNCONSTRAINED: [[SQR:%.*]] = call <8 x half> @llvm.sqrt.v8f16(<8 x half> %a)
-// CONSTRAINED: [[SQR:%.*]] = call <8 x half> @llvm.experimental.constrained.sqrt.v8f16(<8 x half> %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK-ASM: fsqrt v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
-// COMMONIR: ret <8 x half> [[SQR]]
+// UNCONSTRAINED-LABEL: define dso_local <8 x half> @test_vsqrtq_f16(
+// UNCONSTRAINED-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[VSQRT_I:%.*]] = call <8 x half> @llvm.sqrt.v8f16(<8 x half> [[A]])
+// UNCONSTRAINED-NEXT: ret <8 x half> [[VSQRT_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <8 x half> @test_vsqrtq_f16(
+// CONSTRAINED-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[VSQRT_I:%.*]] = call <8 x half> @llvm.experimental.constrained.sqrt.v8f16(<8 x half> [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR2]]
+// CONSTRAINED-NEXT: ret <8 x half> [[VSQRT_I]]
+//
float16x8_t test_vsqrtq_f16(float16x8_t a) {
return vsqrtq_f16(a);
}
-// COMMON-LABEL: test_vfma_f16
-// UNCONSTRAINED: [[ADD:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> %b, <4 x half> %c, <4 x half> %a)
-// CONSTRAINED: [[ADD:%.*]] = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> %b, <4 x half> %c, <4 x half> %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK-ASM: fmla v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
-// COMMONIR: ret <4 x half> [[ADD]]
+// UNCONSTRAINED-LABEL: define dso_local <4 x half> @test_vfma_f16(
+// UNCONSTRAINED-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[B]], <4 x half> [[C]], <4 x half> [[A]])
+// UNCONSTRAINED-NEXT: ret <4 x half> [[TMP0]]
+//
+// CONSTRAINED-LABEL: define dso_local <4 x half> @test_vfma_f16(
+// CONSTRAINED-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> [[B]], <4 x half> [[C]], <4 x half> [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR2]]
+// CONSTRAINED-NEXT: ret <4 x half> [[TMP0]]
+//
float16x4_t test_vfma_f16(float16x4_t a, float16x4_t b, float16x4_t c) {
return vfma_f16(a, b, c);
}
-// COMMON-LABEL: test_vfmaq_f16
-// UNCONSTRAINED: [[ADD:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> %b, <8 x half> %c, <8 x half> %a)
-// CONSTRAINED: [[ADD:%.*]] = call <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half> %b, <8 x half> %c, <8 x half> %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK-ASM: fmla v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
-// COMMONIR: ret <8 x half> [[ADD]]
+// UNCONSTRAINED-LABEL: define dso_local <8 x half> @test_vfmaq_f16(
+// UNCONSTRAINED-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[B]], <8 x half> [[C]], <8 x half> [[A]])
+// UNCONSTRAINED-NEXT: ret <8 x half> [[TMP0]]
+//
+// CONSTRAINED-LABEL: define dso_local <8 x half> @test_vfmaq_f16(
+// CONSTRAINED-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half> [[B]], <8 x half> [[C]], <8 x half> [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR2]]
+// CONSTRAINED-NEXT: ret <8 x half> [[TMP0]]
+//
float16x8_t test_vfmaq_f16(float16x8_t a, float16x8_t b, float16x8_t c) {
return vfmaq_f16(a, b, c);
}
-// COMMON-LABEL: test_vfms_f16
-// COMMONIR: [[SUB:%.*]] = fneg <4 x half> %b
-// UNCONSTRAINED: [[ADD:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[SUB]], <4 x half> %c, <4 x half> %a)
-// CONSTRAINED: [[ADD:%.*]] = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> [[SUB]], <4 x half> %c, <4 x half> %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK-ASM: fmls v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
-// COMMONIR: ret <4 x half> [[ADD]]
+// UNCONSTRAINED-LABEL: define dso_local <4 x half> @test_vfms_f16(
+// UNCONSTRAINED-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[FNEG_I:%.*]] = fneg <4 x half> [[B]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[FNEG_I]], <4 x half> [[C]], <4 x half> [[A]])
+// UNCONSTRAINED-NEXT: ret <4 x half> [[TMP0]]
+//
+// CONSTRAINED-LABEL: define dso_local <4 x half> @test_vfms_f16(
+// CONSTRAINED-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[FNEG_I:%.*]] = fneg <4 x half> [[B]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> [[FNEG_I]], <4 x half> [[C]], <4 x half> [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR2]]
+// CONSTRAINED-NEXT: ret <4 x half> [[TMP0]]
+//
float16x4_t test_vfms_f16(float16x4_t a, float16x4_t b, float16x4_t c) {
return vfms_f16(a, b, c);
}
-// COMMON-LABEL: test_vfmsq_f16
-// COMMONIR: [[SUB:%.*]] = fneg <8 x half> %b
-// UNCONSTRAINED: [[ADD:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[SUB]], <8 x half> %c, <8 x half> %a)
-// CONSTRAINED: [[ADD:%.*]] = call <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half> [[SUB]], <8 x half> %c, <8 x half> %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK-ASM: fmls v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
-// COMMONIR: ret <8 x half> [[ADD]]
+// UNCONSTRAINED-LABEL: define dso_local <8 x half> @test_vfmsq_f16(
+// UNCONSTRAINED-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[FNEG_I:%.*]] = fneg <8 x half> [[B]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[FNEG_I]], <8 x half> [[C]], <8 x half> [[A]])
+// UNCONSTRAINED-NEXT: ret <8 x half> [[TMP0]]
+//
+// CONSTRAINED-LABEL: define dso_local <8 x half> @test_vfmsq_f16(
+// CONSTRAINED-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[FNEG_I:%.*]] = fneg <8 x half> [[B]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half> [[FNEG_I]], <8 x half> [[C]], <8 x half> [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR2]]
+// CONSTRAINED-NEXT: ret <8 x half> [[TMP0]]
+//
float16x8_t test_vfmsq_f16(float16x8_t a, float16x8_t b, float16x8_t c) {
return vfmsq_f16(a, b, c);
}
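A note on the vfms checks above: vfms_f16(a, b, c) computes a - b*c per lane with a single rounding, lowered as an fneg of b feeding one fma; under -ffp-exception-behavior=maytrap that becomes the llvm.experimental.constrained.fma form the CONSTRAINED lines check. A minimal hypothetical illustration, assuming fullfp16 support:

#include <arm_neon.h>

// Fused multiply-subtract: returns a - b*c per lane in one rounding step.
// Lowers to fneg + llvm.fma (constrained.fma in the strict-FP run).
float16x4_t fused_mulsub(float16x4_t a, float16x4_t b, float16x4_t c) {
  return vfms_f16(a, b, c);
}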
-// COMMON-LABEL: test_vfma_lane_f16
-// COMMONIR: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
-// COMMONIR: [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8>
-// COMMONIR: [[TMP2:%.*]] = bitcast <4 x half> %c to <8 x i8>
-// COMMONIR: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x half>
-// COMMONIR: [[LANE:%.*]] = shufflevector <4 x half> [[TMP3]], <4 x half> [[TMP3]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// COMMONIR: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
-// COMMONIR: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
-// UNCONSTRAINED: [[FMLA:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[TMP4]], <4 x half> [[LANE]], <4 x half> [[TMP5]])
-// CONSTRAINED: [[FMLA:%.*]] = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> [[TMP4]], <4 x half> [[LANE]], <4 x half> [[TMP5]], metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK-ASM: fmla v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.h[{{[0-9]+}}]
-// COMMONIR: ret <4 x half> [[FMLA]]
+// UNCONSTRAINED-LABEL: define dso_local <4 x half> @test_vfma_lane_f16(
+// UNCONSTRAINED-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[C]], <4 x half> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// UNCONSTRAINED-NEXT: [[FMLA2:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[B]], <4 x half> [[LANE]], <4 x half> [[A]])
+// UNCONSTRAINED-NEXT: ret <4 x half> [[FMLA2]]
+//
+// CONSTRAINED-LABEL: define dso_local <4 x half> @test_vfma_lane_f16(
+// CONSTRAINED-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[C]], <4 x half> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CONSTRAINED-NEXT: [[FMLA2:%.*]] = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> [[B]], <4 x half> [[LANE]], <4 x half> [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR2]]
+// CONSTRAINED-NEXT: ret <4 x half> [[FMLA2]]
+//
float16x4_t test_vfma_lane_f16(float16x4_t a, float16x4_t b, float16x4_t c) {
return vfma_lane_f16(a, b, c, 3);
}
-// COMMON-LABEL: test_vfmaq_lane_f16
-// COMMONIR: [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8>
-// COMMONIR: [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8>
-// COMMONIR: [[TMP2:%.*]] = bitcast <4 x half> %c to <8 x i8>
-// COMMONIR: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x half>
-// COMMONIR: [[LANE:%.*]] = shufflevector <4 x half> [[TMP3]], <4 x half> [[TMP3]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-// COMMONIR: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
-// COMMONIR: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
-// UNCONSTRAINED: [[FMLA:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[TMP4]], <8 x half> [[LANE]], <8 x half> [[TMP5]])
-// CONSTRAINED: [[FMLA:%.*]] = call <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half> [[TMP4]], <8 x half> [[LANE]], <8 x half> [[TMP5]], metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK-ASM: fmla v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.h[{{[0-9]+}}]
-// COMMONIR: ret <8 x half> [[FMLA]]
+// UNCONSTRAINED-LABEL: define dso_local <8 x half> @test_vfmaq_lane_f16(
+// UNCONSTRAINED-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[C]], <4 x half> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// UNCONSTRAINED-NEXT: [[FMLA2:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[B]], <8 x half> [[LANE]], <8 x half> [[A]])
+// UNCONSTRAINED-NEXT: ret <8 x half> [[FMLA2]]
+//
+// CONSTRAINED-LABEL: define dso_local <8 x half> @test_vfmaq_lane_f16(
+// CONSTRAINED-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[C]], <4 x half> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CONSTRAINED-NEXT: [[FMLA2:%.*]] = call <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half> [[B]], <8 x half> [[LANE]], <8 x half> [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR2]]
+// CONSTRAINED-NEXT: ret <8 x half> [[FMLA2]]
+//
float16x8_t test_vfmaq_lane_f16(float16x8_t a, float16x8_t b, float16x4_t c) {
return vfmaq_lane_f16(a, b, c, 3);
}
-// COMMON-LABEL: test_vfma_laneq_f16
-// COMMONIR: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
-// COMMONIR: [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8>
-// COMMONIR: [[TMP2:%.*]] = bitcast <8 x half> %c to <16 x i8>
-// COMMONIR: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
-// COMMONIR: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
-// COMMONIR: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x half>
-// COMMONIR: [[LANE:%.*]] = shufflevector <8 x half> [[TMP5]], <8 x half> [[TMP5]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
-// UNCONSTRAINED: [[FMLA:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[LANE]], <4 x half> [[TMP4]], <4 x half> [[TMP3]])
-// CONSTRAINED: [[FMLA:%.*]] = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> [[LANE]], <4 x half> [[TMP4]], <4 x half> [[TMP3]], metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK-ASM: fmla v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.h[{{[0-9]+}}]
-// COMMONIR: ret <4 x half> [[FMLA]]
+// UNCONSTRAINED-LABEL: define dso_local <4 x half> @test_vfma_laneq_f16(
+// UNCONSTRAINED-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[C]], <8 x half> poison, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[LANE]], <4 x half> [[B]], <4 x half> [[A]])
+// UNCONSTRAINED-NEXT: ret <4 x half> [[TMP0]]
+//
+// CONSTRAINED-LABEL: define dso_local <4 x half> @test_vfma_laneq_f16(
+// CONSTRAINED-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[C]], <8 x half> poison, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> [[LANE]], <4 x half> [[B]], <4 x half> [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR2]]
+// CONSTRAINED-NEXT: ret <4 x half> [[TMP0]]
+//
float16x4_t test_vfma_laneq_f16(float16x4_t a, float16x4_t b, float16x8_t c) {
return vfma_laneq_f16(a, b, c, 7);
}
-// COMMON-LABEL: test_vfmaq_laneq_f16
-// COMMONIR: [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8>
-// COMMONIR: [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8>
-// COMMONIR: [[TMP2:%.*]] = bitcast <8 x half> %c to <16 x i8>
-// COMMONIR: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
-// COMMONIR: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
-// COMMONIR: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x half>
-// COMMONIR: [[LANE:%.*]] = shufflevector <8 x half> [[TMP5]], <8 x half> [[TMP5]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-// UNCONSTRAINED: [[FMLA:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[LANE]], <8 x half> [[TMP4]], <8 x half> [[TMP3]])
-// CONSTRAINED: [[FMLA:%.*]] = call <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half> [[LANE]], <8 x half> [[TMP4]], <8 x half> [[TMP3]], metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK-ASM: fmla v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.h[{{[0-9]+}}]
-// COMMONIR: ret <8 x half> [[FMLA]]
+// UNCONSTRAINED-LABEL: define dso_local <8 x half> @test_vfmaq_laneq_f16(
+// UNCONSTRAINED-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[C]], <8 x half> poison, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[LANE]], <8 x half> [[B]], <8 x half> [[A]])
+// UNCONSTRAINED-NEXT: ret <8 x half> [[TMP0]]
+//
+// CONSTRAINED-LABEL: define dso_local <8 x half> @test_vfmaq_laneq_f16(
+// CONSTRAINED-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[C]], <8 x half> poison, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half> [[LANE]], <8 x half> [[B]], <8 x half> [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR2]]
+// CONSTRAINED-NEXT: ret <8 x half> [[TMP0]]
+//
float16x8_t test_vfmaq_laneq_f16(float16x8_t a, float16x8_t b, float16x8_t c) {
return vfmaq_laneq_f16(a, b, c, 7);
}
-// COMMON-LABEL: test_vfma_n_f16
-// COMMONIR: [[TMP0:%.*]] = insertelement <4 x half> poison, half %c, i32 0
-// COMMONIR: [[TMP1:%.*]] = insertelement <4 x half> [[TMP0]], half %c, i32 1
-// COMMONIR: [[TMP2:%.*]] = insertelement <4 x half> [[TMP1]], half %c, i32 2
-// COMMONIR: [[TMP3:%.*]] = insertelement <4 x half> [[TMP2]], half %c, i32 3
-// UNCONSTRAINED: [[FMA:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> %b, <4 x half> [[TMP3]], <4 x half> %a)
-// CONSTRAINED: [[FMA:%.*]] = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> %b, <4 x half> [[TMP3]], <4 x half> %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK-ASM: fmla v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.h[{{[0-9]+}}]
-// COMMONIR: ret <4 x half> [[FMA]]
+// UNCONSTRAINED-LABEL: define dso_local <4 x half> @test_vfma_n_f16(
+// UNCONSTRAINED-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], half noundef [[C:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[C]], i64 0
+// UNCONSTRAINED-NEXT: [[VECINIT3:%.*]] = shufflevector <4 x half> [[VECINIT]], <4 x half> poison, <4 x i32> zeroinitializer
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[B]], <4 x half> [[VECINIT3]], <4 x half> [[A]])
+// UNCONSTRAINED-NEXT: ret <4 x half> [[TMP0]]
+//
+// CONSTRAINED-LABEL: define dso_local <4 x half> @test_vfma_n_f16(
+// CONSTRAINED-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], half noundef [[C:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[C]], i64 0
+// CONSTRAINED-NEXT: [[VECINIT3:%.*]] = shufflevector <4 x half> [[VECINIT]], <4 x half> poison, <4 x i32> zeroinitializer
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> [[B]], <4 x half> [[VECINIT3]], <4 x half> [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR2]]
+// CONSTRAINED-NEXT: ret <4 x half> [[TMP0]]
+//
float16x4_t test_vfma_n_f16(float16x4_t a, float16x4_t b, float16_t c) {
return vfma_n_f16(a, b, c);
}
-// COMMON-LABEL: test_vfmaq_n_f16
-// COMMONIR: [[TMP0:%.*]] = insertelement <8 x half> poison, half %c, i32 0
-// COMMONIR: [[TMP1:%.*]] = insertelement <8 x half> [[TMP0]], half %c, i32 1
-// COMMONIR: [[TMP2:%.*]] = insertelement <8 x half> [[TMP1]], half %c, i32 2
-// COMMONIR: [[TMP3:%.*]] = insertelement <8 x half> [[TMP2]], half %c, i32 3
-// COMMONIR: [[TMP4:%.*]] = insertelement <8 x half> [[TMP3]], half %c, i32 4
-// COMMONIR: [[TMP5:%.*]] = insertelement <8 x half> [[TMP4]], half %c, i32 5
-// COMMONIR: [[TMP6:%.*]] = insertelement <8 x half> [[TMP5]], half %c, i32 6
-// COMMONIR: [[TMP7:%.*]] = insertelement <8 x half> [[TMP6]], half %c, i32 7
-// UNCONSTRAINED: [[FMA:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> %b, <8 x half> [[TMP7]], <8 x half> %a)
-// CONSTRAINED: [[FMA:%.*]] = call <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half> %b, <8 x half> [[TMP7]], <8 x half> %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK-ASM: fmla v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.h[{{[0-9]+}}]
-// COMMONIR: ret <8 x half> [[FMA]]
+// UNCONSTRAINED-LABEL: define dso_local <8 x half> @test_vfmaq_n_f16(
+// UNCONSTRAINED-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], half noundef [[C:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[C]], i64 0
+// UNCONSTRAINED-NEXT: [[VECINIT7:%.*]] = shufflevector <8 x half> [[VECINIT]], <8 x half> poison, <8 x i32> zeroinitializer
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[B]], <8 x half> [[VECINIT7]], <8 x half> [[A]])
+// UNCONSTRAINED-NEXT: ret <8 x half> [[TMP0]]
+//
+// CONSTRAINED-LABEL: define dso_local <8 x half> @test_vfmaq_n_f16(
+// CONSTRAINED-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], half noundef [[C:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[C]], i64 0
+// CONSTRAINED-NEXT: [[VECINIT7:%.*]] = shufflevector <8 x half> [[VECINIT]], <8 x half> poison, <8 x i32> zeroinitializer
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half> [[B]], <8 x half> [[VECINIT7]], <8 x half> [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR2]]
+// CONSTRAINED-NEXT: ret <8 x half> [[TMP0]]
+//
float16x8_t test_vfmaq_n_f16(float16x8_t a, float16x8_t b, float16_t c) {
return vfmaq_n_f16(a, b, c);
}
-// COMMON-LABEL: test_vfmah_lane_f16
-// COMMONIR: [[EXTR:%.*]] = extractelement <4 x half> %c, i32 3
-// UNCONSTRAINED: [[FMA:%.*]] = call half @llvm.fma.f16(half %b, half [[EXTR]], half %a)
-// CONSTRAINED: [[FMA:%.*]] = call half @llvm.experimental.constrained.fma.f16(half %b, half [[EXTR]], half %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK-ASM: fmla h{{[0-9]+}}, h{{[0-9]+}}, v{{[0-9]+}}.h[{{[0-9]+}}]
-// COMMONIR: ret half [[FMA]]
+// UNCONSTRAINED-LABEL: define dso_local half @test_vfmah_lane_f16(
+// UNCONSTRAINED-SAME: half noundef [[A:%.*]], half noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[EXTRACT:%.*]] = extractelement <4 x half> [[C]], i64 3
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = call half @llvm.fma.f16(half [[B]], half [[EXTRACT]], half [[A]])
+// UNCONSTRAINED-NEXT: ret half [[TMP0]]
+//
+// CONSTRAINED-LABEL: define dso_local half @test_vfmah_lane_f16(
+// CONSTRAINED-SAME: half noundef [[A:%.*]], half noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[EXTRACT:%.*]] = extractelement <4 x half> [[C]], i64 3
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call half @llvm.experimental.constrained.fma.f16(half [[B]], half [[EXTRACT]], half [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR2]]
+// CONSTRAINED-NEXT: ret half [[TMP0]]
+//
float16_t test_vfmah_lane_f16(float16_t a, float16_t b, float16x4_t c) {
return vfmah_lane_f16(a, b, c, 3);
}
-// COMMON-LABEL: test_vfmah_laneq_f16
-// COMMONIR: [[EXTR:%.*]] = extractelement <8 x half> %c, i32 7
-// UNCONSTRAINED: [[FMA:%.*]] = call half @llvm.fma.f16(half %b, half [[EXTR]], half %a)
-// CONSTRAINED: [[FMA:%.*]] = call half @llvm.experimental.constrained.fma.f16(half %b, half [[EXTR]], half %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK-ASM: fmla h{{[0-9]+}}, h{{[0-9]+}}, v{{[0-9]+}}.h[{{[0-9]+}}]
-// COMMONIR: ret half [[FMA]]
+// UNCONSTRAINED-LABEL: define dso_local half @test_vfmah_laneq_f16(
+// UNCONSTRAINED-SAME: half noundef [[A:%.*]], half noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[EXTRACT:%.*]] = extractelement <8 x half> [[C]], i64 7
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = call half @llvm.fma.f16(half [[B]], half [[EXTRACT]], half [[A]])
+// UNCONSTRAINED-NEXT: ret half [[TMP0]]
+//
+// CONSTRAINED-LABEL: define dso_local half @test_vfmah_laneq_f16(
+// CONSTRAINED-SAME: half noundef [[A:%.*]], half noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[EXTRACT:%.*]] = extractelement <8 x half> [[C]], i64 7
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call half @llvm.experimental.constrained.fma.f16(half [[B]], half [[EXTRACT]], half [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR2]]
+// CONSTRAINED-NEXT: ret half [[TMP0]]
+//
float16_t test_vfmah_laneq_f16(float16_t a, float16_t b, float16x8_t c) {
return vfmah_laneq_f16(a, b, c, 7);
}
-// COMMON-LABEL: test_vfms_lane_f16
-// COMMONIR: [[SUB:%.*]] = fneg <4 x half> %b
-// COMMONIR: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
-// COMMONIR: [[TMP1:%.*]] = bitcast <4 x half> [[SUB]] to <8 x i8>
-// COMMONIR: [[TMP2:%.*]] = bitcast <4 x half> %c to <8 x i8>
-// COMMONIR: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x half>
-// COMMONIR: [[LANE:%.*]] = shufflevector <4 x half> [[TMP3]], <4 x half> [[TMP3]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// COMMONIR: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
-// COMMONIR: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
-// UNCONSTRAINED: [[FMA:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[TMP4]], <4 x half> [[LANE]], <4 x half> [[TMP5]])
-// CONSTRAINED: [[FMA:%.*]] = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> [[TMP4]], <4 x half> [[LANE]], <4 x half> [[TMP5]], metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK-ASM: fmls v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.h[{{[0-9]+}}]
-// COMMONIR: ret <4 x half> [[FMA]]
+// UNCONSTRAINED-LABEL: define dso_local <4 x half> @test_vfms_lane_f16(
+// UNCONSTRAINED-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[FNEG:%.*]] = fneg <4 x half> [[B]]
+// UNCONSTRAINED-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[C]], <4 x half> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// UNCONSTRAINED-NEXT: [[FMLA2:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[FNEG]], <4 x half> [[LANE]], <4 x half> [[A]])
+// UNCONSTRAINED-NEXT: ret <4 x half> [[FMLA2]]
+//
+// CONSTRAINED-LABEL: define dso_local <4 x half> @test_vfms_lane_f16(
+// CONSTRAINED-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[FNEG:%.*]] = fneg <4 x half> [[B]]
+// CONSTRAINED-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[C]], <4 x half> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CONSTRAINED-NEXT: [[FMLA2:%.*]] = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> [[FNEG]], <4 x half> [[LANE]], <4 x half> [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR2]]
+// CONSTRAINED-NEXT: ret <4 x half> [[FMLA2]]
+//
float16x4_t test_vfms_lane_f16(float16x4_t a, float16x4_t b, float16x4_t c) {
return vfms_lane_f16(a, b, c, 3);
}
-// COMMON-LABEL: test_vfmsq_lane_f16
-// COMMONIR: [[SUB:%.*]] = fneg <8 x half> %b
-// COMMONIR: [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8>
-// COMMONIR: [[TMP1:%.*]] = bitcast <8 x half> [[SUB]] to <16 x i8>
-// COMMONIR: [[TMP2:%.*]] = bitcast <4 x half> %c to <8 x i8>
-// COMMONIR: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x half>
-// COMMONIR: [[LANE:%.*]] = shufflevector <4 x half> [[TMP3]], <4 x half> [[TMP3]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-// COMMONIR: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
-// COMMONIR: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
-// UNCONSTRAINED: [[FMLA:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[TMP4]], <8 x half> [[LANE]], <8 x half> [[TMP5]])
-// CONSTRAINED: [[FMLA:%.*]] = call <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half> [[TMP4]], <8 x half> [[LANE]], <8 x half> [[TMP5]], metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK-ASM: fmls v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.h[{{[0-9]+}}]
-// COMMONIR: ret <8 x half> [[FMLA]]
+// UNCONSTRAINED-LABEL: define dso_local <8 x half> @test_vfmsq_lane_f16(
+// UNCONSTRAINED-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[FNEG:%.*]] = fneg <8 x half> [[B]]
+// UNCONSTRAINED-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[C]], <4 x half> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// UNCONSTRAINED-NEXT: [[FMLA2:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[FNEG]], <8 x half> [[LANE]], <8 x half> [[A]])
+// UNCONSTRAINED-NEXT: ret <8 x half> [[FMLA2]]
+//
+// CONSTRAINED-LABEL: define dso_local <8 x half> @test_vfmsq_lane_f16(
+// CONSTRAINED-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[FNEG:%.*]] = fneg <8 x half> [[B]]
+// CONSTRAINED-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[C]], <4 x half> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CONSTRAINED-NEXT: [[FMLA2:%.*]] = call <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half> [[FNEG]], <8 x half> [[LANE]], <8 x half> [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR2]]
+// CONSTRAINED-NEXT: ret <8 x half> [[FMLA2]]
+//
float16x8_t test_vfmsq_lane_f16(float16x8_t a, float16x8_t b, float16x4_t c) {
return vfmsq_lane_f16(a, b, c, 3);
}
-// COMMON-LABEL: test_vfms_laneq_f16
-// COMMONIR: [[SUB:%.*]] = fneg <4 x half> %b
-// CHECK-ASM-NOT: fneg
-// COMMONIR: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
-// COMMONIR: [[TMP1:%.*]] = bitcast <4 x half> [[SUB]] to <8 x i8>
-// COMMONIR: [[TMP2:%.*]] = bitcast <8 x half> %c to <16 x i8>
-// COMMONIR: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
-// COMMONIR: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
-// COMMONIR: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x half>
-// COMMONIR: [[LANE:%.*]] = shufflevector <8 x half> [[TMP5]], <8 x half> [[TMP5]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
-// UNCONSTRAINED: [[FMLA:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[LANE]], <4 x half> [[TMP4]], <4 x half> [[TMP3]])
-// CONSTRAINED: [[FMLA:%.*]] = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> [[LANE]], <4 x half> [[TMP4]], <4 x half> [[TMP3]], metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK-ASM: fmls v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.h[{{[0-9]+}}]
-// COMMONIR: ret <4 x half> [[FMLA]]
+// UNCONSTRAINED-LABEL: define dso_local <4 x half> @test_vfms_laneq_f16(
+// UNCONSTRAINED-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[FNEG:%.*]] = fneg <4 x half> [[B]]
+// UNCONSTRAINED-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[C]], <8 x half> poison, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[LANE]], <4 x half> [[FNEG]], <4 x half> [[A]])
+// UNCONSTRAINED-NEXT: ret <4 x half> [[TMP0]]
+//
+// CONSTRAINED-LABEL: define dso_local <4 x half> @test_vfms_laneq_f16(
+// CONSTRAINED-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[FNEG:%.*]] = fneg <4 x half> [[B]]
+// CONSTRAINED-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[C]], <8 x half> poison, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> [[LANE]], <4 x half> [[FNEG]], <4 x half> [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR2]]
+// CONSTRAINED-NEXT: ret <4 x half> [[TMP0]]
+//
float16x4_t test_vfms_laneq_f16(float16x4_t a, float16x4_t b, float16x8_t c) {
return vfms_laneq_f16(a, b, c, 7);
}
-// COMMON-LABEL: test_vfmsq_laneq_f16
-// COMMONIR: [[SUB:%.*]] = fneg <8 x half> %b
-// CHECK-ASM-NOT: fneg
-// COMMONIR: [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8>
-// COMMONIR: [[TMP1:%.*]] = bitcast <8 x half> [[SUB]] to <16 x i8>
-// COMMONIR: [[TMP2:%.*]] = bitcast <8 x half> %c to <16 x i8>
-// COMMONIR: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
-// COMMONIR: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
-// COMMONIR: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x half>
-// COMMONIR: [[LANE:%.*]] = shufflevector <8 x half> [[TMP5]], <8 x half> [[TMP5]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-// UNCONSTRAINED: [[FMLA:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[LANE]], <8 x half> [[TMP4]], <8 x half> [[TMP3]])
-// CONSTRAINED: [[FMLA:%.*]] = call <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half> [[LANE]], <8 x half> [[TMP4]], <8 x half> [[TMP3]], metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK-ASM: fmls v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.h[{{[0-9]+}}]
-// COMMONIR: ret <8 x half> [[FMLA]]
+// UNCONSTRAINED-LABEL: define dso_local <8 x half> @test_vfmsq_laneq_f16(
+// UNCONSTRAINED-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[FNEG:%.*]] = fneg <8 x half> [[B]]
+// UNCONSTRAINED-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[C]], <8 x half> poison, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[LANE]], <8 x half> [[FNEG]], <8 x half> [[A]])
+// UNCONSTRAINED-NEXT: ret <8 x half> [[TMP0]]
+//
+// CONSTRAINED-LABEL: define dso_local <8 x half> @test_vfmsq_laneq_f16(
+// CONSTRAINED-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[FNEG:%.*]] = fneg <8 x half> [[B]]
+// CONSTRAINED-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[C]], <8 x half> poison, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half> [[LANE]], <8 x half> [[FNEG]], <8 x half> [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR2]]
+// CONSTRAINED-NEXT: ret <8 x half> [[TMP0]]
+//
float16x8_t test_vfmsq_laneq_f16(float16x8_t a, float16x8_t b, float16x8_t c) {
return vfmsq_laneq_f16(a, b, c, 7);
}
-// COMMON-LABEL: test_vfms_n_f16
-// COMMONIR: [[SUB:%.*]] = fneg <4 x half> %b
-// COMMONIR: [[TMP0:%.*]] = insertelement <4 x half> poison, half %c, i32 0
-// COMMONIR: [[TMP1:%.*]] = insertelement <4 x half> [[TMP0]], half %c, i32 1
-// COMMONIR: [[TMP2:%.*]] = insertelement <4 x half> [[TMP1]], half %c, i32 2
-// COMMONIR: [[TMP3:%.*]] = insertelement <4 x half> [[TMP2]], half %c, i32 3
-// UNCONSTRAINED: [[FMA:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[SUB]], <4 x half> [[TMP3]], <4 x half> %a)
-// CONSTRAINED: [[FMA:%.*]] = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> [[SUB]], <4 x half> [[TMP3]], <4 x half> %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK-ASM: fmls v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.h[{{[0-9]+}}]
-// COMMONIR: ret <4 x half> [[FMA]]
+// UNCONSTRAINED-LABEL: define dso_local <4 x half> @test_vfms_n_f16(
+// UNCONSTRAINED-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], half noundef [[C:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[FNEG:%.*]] = fneg <4 x half> [[B]]
+// UNCONSTRAINED-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[C]], i64 0
+// UNCONSTRAINED-NEXT: [[VECINIT3:%.*]] = shufflevector <4 x half> [[VECINIT]], <4 x half> poison, <4 x i32> zeroinitializer
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[FNEG]], <4 x half> [[VECINIT3]], <4 x half> [[A]])
+// UNCONSTRAINED-NEXT: ret <4 x half> [[TMP0]]
+//
+// CONSTRAINED-LABEL: define dso_local <4 x half> @test_vfms_n_f16(
+// CONSTRAINED-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], half noundef [[C:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[FNEG:%.*]] = fneg <4 x half> [[B]]
+// CONSTRAINED-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[C]], i64 0
+// CONSTRAINED-NEXT: [[VECINIT3:%.*]] = shufflevector <4 x half> [[VECINIT]], <4 x half> poison, <4 x i32> zeroinitializer
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> [[FNEG]], <4 x half> [[VECINIT3]], <4 x half> [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR2]]
+// CONSTRAINED-NEXT: ret <4 x half> [[TMP0]]
+//
float16x4_t test_vfms_n_f16(float16x4_t a, float16x4_t b, float16_t c) {
return vfms_n_f16(a, b, c);
}
-// COMMON-LABEL: test_vfmsq_n_f16
-// COMMONIR: [[SUB:%.*]] = fneg <8 x half> %b
-// COMMONIR: [[TMP0:%.*]] = insertelement <8 x half> poison, half %c, i32 0
-// COMMONIR: [[TMP1:%.*]] = insertelement <8 x half> [[TMP0]], half %c, i32 1
-// COMMONIR: [[TMP2:%.*]] = insertelement <8 x half> [[TMP1]], half %c, i32 2
-// COMMONIR: [[TMP3:%.*]] = insertelement <8 x half> [[TMP2]], half %c, i32 3
-// COMMONIR: [[TMP4:%.*]] = insertelement <8 x half> [[TMP3]], half %c, i32 4
-// COMMONIR: [[TMP5:%.*]] = insertelement <8 x half> [[TMP4]], half %c, i32 5
-// COMMONIR: [[TMP6:%.*]] = insertelement <8 x half> [[TMP5]], half %c, i32 6
-// COMMONIR: [[TMP7:%.*]] = insertelement <8 x half> [[TMP6]], half %c, i32 7
-// UNCONSTRAINED: [[FMA:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[SUB]], <8 x half> [[TMP7]], <8 x half> %a)
-// CONSTRAINED: [[FMA:%.*]] = call <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half> [[SUB]], <8 x half> [[TMP7]], <8 x half> %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK-ASM: fmls v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.h[{{[0-9]+}}]
-// COMMONIR: ret <8 x half> [[FMA]]
+// UNCONSTRAINED-LABEL: define dso_local <8 x half> @test_vfmsq_n_f16(
+// UNCONSTRAINED-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], half noundef [[C:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[FNEG:%.*]] = fneg <8 x half> [[B]]
+// UNCONSTRAINED-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[C]], i64 0
+// UNCONSTRAINED-NEXT: [[VECINIT7:%.*]] = shufflevector <8 x half> [[VECINIT]], <8 x half> poison, <8 x i32> zeroinitializer
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[FNEG]], <8 x half> [[VECINIT7]], <8 x half> [[A]])
+// UNCONSTRAINED-NEXT: ret <8 x half> [[TMP0]]
+//
+// CONSTRAINED-LABEL: define dso_local <8 x half> @test_vfmsq_n_f16(
+// CONSTRAINED-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], half noundef [[C:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[FNEG:%.*]] = fneg <8 x half> [[B]]
+// CONSTRAINED-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[C]], i64 0
+// CONSTRAINED-NEXT: [[VECINIT7:%.*]] = shufflevector <8 x half> [[VECINIT]], <8 x half> poison, <8 x i32> zeroinitializer
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half> [[FNEG]], <8 x half> [[VECINIT7]], <8 x half> [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR2]]
+// CONSTRAINED-NEXT: ret <8 x half> [[TMP0]]
+//
float16x8_t test_vfmsq_n_f16(float16x8_t a, float16x8_t b, float16_t c) {
return vfmsq_n_f16(a, b, c);
}
-// COMMON-LABEL: test_vfmsh_lane_f16
-// UNCONSTRAINED: [[TMP0:%.*]] = fpext half %b to float
-// CONSTRAINED: [[TMP0:%.*]] = call float @llvm.experimental.constrained.fpext.f32.f16(half %b, metadata !"fpexcept.strict")
-// CHECK-ASM: fcvt s{{[0-9]+}}, h{{[0-9]+}}
-// COMMONIR: [[TMP1:%.*]] = fneg float [[TMP0]]
-// CHECK-ASM: fneg s{{[0-9]+}}, s{{[0-9]+}}
-// UNCONSTRAINED: [[SUB:%.*]] = fptrunc float [[TMP1]] to half
-// CONSTRAINED: [[SUB:%.*]] = call half @llvm.experimental.constrained.fptrunc.f16.f32(float [[TMP1]], metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK-ASM: fcvt h{{[0-9]+}}, s{{[0-9]+}}
-// COMMONIR: [[EXTR:%.*]] = extractelement <4 x half> %c, i32 3
-// UNCONSTRAINED: [[FMA:%.*]] = call half @llvm.fma.f16(half [[SUB]], half [[EXTR]], half %a)
-// CONSTRAINED: [[FMA:%.*]] = call half @llvm.experimental.constrained.fma.f16(half [[SUB]], half [[EXTR]], half %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK-ASM: fmla h{{[0-9]+}}, h{{[0-9]+}}, v{{[0-9]+}}.h[{{[0-9]+}}]
-// COMMONIR: ret half [[FMA]]
+// UNCONSTRAINED-LABEL: define dso_local half @test_vfmsh_lane_f16(
+// UNCONSTRAINED-SAME: half noundef [[A:%.*]], half noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = fneg half [[B]]
+// UNCONSTRAINED-NEXT: [[EXTRACT:%.*]] = extractelement <4 x half> [[C]], i64 3
+// UNCONSTRAINED-NEXT: [[TMP1:%.*]] = call half @llvm.fma.f16(half [[TMP0]], half [[EXTRACT]], half [[A]])
+// UNCONSTRAINED-NEXT: ret half [[TMP1]]
+//
+// CONSTRAINED-LABEL: define dso_local half @test_vfmsh_lane_f16(
+// CONSTRAINED-SAME: half noundef [[A:%.*]], half noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[CONV:%.*]] = call float @llvm.experimental.constrained.fpext.f32.f16(half [[B]], metadata !"fpexcept.strict") #[[ATTR2]]
+// CONSTRAINED-NEXT: [[FNEG:%.*]] = fneg float [[CONV]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call half @llvm.experimental.constrained.fptrunc.f16.f32(float [[FNEG]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR2]]
+// CONSTRAINED-NEXT: [[EXTRACT:%.*]] = extractelement <4 x half> [[C]], i64 3
+// CONSTRAINED-NEXT: [[TMP1:%.*]] = call half @llvm.experimental.constrained.fma.f16(half [[TMP0]], half [[EXTRACT]], half [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR2]]
+// CONSTRAINED-NEXT: ret half [[TMP1]]
+//
float16_t test_vfmsh_lane_f16(float16_t a, float16_t b, float16x4_t c) {
return vfmsh_lane_f16(a, b, c, 3);
}
-// COMMON-LABEL: test_vfmsh_laneq_f16
-// UNCONSTRAINED: [[TMP0:%.*]] = fpext half %b to float
-// CONSTRAINED: [[TMP0:%.*]] = call float @llvm.experimental.constrained.fpext.f32.f16(half %b, metadata !"fpexcept.strict")
-// CHECK-ASM: fcvt s{{[0-9]+}}, h{{[0-9]+}}
-// COMMONIR: [[TMP1:%.*]] = fneg float [[TMP0]]
-// CHECK-ASM: fneg s{{[0-9]+}}, s{{[0-9]+}}
-// UNCONSTRAINED: [[SUB:%.*]] = fptrunc float [[TMP1]] to half
-// CONSTRAINED: [[SUB:%.*]] = call half @llvm.experimental.constrained.fptrunc.f16.f32(float [[TMP1]], metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK-ASM: fcvt h{{[0-9]+}}, s{{[0-9]+}}
-// COMMONIR: [[EXTR:%.*]] = extractelement <8 x half> %c, i32 7
-// UNCONSTRAINED: [[FMA:%.*]] = call half @llvm.fma.f16(half [[SUB]], half [[EXTR]], half %a)
-// CONSTRAINED: [[FMA:%.*]] = call half @llvm.experimental.constrained.fma.f16(half [[SUB]], half [[EXTR]], half %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK-ASM: fmla h{{[0-9]+}}, h{{[0-9]+}}, v{{[0-9]+}}.h[{{[0-9]+}}]
-// COMMONIR: ret half [[FMA]]
+// UNCONSTRAINED-LABEL: define dso_local half @test_vfmsh_laneq_f16(
+// UNCONSTRAINED-SAME: half noundef [[A:%.*]], half noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = fneg half [[B]]
+// UNCONSTRAINED-NEXT: [[EXTRACT:%.*]] = extractelement <8 x half> [[C]], i64 7
+// UNCONSTRAINED-NEXT: [[TMP1:%.*]] = call half @llvm.fma.f16(half [[TMP0]], half [[EXTRACT]], half [[A]])
+// UNCONSTRAINED-NEXT: ret half [[TMP1]]
+//
+// CONSTRAINED-LABEL: define dso_local half @test_vfmsh_laneq_f16(
+// CONSTRAINED-SAME: half noundef [[A:%.*]], half noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[CONV:%.*]] = call float @llvm.experimental.constrained.fpext.f32.f16(half [[B]], metadata !"fpexcept.strict") #[[ATTR2]]
+// CONSTRAINED-NEXT: [[FNEG:%.*]] = fneg float [[CONV]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call half @llvm.experimental.constrained.fptrunc.f16.f32(float [[FNEG]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR2]]
+// CONSTRAINED-NEXT: [[EXTRACT:%.*]] = extractelement <8 x half> [[C]], i64 7
+// CONSTRAINED-NEXT: [[TMP1:%.*]] = call half @llvm.experimental.constrained.fma.f16(half [[TMP0]], half [[EXTRACT]], half [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR2]]
+// CONSTRAINED-NEXT: ret half [[TMP1]]
+//
float16_t test_vfmsh_laneq_f16(float16_t a, float16_t b, float16x8_t c) {
return vfmsh_laneq_f16(a, b, c, 7);
}
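A note on the regenerated checks above: every vfms variant computes a fused multiply-subtract, r = a - b * c per lane, which clang lowers as fma(-b, c, a). That is why each updated block is now just an fneg, a lane splat, and a call to @llvm.fma (or its constrained counterpart), with the old bitcast chains folded away. A minimal C sketch of the lane-indexed case, assuming an AArch64 target with +fullfp16; the helper name is illustrative and not part of the patch:

#include <arm_neon.h>

/* Reference for vfms_lane_f16(a, b, c, 3): per lane i,
   r[i] = fma(-b[i], c[3], a[i]), i.e. a[i] - b[i] * c[3]. */
float16x4_t vfms_lane3_ref(float16x4_t a, float16x4_t b, float16x4_t c) {
  return vfma_lane_f16(a, vneg_f16(b), c, 3);
}
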
diff --git a/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics-generic.c b/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics-generic.c
index 4d2ef318005bd3..b616664e395c05 100644
--- a/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics-generic.c
+++ b/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics-generic.c
@@ -1,11 +1,11 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature
// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature -fullfp16 -target-feature +v8a\
// RUN: -flax-vector-conversions=none -disable-O0-optnone -emit-llvm -o - %s \
-// RUN: | opt -S -passes=mem2reg \
+// RUN: | opt -S -passes=mem2reg,instcombine \
// RUN: | FileCheck %s
// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +fullfp16 -target-feature +v8.2a\
// RUN: -flax-vector-conversions=none -disable-O0-optnone -emit-llvm -o - %s \
-// RUN: | opt -S -passes=mem2reg \
+// RUN: | opt -S -passes=mem2reg,instcombine \
// RUN: | FileCheck %s
// REQUIRES: aarch64-registered-target
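The pipeline change above, adding instcombine after mem2reg, is what drives the large hunk deltas below: the regenerated checks lose the redundant bitcast round-trips, the alloca/store/load traffic for the struct-returning vzip/vuzp/vtrn intrinsics, and the element-by-element splats, all folded into direct shufflevector and insertvalue forms. The vbsl checks below, for instance, still pin down the usual bitwise-select identity on the lane bits; a short C reference of those semantics, using standard arm_neon.h intrinsics for illustration only, not as part of the patch:

#include <arm_neon.h>

/* vbsl_f16(a, b, c) selects bits: (a & b) | (~a & c), computed on the
   16-bit lane patterns, which is why the checks bitcast the half
   vectors to <4 x i16> before the and/xor/or sequence. */
uint16x4_t bsl_bits_ref(uint16x4_t a, uint16x4_t b, uint16x4_t c) {
  return vorr_u16(vand_u16(a, b), vand_u16(vmvn_u16(a), c));
}
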
@@ -15,17 +15,14 @@
// CHECK-LABEL: define {{[^@]+}}@test_vbsl_f16
// CHECK-SAME: (<4 x i16> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[C]] to <8 x i8>
-// CHECK-NEXT: [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK-NEXT: [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK-NEXT: [[VBSL1_I:%.*]] = bitcast <4 x half> [[B]] to <4 x i16>
+// CHECK-NEXT: [[VBSL2_I:%.*]] = bitcast <4 x half> [[C]] to <4 x i16>
// CHECK-NEXT: [[VBSL3_I:%.*]] = and <4 x i16> [[A]], [[VBSL1_I]]
-// CHECK-NEXT: [[TMP3:%.*]] = xor <4 x i16> [[A]], splat (i16 -1)
-// CHECK-NEXT: [[VBSL4_I:%.*]] = and <4 x i16> [[TMP3]], [[VBSL2_I]]
-// CHECK-NEXT: [[VBSL5_I:%.*]] = or <4 x i16> [[VBSL3_I]], [[VBSL4_I]]
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[VBSL5_I]] to <4 x half>
-// CHECK-NEXT: ret <4 x half> [[TMP4]]
+// CHECK-NEXT: [[TMP0:%.*]] = xor <4 x i16> [[A]], splat (i16 -1)
+// CHECK-NEXT: [[VBSL4_I:%.*]] = and <4 x i16> [[TMP0]], [[VBSL2_I]]
+// CHECK-NEXT: [[VBSL5_I:%.*]] = or disjoint <4 x i16> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VBSL5_I]] to <4 x half>
+// CHECK-NEXT: ret <4 x half> [[TMP1]]
//
float16x4_t test_vbsl_f16(uint16x4_t a, float16x4_t b, float16x4_t c) {
return vbsl_f16(a, b, c);
@@ -34,17 +31,14 @@ float16x4_t test_vbsl_f16(uint16x4_t a, float16x4_t b, float16x4_t c) {
// CHECK-LABEL: define {{[^@]+}}@test_vbslq_f16
// CHECK-SAME: (<8 x i16> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[C]] to <16 x i8>
-// CHECK-NEXT: [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK-NEXT: [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK-NEXT: [[VBSL1_I:%.*]] = bitcast <8 x half> [[B]] to <8 x i16>
+// CHECK-NEXT: [[VBSL2_I:%.*]] = bitcast <8 x half> [[C]] to <8 x i16>
// CHECK-NEXT: [[VBSL3_I:%.*]] = and <8 x i16> [[A]], [[VBSL1_I]]
-// CHECK-NEXT: [[TMP3:%.*]] = xor <8 x i16> [[A]], splat (i16 -1)
-// CHECK-NEXT: [[VBSL4_I:%.*]] = and <8 x i16> [[TMP3]], [[VBSL2_I]]
-// CHECK-NEXT: [[VBSL5_I:%.*]] = or <8 x i16> [[VBSL3_I]], [[VBSL4_I]]
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[VBSL5_I]] to <8 x half>
-// CHECK-NEXT: ret <8 x half> [[TMP4]]
+// CHECK-NEXT: [[TMP0:%.*]] = xor <8 x i16> [[A]], splat (i16 -1)
+// CHECK-NEXT: [[VBSL4_I:%.*]] = and <8 x i16> [[TMP0]], [[VBSL2_I]]
+// CHECK-NEXT: [[VBSL5_I:%.*]] = or disjoint <8 x i16> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[VBSL5_I]] to <8 x half>
+// CHECK-NEXT: ret <8 x half> [[TMP1]]
//
float16x8_t test_vbslq_f16(uint16x8_t a, float16x8_t b, float16x8_t c) {
return vbslq_f16(a, b, c);
@@ -53,21 +47,12 @@ float16x8_t test_vbslq_f16(uint16x8_t a, float16x8_t b, float16x8_t c) {
// CHECK-LABEL: define {{[^@]+}}@test_vzip_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[RETVAL_I:%.*]] = alloca [[STRUCT_FLOAT16X4X2_T:%.*]], align 8
-// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT16X4X2_T]], align 8
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[B]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
-// CHECK-NEXT: store <4 x half> [[VZIP_I]], ptr [[RETVAL_I]], align 8
-// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <4 x half>, ptr [[RETVAL_I]], i32 1
// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[B]], <4 x i32> <i32 2, i32 6, i32 3, i32 7>
-// CHECK-NEXT: store <4 x half> [[VZIP1_I]], ptr [[TMP2]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = load [[STRUCT_FLOAT16X4X2_T]], ptr [[RETVAL_I]], align 8
-// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X4X2_T]], ptr [[RETVAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP5:%.*]] = extractvalue [[STRUCT_FLOAT16X4X2_T]] [[TMP3]], 0
-// CHECK-NEXT: store [2 x <4 x half>] [[TMP5]], ptr [[TMP4]], align 8
-// CHECK-NEXT: [[TMP6:%.*]] = load [[STRUCT_FLOAT16X4X2_T]], ptr [[RETVAL]], align 8
-// CHECK-NEXT: ret [[STRUCT_FLOAT16X4X2_T]] [[TMP6]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <4 x half>] poison, <4 x half> [[VZIP_I]], 0
+// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <4 x half>] [[TMP0]], <4 x half> [[VZIP1_I]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_FLOAT16X4X2_T:%.*]] poison, [2 x <4 x half>] [[DOTUNPACK8]], 0
+// CHECK-NEXT: ret [[STRUCT_FLOAT16X4X2_T]] [[TMP1]]
//
float16x4x2_t test_vzip_f16(float16x4_t a, float16x4_t b) {
return vzip_f16(a, b);
@@ -76,21 +61,12 @@ float16x4x2_t test_vzip_f16(float16x4_t a, float16x4_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vzipq_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[RETVAL_I:%.*]] = alloca [[STRUCT_FLOAT16X8X2_T:%.*]], align 16
-// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT16X8X2_T]], align 16
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[B]], <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
-// CHECK-NEXT: store <8 x half> [[VZIP_I]], ptr [[RETVAL_I]], align 16
-// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <8 x half>, ptr [[RETVAL_I]], i32 1
// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[B]], <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-// CHECK-NEXT: store <8 x half> [[VZIP1_I]], ptr [[TMP2]], align 16
-// CHECK-NEXT: [[TMP3:%.*]] = load [[STRUCT_FLOAT16X8X2_T]], ptr [[RETVAL_I]], align 16
-// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X8X2_T]], ptr [[RETVAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP5:%.*]] = extractvalue [[STRUCT_FLOAT16X8X2_T]] [[TMP3]], 0
-// CHECK-NEXT: store [2 x <8 x half>] [[TMP5]], ptr [[TMP4]], align 16
-// CHECK-NEXT: [[TMP6:%.*]] = load [[STRUCT_FLOAT16X8X2_T]], ptr [[RETVAL]], align 16
-// CHECK-NEXT: ret [[STRUCT_FLOAT16X8X2_T]] [[TMP6]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <8 x half>] poison, <8 x half> [[VZIP_I]], 0
+// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <8 x half>] [[TMP0]], <8 x half> [[VZIP1_I]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_FLOAT16X8X2_T:%.*]] poison, [2 x <8 x half>] [[DOTUNPACK8]], 0
+// CHECK-NEXT: ret [[STRUCT_FLOAT16X8X2_T]] [[TMP1]]
//
float16x8x2_t test_vzipq_f16(float16x8_t a, float16x8_t b) {
return vzipq_f16(a, b);
@@ -99,21 +75,12 @@ float16x8x2_t test_vzipq_f16(float16x8_t a, float16x8_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vuzp_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[RETVAL_I:%.*]] = alloca [[STRUCT_FLOAT16X4X2_T:%.*]], align 8
-// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT16X4X2_T]], align 8
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[B]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-// CHECK-NEXT: store <4 x half> [[VUZP_I]], ptr [[RETVAL_I]], align 8
-// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <4 x half>, ptr [[RETVAL_I]], i32 1
// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-// CHECK-NEXT: store <4 x half> [[VUZP1_I]], ptr [[TMP2]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = load [[STRUCT_FLOAT16X4X2_T]], ptr [[RETVAL_I]], align 8
-// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X4X2_T]], ptr [[RETVAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP5:%.*]] = extractvalue [[STRUCT_FLOAT16X4X2_T]] [[TMP3]], 0
-// CHECK-NEXT: store [2 x <4 x half>] [[TMP5]], ptr [[TMP4]], align 8
-// CHECK-NEXT: [[TMP6:%.*]] = load [[STRUCT_FLOAT16X4X2_T]], ptr [[RETVAL]], align 8
-// CHECK-NEXT: ret [[STRUCT_FLOAT16X4X2_T]] [[TMP6]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <4 x half>] poison, <4 x half> [[VUZP_I]], 0
+// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <4 x half>] [[TMP0]], <4 x half> [[VUZP1_I]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_FLOAT16X4X2_T:%.*]] poison, [2 x <4 x half>] [[DOTUNPACK8]], 0
+// CHECK-NEXT: ret [[STRUCT_FLOAT16X4X2_T]] [[TMP1]]
//
float16x4x2_t test_vuzp_f16(float16x4_t a, float16x4_t b) {
return vuzp_f16(a, b);
@@ -122,21 +89,12 @@ float16x4x2_t test_vuzp_f16(float16x4_t a, float16x4_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vuzpq_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[RETVAL_I:%.*]] = alloca [[STRUCT_FLOAT16X8X2_T:%.*]], align 16
-// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT16X8X2_T]], align 16
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[B]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-// CHECK-NEXT: store <8 x half> [[VUZP_I]], ptr [[RETVAL_I]], align 16
-// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <8 x half>, ptr [[RETVAL_I]], i32 1
// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-// CHECK-NEXT: store <8 x half> [[VUZP1_I]], ptr [[TMP2]], align 16
-// CHECK-NEXT: [[TMP3:%.*]] = load [[STRUCT_FLOAT16X8X2_T]], ptr [[RETVAL_I]], align 16
-// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X8X2_T]], ptr [[RETVAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP5:%.*]] = extractvalue [[STRUCT_FLOAT16X8X2_T]] [[TMP3]], 0
-// CHECK-NEXT: store [2 x <8 x half>] [[TMP5]], ptr [[TMP4]], align 16
-// CHECK-NEXT: [[TMP6:%.*]] = load [[STRUCT_FLOAT16X8X2_T]], ptr [[RETVAL]], align 16
-// CHECK-NEXT: ret [[STRUCT_FLOAT16X8X2_T]] [[TMP6]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <8 x half>] poison, <8 x half> [[VUZP_I]], 0
+// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <8 x half>] [[TMP0]], <8 x half> [[VUZP1_I]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_FLOAT16X8X2_T:%.*]] poison, [2 x <8 x half>] [[DOTUNPACK8]], 0
+// CHECK-NEXT: ret [[STRUCT_FLOAT16X8X2_T]] [[TMP1]]
//
float16x8x2_t test_vuzpq_f16(float16x8_t a, float16x8_t b) {
return vuzpq_f16(a, b);
@@ -145,21 +103,12 @@ float16x8x2_t test_vuzpq_f16(float16x8_t a, float16x8_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vtrn_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[RETVAL_I:%.*]] = alloca [[STRUCT_FLOAT16X4X2_T:%.*]], align 8
-// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT16X4X2_T]], align 8
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[B]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-// CHECK-NEXT: store <4 x half> [[VTRN_I]], ptr [[RETVAL_I]], align 8
-// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <4 x half>, ptr [[RETVAL_I]], i32 1
// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-// CHECK-NEXT: store <4 x half> [[VTRN1_I]], ptr [[TMP2]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = load [[STRUCT_FLOAT16X4X2_T]], ptr [[RETVAL_I]], align 8
-// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X4X2_T]], ptr [[RETVAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP5:%.*]] = extractvalue [[STRUCT_FLOAT16X4X2_T]] [[TMP3]], 0
-// CHECK-NEXT: store [2 x <4 x half>] [[TMP5]], ptr [[TMP4]], align 8
-// CHECK-NEXT: [[TMP6:%.*]] = load [[STRUCT_FLOAT16X4X2_T]], ptr [[RETVAL]], align 8
-// CHECK-NEXT: ret [[STRUCT_FLOAT16X4X2_T]] [[TMP6]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <4 x half>] poison, <4 x half> [[VTRN_I]], 0
+// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <4 x half>] [[TMP0]], <4 x half> [[VTRN1_I]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_FLOAT16X4X2_T:%.*]] poison, [2 x <4 x half>] [[DOTUNPACK8]], 0
+// CHECK-NEXT: ret [[STRUCT_FLOAT16X4X2_T]] [[TMP1]]
//
float16x4x2_t test_vtrn_f16(float16x4_t a, float16x4_t b) {
return vtrn_f16(a, b);
@@ -168,21 +117,12 @@ float16x4x2_t test_vtrn_f16(float16x4_t a, float16x4_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vtrnq_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[RETVAL_I:%.*]] = alloca [[STRUCT_FLOAT16X8X2_T:%.*]], align 16
-// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT16X8X2_T]], align 16
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[B]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-// CHECK-NEXT: store <8 x half> [[VTRN_I]], ptr [[RETVAL_I]], align 16
-// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <8 x half>, ptr [[RETVAL_I]], i32 1
// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[B]], <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-// CHECK-NEXT: store <8 x half> [[VTRN1_I]], ptr [[TMP2]], align 16
-// CHECK-NEXT: [[TMP3:%.*]] = load [[STRUCT_FLOAT16X8X2_T]], ptr [[RETVAL_I]], align 16
-// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X8X2_T]], ptr [[RETVAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP5:%.*]] = extractvalue [[STRUCT_FLOAT16X8X2_T]] [[TMP3]], 0
-// CHECK-NEXT: store [2 x <8 x half>] [[TMP5]], ptr [[TMP4]], align 16
-// CHECK-NEXT: [[TMP6:%.*]] = load [[STRUCT_FLOAT16X8X2_T]], ptr [[RETVAL]], align 16
-// CHECK-NEXT: ret [[STRUCT_FLOAT16X8X2_T]] [[TMP6]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <8 x half>] poison, <8 x half> [[VTRN_I]], 0
+// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <8 x half>] [[TMP0]], <8 x half> [[VTRN1_I]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_FLOAT16X8X2_T:%.*]] poison, [2 x <8 x half>] [[DOTUNPACK8]], 0
+// CHECK-NEXT: ret [[STRUCT_FLOAT16X8X2_T]] [[TMP1]]
//
float16x8x2_t test_vtrnq_f16(float16x8_t a, float16x8_t b) {
return vtrnq_f16(a, b);
@@ -191,10 +131,8 @@ float16x8x2_t test_vtrnq_f16(float16x8_t a, float16x8_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vmov_n_f16
// CHECK-SAME: (half noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[A]], i32 0
-// CHECK-NEXT: [[VECINIT1:%.*]] = insertelement <4 x half> [[VECINIT]], half [[A]], i32 1
-// CHECK-NEXT: [[VECINIT2:%.*]] = insertelement <4 x half> [[VECINIT1]], half [[A]], i32 2
-// CHECK-NEXT: [[VECINIT3:%.*]] = insertelement <4 x half> [[VECINIT2]], half [[A]], i32 3
+// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[A]], i64 0
+// CHECK-NEXT: [[VECINIT3:%.*]] = shufflevector <4 x half> [[VECINIT]], <4 x half> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: ret <4 x half> [[VECINIT3]]
//
float16x4_t test_vmov_n_f16(float16_t a) {
@@ -204,14 +142,8 @@ float16x4_t test_vmov_n_f16(float16_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vmovq_n_f16
// CHECK-SAME: (half noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[A]], i32 0
-// CHECK-NEXT: [[VECINIT1:%.*]] = insertelement <8 x half> [[VECINIT]], half [[A]], i32 1
-// CHECK-NEXT: [[VECINIT2:%.*]] = insertelement <8 x half> [[VECINIT1]], half [[A]], i32 2
-// CHECK-NEXT: [[VECINIT3:%.*]] = insertelement <8 x half> [[VECINIT2]], half [[A]], i32 3
-// CHECK-NEXT: [[VECINIT4:%.*]] = insertelement <8 x half> [[VECINIT3]], half [[A]], i32 4
-// CHECK-NEXT: [[VECINIT5:%.*]] = insertelement <8 x half> [[VECINIT4]], half [[A]], i32 5
-// CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <8 x half> [[VECINIT5]], half [[A]], i32 6
-// CHECK-NEXT: [[VECINIT7:%.*]] = insertelement <8 x half> [[VECINIT6]], half [[A]], i32 7
+// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[A]], i64 0
+// CHECK-NEXT: [[VECINIT7:%.*]] = shufflevector <8 x half> [[VECINIT]], <8 x half> poison, <8 x i32> zeroinitializer
// CHECK-NEXT: ret <8 x half> [[VECINIT7]]
//
float16x8_t test_vmovq_n_f16(float16_t a) {
@@ -221,10 +153,8 @@ float16x8_t test_vmovq_n_f16(float16_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vdup_n_f16
// CHECK-SAME: (half noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[A]], i32 0
-// CHECK-NEXT: [[VECINIT1:%.*]] = insertelement <4 x half> [[VECINIT]], half [[A]], i32 1
-// CHECK-NEXT: [[VECINIT2:%.*]] = insertelement <4 x half> [[VECINIT1]], half [[A]], i32 2
-// CHECK-NEXT: [[VECINIT3:%.*]] = insertelement <4 x half> [[VECINIT2]], half [[A]], i32 3
+// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[A]], i64 0
+// CHECK-NEXT: [[VECINIT3:%.*]] = shufflevector <4 x half> [[VECINIT]], <4 x half> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: ret <4 x half> [[VECINIT3]]
//
float16x4_t test_vdup_n_f16(float16_t a) {
@@ -234,14 +164,8 @@ float16x4_t test_vdup_n_f16(float16_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vdupq_n_f16
// CHECK-SAME: (half noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[A]], i32 0
-// CHECK-NEXT: [[VECINIT1:%.*]] = insertelement <8 x half> [[VECINIT]], half [[A]], i32 1
-// CHECK-NEXT: [[VECINIT2:%.*]] = insertelement <8 x half> [[VECINIT1]], half [[A]], i32 2
-// CHECK-NEXT: [[VECINIT3:%.*]] = insertelement <8 x half> [[VECINIT2]], half [[A]], i32 3
-// CHECK-NEXT: [[VECINIT4:%.*]] = insertelement <8 x half> [[VECINIT3]], half [[A]], i32 4
-// CHECK-NEXT: [[VECINIT5:%.*]] = insertelement <8 x half> [[VECINIT4]], half [[A]], i32 5
-// CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <8 x half> [[VECINIT5]], half [[A]], i32 6
-// CHECK-NEXT: [[VECINIT7:%.*]] = insertelement <8 x half> [[VECINIT6]], half [[A]], i32 7
+// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[A]], i64 0
+// CHECK-NEXT: [[VECINIT7:%.*]] = shufflevector <8 x half> [[VECINIT]], <8 x half> poison, <8 x i32> zeroinitializer
// CHECK-NEXT: ret <8 x half> [[VECINIT7]]
//
float16x8_t test_vdupq_n_f16(float16_t a) {
@@ -251,9 +175,7 @@ float16x8_t test_vdupq_n_f16(float16_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vdup_lane_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[A]], <4 x half> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: ret <4 x half> [[LANE]]
//
float16x4_t test_vdup_lane_f16(float16x4_t a) {
@@ -263,9 +185,7 @@ float16x4_t test_vdup_lane_f16(float16x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vdupq_lane_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[A]], <4 x half> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: ret <8 x half> [[LANE]]
//
float16x8_t test_vdupq_lane_f16(float16x4_t a) {
@@ -275,9 +195,7 @@ float16x8_t test_vdupq_lane_f16(float16x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vdup_laneq_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK-NEXT: ret <4 x half> [[LANE]]
//
float16x4_t test_vdup_laneq_f16(float16x8_t a) {
@@ -287,9 +205,7 @@ float16x4_t test_vdup_laneq_f16(float16x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vdupq_laneq_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> [[TMP1]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: ret <8 x half> [[LANE]]
//
float16x8_t test_vdupq_laneq_f16(float16x8_t a) {
@@ -299,11 +215,7 @@ float16x8_t test_vdupq_laneq_f16(float16x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vext_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
-// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <4 x half> [[TMP2]], <4 x half> [[TMP3]], <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[B]], <4 x i32> <i32 2, i32 3, i32 4, i32 5>
// CHECK-NEXT: ret <4 x half> [[VEXT]]
//
float16x4_t test_vext_f16(float16x4_t a, float16x4_t b) {
@@ -313,11 +225,7 @@ float16x4_t test_vext_f16(float16x4_t a, float16x4_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vextq_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
-// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <8 x half> [[TMP2]], <8 x half> [[TMP3]], <8 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12>
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[B]], <8 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12>
// CHECK-NEXT: ret <8 x half> [[VEXT]]
//
float16x8_t test_vextq_f16(float16x8_t a, float16x8_t b) {
@@ -327,7 +235,7 @@ float16x8_t test_vextq_f16(float16x8_t a, float16x8_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vrev64_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[A]], <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
// CHECK-NEXT: ret <4 x half> [[SHUFFLE_I]]
//
float16x4_t test_vrev64_f16(float16x4_t a) {
@@ -337,7 +245,7 @@ float16x4_t test_vrev64_f16(float16x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vrev64q_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[A]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
// CHECK-NEXT: ret <8 x half> [[SHUFFLE_I]]
//
float16x8_t test_vrev64q_f16(float16x8_t a) {
@@ -467,7 +375,7 @@ float16x8_t test_vtrn2q_f16(float16x8_t a, float16x8_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vduph_laneq_f16
// CHECK-SAME: (<8 x half> noundef [[VEC:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x half> [[VEC]], i32 7
+// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x half> [[VEC]], i64 7
// CHECK-NEXT: ret half [[VGETQ_LANE]]
//
float16_t test_vduph_laneq_f16(float16x8_t vec) {
@@ -477,7 +385,7 @@ float16_t test_vduph_laneq_f16(float16x8_t vec) {
// CHECK-LABEL: define {{[^@]+}}@test_vduph_lane_f16
// CHECK-SAME: (<4 x half> noundef [[VEC:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x half> [[VEC]], i32 3
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x half> [[VEC]], i64 3
// CHECK-NEXT: ret half [[VGET_LANE]]
//
float16_t test_vduph_lane_f16(float16x4_t vec) {
diff --git a/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics.c b/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics.c
index 1cce977b60e6b4..ddde409637d58c 100644
--- a/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics.c
+++ b/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics.c
@@ -1,7 +1,7 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature
// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +fullfp16 -target-feature +v8.2a\
// RUN: -flax-vector-conversions=none -disable-O0-optnone -emit-llvm -o - %s \
-// RUN: | opt -S -passes=mem2reg \
+// RUN: | opt -S -passes=mem2reg,instcombine \
// RUN: | FileCheck %s
// REQUIRES: aarch64-registered-target
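The same instcombine addition applies here; the hunks below mostly just drop dead bitcasts from the autogenerated checks. One family worth keeping straight while reading them is the conversions: plain vcvt_s16_f16 truncates toward zero (fcvtzs), while vcvta/vcvtm/vcvtn/vcvtp round to nearest ties-away, toward minus infinity, to nearest ties-even, and toward plus infinity, respectively. A tiny C illustration, with the comment values assuming a lane holding 2.5:

#include <arm_neon.h>

/* For a lane equal to 2.5: vcvt -> 2 (truncate), vcvta -> 3 (ties
   away), vcvtm -> 2 (floor), vcvtn -> 2 (nearest even), vcvtp -> 3
   (ceiling). */
int16x4_t trunc_ref(float16x4_t a) {
  return vcvt_s16_f16(a);
}
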
@@ -11,7 +11,6 @@
// CHECK-LABEL: define {{[^@]+}}@test_vabs_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[VABS1_I:%.*]] = call <4 x half> @llvm.fabs.v4f16(<4 x half> [[A]])
// CHECK-NEXT: ret <4 x half> [[VABS1_I]]
//
@@ -22,7 +21,6 @@ float16x4_t test_vabs_f16(float16x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vabsq_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[VABS1_I:%.*]] = call <8 x half> @llvm.fabs.v8f16(<8 x half> [[A]])
// CHECK-NEXT: ret <8 x half> [[VABS1_I]]
//
@@ -33,9 +31,8 @@ float16x8_t test_vabsq_f16(float16x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vceqz_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = fcmp oeq <4 x half> [[A]], zeroinitializer
-// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[TMP0:%.*]] = fcmp oeq <4 x half> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext <4 x i1> [[TMP0]] to <4 x i16>
// CHECK-NEXT: ret <4 x i16> [[VCEQZ_I]]
//
uint16x4_t test_vceqz_f16(float16x4_t a) {
@@ -45,9 +42,8 @@ uint16x4_t test_vceqz_f16(float16x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vceqzq_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = fcmp oeq <8 x half> [[A]], zeroinitializer
-// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[TMP0:%.*]] = fcmp oeq <8 x half> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i16>
// CHECK-NEXT: ret <8 x i16> [[VCEQZ_I]]
//
uint16x8_t test_vceqzq_f16(float16x8_t a) {
@@ -57,9 +53,8 @@ uint16x8_t test_vceqzq_f16(float16x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcgez_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = fcmp oge <4 x half> [[A]], zeroinitializer
-// CHECK-NEXT: [[VCGEZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[TMP0:%.*]] = fcmp oge <4 x half> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCGEZ_I:%.*]] = sext <4 x i1> [[TMP0]] to <4 x i16>
// CHECK-NEXT: ret <4 x i16> [[VCGEZ_I]]
//
uint16x4_t test_vcgez_f16(float16x4_t a) {
@@ -69,9 +64,8 @@ uint16x4_t test_vcgez_f16(float16x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcgezq_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = fcmp oge <8 x half> [[A]], zeroinitializer
-// CHECK-NEXT: [[VCGEZ_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[TMP0:%.*]] = fcmp oge <8 x half> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCGEZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i16>
// CHECK-NEXT: ret <8 x i16> [[VCGEZ_I]]
//
uint16x8_t test_vcgezq_f16(float16x8_t a) {
@@ -81,9 +75,8 @@ uint16x8_t test_vcgezq_f16(float16x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcgtz_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = fcmp ogt <4 x half> [[A]], zeroinitializer
-// CHECK-NEXT: [[VCGTZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[TMP0:%.*]] = fcmp ogt <4 x half> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCGTZ_I:%.*]] = sext <4 x i1> [[TMP0]] to <4 x i16>
// CHECK-NEXT: ret <4 x i16> [[VCGTZ_I]]
//
uint16x4_t test_vcgtz_f16(float16x4_t a) {
@@ -93,9 +86,8 @@ uint16x4_t test_vcgtz_f16(float16x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcgtzq_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = fcmp ogt <8 x half> [[A]], zeroinitializer
-// CHECK-NEXT: [[VCGTZ_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[TMP0:%.*]] = fcmp ogt <8 x half> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCGTZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i16>
// CHECK-NEXT: ret <8 x i16> [[VCGTZ_I]]
//
uint16x8_t test_vcgtzq_f16(float16x8_t a) {
@@ -105,9 +97,8 @@ uint16x8_t test_vcgtzq_f16(float16x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vclez_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = fcmp ole <4 x half> [[A]], zeroinitializer
-// CHECK-NEXT: [[VCLEZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[TMP0:%.*]] = fcmp ole <4 x half> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCLEZ_I:%.*]] = sext <4 x i1> [[TMP0]] to <4 x i16>
// CHECK-NEXT: ret <4 x i16> [[VCLEZ_I]]
//
uint16x4_t test_vclez_f16(float16x4_t a) {
@@ -117,9 +108,8 @@ uint16x4_t test_vclez_f16(float16x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vclezq_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = fcmp ole <8 x half> [[A]], zeroinitializer
-// CHECK-NEXT: [[VCLEZ_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[TMP0:%.*]] = fcmp ole <8 x half> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCLEZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i16>
// CHECK-NEXT: ret <8 x i16> [[VCLEZ_I]]
//
uint16x8_t test_vclezq_f16(float16x8_t a) {
@@ -129,9 +119,8 @@ uint16x8_t test_vclezq_f16(float16x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcltz_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = fcmp olt <4 x half> [[A]], zeroinitializer
-// CHECK-NEXT: [[VCLTZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[TMP0:%.*]] = fcmp olt <4 x half> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCLTZ_I:%.*]] = sext <4 x i1> [[TMP0]] to <4 x i16>
// CHECK-NEXT: ret <4 x i16> [[VCLTZ_I]]
//
uint16x4_t test_vcltz_f16(float16x4_t a) {
@@ -141,9 +130,8 @@ uint16x4_t test_vcltz_f16(float16x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcltzq_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = fcmp olt <8 x half> [[A]], zeroinitializer
-// CHECK-NEXT: [[VCLTZ_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[TMP0:%.*]] = fcmp olt <8 x half> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCLTZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i16>
// CHECK-NEXT: ret <8 x i16> [[VCLTZ_I]]
//
uint16x8_t test_vcltzq_f16(float16x8_t a) {
@@ -153,7 +141,6 @@ uint16x8_t test_vcltzq_f16(float16x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvt_f16_s16
// CHECK-SAME: (<4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
// CHECK-NEXT: [[VCVT_I:%.*]] = sitofp <4 x i16> [[A]] to <4 x half>
// CHECK-NEXT: ret <4 x half> [[VCVT_I]]
//
@@ -164,7 +151,6 @@ float16x4_t test_vcvt_f16_s16 (int16x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvtq_f16_s16
// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
// CHECK-NEXT: [[VCVT_I:%.*]] = sitofp <8 x i16> [[A]] to <8 x half>
// CHECK-NEXT: ret <8 x half> [[VCVT_I]]
//
@@ -175,7 +161,6 @@ float16x8_t test_vcvtq_f16_s16 (int16x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvt_f16_u16
// CHECK-SAME: (<4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
// CHECK-NEXT: [[VCVT_I:%.*]] = uitofp <4 x i16> [[A]] to <4 x half>
// CHECK-NEXT: ret <4 x half> [[VCVT_I]]
//
@@ -186,7 +171,6 @@ float16x4_t test_vcvt_f16_u16 (uint16x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvtq_f16_u16
// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
// CHECK-NEXT: [[VCVT_I:%.*]] = uitofp <8 x i16> [[A]] to <8 x half>
// CHECK-NEXT: ret <8 x half> [[VCVT_I]]
//
@@ -197,7 +181,6 @@ float16x8_t test_vcvtq_f16_u16 (uint16x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvt_s16_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[VCVTZ1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtzs.v4i16.v4f16(<4 x half> [[A]])
// CHECK-NEXT: ret <4 x i16> [[VCVTZ1_I]]
//
@@ -208,7 +191,6 @@ int16x4_t test_vcvt_s16_f16 (float16x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvtq_s16_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[VCVTZ1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtzs.v8i16.v8f16(<8 x half> [[A]])
// CHECK-NEXT: ret <8 x i16> [[VCVTZ1_I]]
//
@@ -219,7 +201,6 @@ int16x8_t test_vcvtq_s16_f16 (float16x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvt_u16_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[VCVTZ1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtzu.v4i16.v4f16(<4 x half> [[A]])
// CHECK-NEXT: ret <4 x i16> [[VCVTZ1_I]]
//
@@ -230,7 +211,6 @@ uint16x4_t test_vcvt_u16_f16 (float16x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvtq_u16_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[VCVTZ1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtzu.v8i16.v8f16(<8 x half> [[A]])
// CHECK-NEXT: ret <8 x i16> [[VCVTZ1_I]]
//
@@ -241,7 +221,6 @@ uint16x8_t test_vcvtq_u16_f16 (float16x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvta_s16_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[VCVTA1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtas.v4i16.v4f16(<4 x half> [[A]])
// CHECK-NEXT: ret <4 x i16> [[VCVTA1_I]]
//
@@ -252,7 +231,6 @@ int16x4_t test_vcvta_s16_f16 (float16x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvta_u16_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[VCVTA1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtau.v4i16.v4f16(<4 x half> [[A]])
// CHECK-NEXT: ret <4 x i16> [[VCVTA1_I]]
//
@@ -263,7 +241,6 @@ uint16x4_t test_vcvta_u16_f16 (float16x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvtaq_s16_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[VCVTA1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtas.v8i16.v8f16(<8 x half> [[A]])
// CHECK-NEXT: ret <8 x i16> [[VCVTA1_I]]
//
@@ -274,7 +251,6 @@ int16x8_t test_vcvtaq_s16_f16 (float16x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvtm_s16_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[VCVTM1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtms.v4i16.v4f16(<4 x half> [[A]])
// CHECK-NEXT: ret <4 x i16> [[VCVTM1_I]]
//
@@ -285,7 +261,6 @@ int16x4_t test_vcvtm_s16_f16 (float16x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvtmq_s16_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[VCVTM1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtms.v8i16.v8f16(<8 x half> [[A]])
// CHECK-NEXT: ret <8 x i16> [[VCVTM1_I]]
//
@@ -296,7 +271,6 @@ int16x8_t test_vcvtmq_s16_f16 (float16x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvtm_u16_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[VCVTM1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtmu.v4i16.v4f16(<4 x half> [[A]])
// CHECK-NEXT: ret <4 x i16> [[VCVTM1_I]]
//
@@ -307,7 +281,6 @@ uint16x4_t test_vcvtm_u16_f16 (float16x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvtmq_u16_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[VCVTM1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtmu.v8i16.v8f16(<8 x half> [[A]])
// CHECK-NEXT: ret <8 x i16> [[VCVTM1_I]]
//
@@ -318,7 +291,6 @@ uint16x8_t test_vcvtmq_u16_f16 (float16x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvtn_s16_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[VCVTN1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtns.v4i16.v4f16(<4 x half> [[A]])
// CHECK-NEXT: ret <4 x i16> [[VCVTN1_I]]
//
@@ -329,7 +301,6 @@ int16x4_t test_vcvtn_s16_f16 (float16x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvtnq_s16_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[VCVTN1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtns.v8i16.v8f16(<8 x half> [[A]])
// CHECK-NEXT: ret <8 x i16> [[VCVTN1_I]]
//
@@ -340,7 +311,6 @@ int16x8_t test_vcvtnq_s16_f16 (float16x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvtn_u16_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[VCVTN1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtnu.v4i16.v4f16(<4 x half> [[A]])
// CHECK-NEXT: ret <4 x i16> [[VCVTN1_I]]
//
@@ -351,7 +321,6 @@ uint16x4_t test_vcvtn_u16_f16 (float16x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvtnq_u16_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[VCVTN1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtnu.v8i16.v8f16(<8 x half> [[A]])
// CHECK-NEXT: ret <8 x i16> [[VCVTN1_I]]
//
@@ -362,7 +331,6 @@ uint16x8_t test_vcvtnq_u16_f16 (float16x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvtp_s16_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[VCVTP1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtps.v4i16.v4f16(<4 x half> [[A]])
// CHECK-NEXT: ret <4 x i16> [[VCVTP1_I]]
//
@@ -373,7 +341,6 @@ int16x4_t test_vcvtp_s16_f16 (float16x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvtpq_s16_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[VCVTP1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtps.v8i16.v8f16(<8 x half> [[A]])
// CHECK-NEXT: ret <8 x i16> [[VCVTP1_I]]
//
@@ -384,7 +351,6 @@ int16x8_t test_vcvtpq_s16_f16 (float16x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvtp_u16_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[VCVTP1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtpu.v4i16.v4f16(<4 x half> [[A]])
// CHECK-NEXT: ret <4 x i16> [[VCVTP1_I]]
//
@@ -395,7 +361,6 @@ uint16x4_t test_vcvtp_u16_f16 (float16x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvtpq_u16_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[VCVTP1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtpu.v8i16.v8f16(<8 x half> [[A]])
// CHECK-NEXT: ret <8 x i16> [[VCVTP1_I]]
//
@@ -427,7 +392,6 @@ float16x8_t test_vnegq_f16(float16x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vrecpe_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[VRECPE_V1_I:%.*]] = call <4 x half> @llvm.aarch64.neon.frecpe.v4f16(<4 x half> [[A]])
// CHECK-NEXT: ret <4 x half> [[VRECPE_V1_I]]
//
@@ -438,7 +402,6 @@ float16x4_t test_vrecpe_f16(float16x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vrecpeq_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[VRECPEQ_V1_I:%.*]] = call <8 x half> @llvm.aarch64.neon.frecpe.v8f16(<8 x half> [[A]])
// CHECK-NEXT: ret <8 x half> [[VRECPEQ_V1_I]]
//
@@ -449,7 +412,6 @@ float16x8_t test_vrecpeq_f16(float16x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vrnd_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[VRNDZ1_I:%.*]] = call <4 x half> @llvm.trunc.v4f16(<4 x half> [[A]])
// CHECK-NEXT: ret <4 x half> [[VRNDZ1_I]]
//
@@ -460,7 +422,6 @@ float16x4_t test_vrnd_f16(float16x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vrndq_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[VRNDZ1_I:%.*]] = call <8 x half> @llvm.trunc.v8f16(<8 x half> [[A]])
// CHECK-NEXT: ret <8 x half> [[VRNDZ1_I]]
//
@@ -471,7 +432,6 @@ float16x8_t test_vrndq_f16(float16x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vrnda_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[VRNDA1_I:%.*]] = call <4 x half> @llvm.round.v4f16(<4 x half> [[A]])
// CHECK-NEXT: ret <4 x half> [[VRNDA1_I]]
//
@@ -482,7 +442,6 @@ float16x4_t test_vrnda_f16(float16x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vrndaq_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[VRNDA1_I:%.*]] = call <8 x half> @llvm.round.v8f16(<8 x half> [[A]])
// CHECK-NEXT: ret <8 x half> [[VRNDA1_I]]
//
@@ -493,7 +452,6 @@ float16x8_t test_vrndaq_f16(float16x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vrndi_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[VRNDI_V1_I:%.*]] = call <4 x half> @llvm.nearbyint.v4f16(<4 x half> [[A]])
// CHECK-NEXT: ret <4 x half> [[VRNDI_V1_I]]
//
@@ -504,7 +462,6 @@ float16x4_t test_vrndi_f16(float16x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vrndiq_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[VRNDIQ_V1_I:%.*]] = call <8 x half> @llvm.nearbyint.v8f16(<8 x half> [[A]])
// CHECK-NEXT: ret <8 x half> [[VRNDIQ_V1_I]]
//
@@ -515,7 +472,6 @@ float16x8_t test_vrndiq_f16(float16x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vrndm_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[VRNDM1_I:%.*]] = call <4 x half> @llvm.floor.v4f16(<4 x half> [[A]])
// CHECK-NEXT: ret <4 x half> [[VRNDM1_I]]
//
@@ -526,7 +482,6 @@ float16x4_t test_vrndm_f16(float16x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vrndmq_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[VRNDM1_I:%.*]] = call <8 x half> @llvm.floor.v8f16(<8 x half> [[A]])
// CHECK-NEXT: ret <8 x half> [[VRNDM1_I]]
//
@@ -537,7 +492,6 @@ float16x8_t test_vrndmq_f16(float16x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vrndn_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[VRNDN1_I:%.*]] = call <4 x half> @llvm.roundeven.v4f16(<4 x half> [[A]])
// CHECK-NEXT: ret <4 x half> [[VRNDN1_I]]
//
@@ -548,7 +502,6 @@ float16x4_t test_vrndn_f16(float16x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vrndnq_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[VRNDN1_I:%.*]] = call <8 x half> @llvm.roundeven.v8f16(<8 x half> [[A]])
// CHECK-NEXT: ret <8 x half> [[VRNDN1_I]]
//
@@ -559,7 +512,6 @@ float16x8_t test_vrndnq_f16(float16x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vrndp_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[VRNDP1_I:%.*]] = call <4 x half> @llvm.ceil.v4f16(<4 x half> [[A]])
// CHECK-NEXT: ret <4 x half> [[VRNDP1_I]]
//
@@ -570,7 +522,6 @@ float16x4_t test_vrndp_f16(float16x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vrndpq_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[VRNDP1_I:%.*]] = call <8 x half> @llvm.ceil.v8f16(<8 x half> [[A]])
// CHECK-NEXT: ret <8 x half> [[VRNDP1_I]]
//
@@ -581,7 +532,6 @@ float16x8_t test_vrndpq_f16(float16x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vrndx_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[VRNDX1_I:%.*]] = call <4 x half> @llvm.rint.v4f16(<4 x half> [[A]])
// CHECK-NEXT: ret <4 x half> [[VRNDX1_I]]
//
@@ -592,7 +542,6 @@ float16x4_t test_vrndx_f16(float16x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vrndxq_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[VRNDX1_I:%.*]] = call <8 x half> @llvm.rint.v8f16(<8 x half> [[A]])
// CHECK-NEXT: ret <8 x half> [[VRNDX1_I]]
//
@@ -603,7 +552,6 @@ float16x8_t test_vrndxq_f16(float16x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vrsqrte_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[VRSQRTE_V1_I:%.*]] = call <4 x half> @llvm.aarch64.neon.frsqrte.v4f16(<4 x half> [[A]])
// CHECK-NEXT: ret <4 x half> [[VRSQRTE_V1_I]]
//
@@ -614,7 +562,6 @@ float16x4_t test_vrsqrte_f16(float16x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vrsqrteq_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[VRSQRTEQ_V1_I:%.*]] = call <8 x half> @llvm.aarch64.neon.frsqrte.v8f16(<8 x half> [[A]])
// CHECK-NEXT: ret <8 x half> [[VRSQRTEQ_V1_I]]
//
@@ -625,7 +572,6 @@ float16x8_t test_vrsqrteq_f16(float16x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vsqrt_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[VSQRT_I:%.*]] = call <4 x half> @llvm.sqrt.v4f16(<4 x half> [[A]])
// CHECK-NEXT: ret <4 x half> [[VSQRT_I]]
//
@@ -636,7 +582,6 @@ float16x4_t test_vsqrt_f16(float16x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vsqrtq_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[VSQRT_I:%.*]] = call <8 x half> @llvm.sqrt.v8f16(<8 x half> [[A]])
// CHECK-NEXT: ret <8 x half> [[VSQRT_I]]
//
@@ -667,8 +612,6 @@ float16x8_t test_vaddq_f16(float16x8_t a, float16x8_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vabd_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
// CHECK-NEXT: [[VABD2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fabd.v4f16(<4 x half> [[A]], <4 x half> [[B]])
// CHECK-NEXT: ret <4 x half> [[VABD2_I]]
//
@@ -679,8 +622,6 @@ float16x4_t test_vabd_f16(float16x4_t a, float16x4_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vabdq_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
// CHECK-NEXT: [[VABD2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fabd.v8f16(<8 x half> [[A]], <8 x half> [[B]])
// CHECK-NEXT: ret <8 x half> [[VABD2_I]]
//
@@ -691,8 +632,6 @@ float16x8_t test_vabdq_f16(float16x8_t a, float16x8_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vcage_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
// CHECK-NEXT: [[VCAGE_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.facge.v4i16.v4f16(<4 x half> [[A]], <4 x half> [[B]])
// CHECK-NEXT: ret <4 x i16> [[VCAGE_V2_I]]
//
@@ -703,8 +642,6 @@ uint16x4_t test_vcage_f16(float16x4_t a, float16x4_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vcageq_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
// CHECK-NEXT: [[VCAGEQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.facge.v8i16.v8f16(<8 x half> [[A]], <8 x half> [[B]])
// CHECK-NEXT: ret <8 x i16> [[VCAGEQ_V2_I]]
//
@@ -715,8 +652,6 @@ uint16x8_t test_vcageq_f16(float16x8_t a, float16x8_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vcagt_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
// CHECK-NEXT: [[VCAGT_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.facgt.v4i16.v4f16(<4 x half> [[A]], <4 x half> [[B]])
// CHECK-NEXT: ret <4 x i16> [[VCAGT_V2_I]]
//
@@ -727,8 +662,6 @@ uint16x4_t test_vcagt_f16(float16x4_t a, float16x4_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vcagtq_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
// CHECK-NEXT: [[VCAGTQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.facgt.v8i16.v8f16(<8 x half> [[A]], <8 x half> [[B]])
// CHECK-NEXT: ret <8 x i16> [[VCAGTQ_V2_I]]
//
@@ -739,8 +672,6 @@ uint16x8_t test_vcagtq_f16(float16x8_t a, float16x8_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vcale_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
// CHECK-NEXT: [[VCALE_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.facge.v4i16.v4f16(<4 x half> [[B]], <4 x half> [[A]])
// CHECK-NEXT: ret <4 x i16> [[VCALE_V2_I]]
//
@@ -751,8 +682,6 @@ uint16x4_t test_vcale_f16(float16x4_t a, float16x4_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vcaleq_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
// CHECK-NEXT: [[VCALEQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.facge.v8i16.v8f16(<8 x half> [[B]], <8 x half> [[A]])
// CHECK-NEXT: ret <8 x i16> [[VCALEQ_V2_I]]
//
@@ -763,8 +692,6 @@ uint16x8_t test_vcaleq_f16(float16x8_t a, float16x8_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vcalt_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
// CHECK-NEXT: [[VCALT_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.facgt.v4i16.v4f16(<4 x half> [[B]], <4 x half> [[A]])
// CHECK-NEXT: ret <4 x i16> [[VCALT_V2_I]]
//
@@ -775,8 +702,6 @@ uint16x4_t test_vcalt_f16(float16x4_t a, float16x4_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vcaltq_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
// CHECK-NEXT: [[VCALTQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.facgt.v8i16.v8f16(<8 x half> [[B]], <8 x half> [[A]])
// CHECK-NEXT: ret <8 x i16> [[VCALTQ_V2_I]]
//
@@ -897,9 +822,7 @@ uint16x8_t test_vcltq_f16(float16x8_t a, float16x8_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvt_n_f16_s16
// CHECK-SAME: (<4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[VCVT_N1:%.*]] = call <4 x half> @llvm.aarch64.neon.vcvtfxs2fp.v4f16.v4i16(<4 x i16> [[VCVT_N]], i32 2)
+// CHECK-NEXT: [[VCVT_N1:%.*]] = call <4 x half> @llvm.aarch64.neon.vcvtfxs2fp.v4f16.v4i16(<4 x i16> [[A]], i32 2)
// CHECK-NEXT: ret <4 x half> [[VCVT_N1]]
//
float16x4_t test_vcvt_n_f16_s16(int16x4_t a) {
@@ -909,9 +832,7 @@ float16x4_t test_vcvt_n_f16_s16(int16x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvtq_n_f16_s16
// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[VCVT_N1:%.*]] = call <8 x half> @llvm.aarch64.neon.vcvtfxs2fp.v8f16.v8i16(<8 x i16> [[VCVT_N]], i32 2)
+// CHECK-NEXT: [[VCVT_N1:%.*]] = call <8 x half> @llvm.aarch64.neon.vcvtfxs2fp.v8f16.v8i16(<8 x i16> [[A]], i32 2)
// CHECK-NEXT: ret <8 x half> [[VCVT_N1]]
//
float16x8_t test_vcvtq_n_f16_s16(int16x8_t a) {
@@ -921,9 +842,7 @@ float16x8_t test_vcvtq_n_f16_s16(int16x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvt_n_f16_u16
// CHECK-SAME: (<4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[VCVT_N1:%.*]] = call <4 x half> @llvm.aarch64.neon.vcvtfxu2fp.v4f16.v4i16(<4 x i16> [[VCVT_N]], i32 2)
+// CHECK-NEXT: [[VCVT_N1:%.*]] = call <4 x half> @llvm.aarch64.neon.vcvtfxu2fp.v4f16.v4i16(<4 x i16> [[A]], i32 2)
// CHECK-NEXT: ret <4 x half> [[VCVT_N1]]
//
float16x4_t test_vcvt_n_f16_u16(uint16x4_t a) {
@@ -933,9 +852,7 @@ float16x4_t test_vcvt_n_f16_u16(uint16x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvtq_n_f16_u16
// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[VCVT_N1:%.*]] = call <8 x half> @llvm.aarch64.neon.vcvtfxu2fp.v8f16.v8i16(<8 x i16> [[VCVT_N]], i32 2)
+// CHECK-NEXT: [[VCVT_N1:%.*]] = call <8 x half> @llvm.aarch64.neon.vcvtfxu2fp.v8f16.v8i16(<8 x i16> [[A]], i32 2)
// CHECK-NEXT: ret <8 x half> [[VCVT_N1]]
//
float16x8_t test_vcvtq_n_f16_u16(uint16x8_t a) {
@@ -945,9 +862,7 @@ float16x8_t test_vcvtq_n_f16_u16(uint16x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvt_n_s16_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
-// CHECK-NEXT: [[VCVT_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.vcvtfp2fxs.v4i16.v4f16(<4 x half> [[VCVT_N]], i32 2)
+// CHECK-NEXT: [[VCVT_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.vcvtfp2fxs.v4i16.v4f16(<4 x half> [[A]], i32 2)
// CHECK-NEXT: ret <4 x i16> [[VCVT_N1]]
//
int16x4_t test_vcvt_n_s16_f16(float16x4_t a) {
@@ -957,9 +872,7 @@ int16x4_t test_vcvt_n_s16_f16(float16x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvtq_n_s16_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
-// CHECK-NEXT: [[VCVT_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.vcvtfp2fxs.v8i16.v8f16(<8 x half> [[VCVT_N]], i32 2)
+// CHECK-NEXT: [[VCVT_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.vcvtfp2fxs.v8i16.v8f16(<8 x half> [[A]], i32 2)
// CHECK-NEXT: ret <8 x i16> [[VCVT_N1]]
//
int16x8_t test_vcvtq_n_s16_f16(float16x8_t a) {
@@ -969,9 +882,7 @@ int16x8_t test_vcvtq_n_s16_f16(float16x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvt_n_u16_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
-// CHECK-NEXT: [[VCVT_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.vcvtfp2fxu.v4i16.v4f16(<4 x half> [[VCVT_N]], i32 2)
+// CHECK-NEXT: [[VCVT_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.vcvtfp2fxu.v4i16.v4f16(<4 x half> [[A]], i32 2)
// CHECK-NEXT: ret <4 x i16> [[VCVT_N1]]
//
uint16x4_t test_vcvt_n_u16_f16(float16x4_t a) {
@@ -981,9 +892,7 @@ uint16x4_t test_vcvt_n_u16_f16(float16x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvtq_n_u16_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
-// CHECK-NEXT: [[VCVT_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.vcvtfp2fxu.v8i16.v8f16(<8 x half> [[VCVT_N]], i32 2)
+// CHECK-NEXT: [[VCVT_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.vcvtfp2fxu.v8i16.v8f16(<8 x half> [[A]], i32 2)
// CHECK-NEXT: ret <8 x i16> [[VCVT_N1]]
//
uint16x8_t test_vcvtq_n_u16_f16(float16x8_t a) {
@@ -1013,8 +922,6 @@ float16x8_t test_vdivq_f16(float16x8_t a, float16x8_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vmax_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
// CHECK-NEXT: [[VMAX2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmax.v4f16(<4 x half> [[A]], <4 x half> [[B]])
// CHECK-NEXT: ret <4 x half> [[VMAX2_I]]
//
@@ -1025,8 +932,6 @@ float16x4_t test_vmax_f16(float16x4_t a, float16x4_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vmaxq_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
// CHECK-NEXT: [[VMAX2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmax.v8f16(<8 x half> [[A]], <8 x half> [[B]])
// CHECK-NEXT: ret <8 x half> [[VMAX2_I]]
//
@@ -1037,8 +942,6 @@ float16x8_t test_vmaxq_f16(float16x8_t a, float16x8_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vmaxnm_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
// CHECK-NEXT: [[VMAXNM2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmaxnm.v4f16(<4 x half> [[A]], <4 x half> [[B]])
// CHECK-NEXT: ret <4 x half> [[VMAXNM2_I]]
//
@@ -1049,8 +952,6 @@ float16x4_t test_vmaxnm_f16(float16x4_t a, float16x4_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vmaxnmq_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
// CHECK-NEXT: [[VMAXNM2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmaxnm.v8f16(<8 x half> [[A]], <8 x half> [[B]])
// CHECK-NEXT: ret <8 x half> [[VMAXNM2_I]]
//
@@ -1061,8 +962,6 @@ float16x8_t test_vmaxnmq_f16(float16x8_t a, float16x8_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vmin_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
// CHECK-NEXT: [[VMIN2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmin.v4f16(<4 x half> [[A]], <4 x half> [[B]])
// CHECK-NEXT: ret <4 x half> [[VMIN2_I]]
//
@@ -1073,8 +972,6 @@ float16x4_t test_vmin_f16(float16x4_t a, float16x4_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vminq_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
// CHECK-NEXT: [[VMIN2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmin.v8f16(<8 x half> [[A]], <8 x half> [[B]])
// CHECK-NEXT: ret <8 x half> [[VMIN2_I]]
//
@@ -1085,8 +982,6 @@ float16x8_t test_vminq_f16(float16x8_t a, float16x8_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vminnm_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
// CHECK-NEXT: [[VMINNM2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fminnm.v4f16(<4 x half> [[A]], <4 x half> [[B]])
// CHECK-NEXT: ret <4 x half> [[VMINNM2_I]]
//
@@ -1097,8 +992,6 @@ float16x4_t test_vminnm_f16(float16x4_t a, float16x4_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vminnmq_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
// CHECK-NEXT: [[VMINNM2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fminnm.v8f16(<8 x half> [[A]], <8 x half> [[B]])
// CHECK-NEXT: ret <8 x half> [[VMINNM2_I]]
//
@@ -1129,8 +1022,6 @@ float16x8_t test_vmulq_f16(float16x8_t a, float16x8_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vmulx_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmulx.v4f16(<4 x half> [[A]], <4 x half> [[B]])
// CHECK-NEXT: ret <4 x half> [[VMULX2_I]]
//
@@ -1141,8 +1032,6 @@ float16x4_t test_vmulx_f16(float16x4_t a, float16x4_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vmulxq_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
// CHECK-NEXT: [[VMULX2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmulx.v8f16(<8 x half> [[A]], <8 x half> [[B]])
// CHECK-NEXT: ret <8 x half> [[VMULX2_I]]
//
@@ -1153,10 +1042,7 @@ float16x8_t test_vmulxq_f16(float16x8_t a, float16x8_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vpadd_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
// CHECK-NEXT: [[VPADD_V2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.faddp.v4f16(<4 x half> [[A]], <4 x half> [[B]])
-// CHECK-NEXT: [[VPADD_V3_I:%.*]] = bitcast <4 x half> [[VPADD_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <4 x half> [[VPADD_V2_I]]
//
float16x4_t test_vpadd_f16(float16x4_t a, float16x4_t b) {
@@ -1166,10 +1052,7 @@ float16x4_t test_vpadd_f16(float16x4_t a, float16x4_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vpaddq_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
// CHECK-NEXT: [[VPADDQ_V2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.faddp.v8f16(<8 x half> [[A]], <8 x half> [[B]])
-// CHECK-NEXT: [[VPADDQ_V3_I:%.*]] = bitcast <8 x half> [[VPADDQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <8 x half> [[VPADDQ_V2_I]]
//
float16x8_t test_vpaddq_f16(float16x8_t a, float16x8_t b) {
@@ -1179,8 +1062,6 @@ float16x8_t test_vpaddq_f16(float16x8_t a, float16x8_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vpmax_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
// CHECK-NEXT: [[VPMAX2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmaxp.v4f16(<4 x half> [[A]], <4 x half> [[B]])
// CHECK-NEXT: ret <4 x half> [[VPMAX2_I]]
//
@@ -1191,8 +1072,6 @@ float16x4_t test_vpmax_f16(float16x4_t a, float16x4_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vpmaxq_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
// CHECK-NEXT: [[VPMAX2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmaxp.v8f16(<8 x half> [[A]], <8 x half> [[B]])
// CHECK-NEXT: ret <8 x half> [[VPMAX2_I]]
//
@@ -1203,8 +1082,6 @@ float16x8_t test_vpmaxq_f16(float16x8_t a, float16x8_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vpmaxnm_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
// CHECK-NEXT: [[VPMAXNM2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmaxnmp.v4f16(<4 x half> [[A]], <4 x half> [[B]])
// CHECK-NEXT: ret <4 x half> [[VPMAXNM2_I]]
//
@@ -1215,8 +1092,6 @@ float16x4_t test_vpmaxnm_f16(float16x4_t a, float16x4_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vpmaxnmq_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
// CHECK-NEXT: [[VPMAXNM2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmaxnmp.v8f16(<8 x half> [[A]], <8 x half> [[B]])
// CHECK-NEXT: ret <8 x half> [[VPMAXNM2_I]]
//
@@ -1227,8 +1102,6 @@ float16x8_t test_vpmaxnmq_f16(float16x8_t a, float16x8_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vpmin_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
// CHECK-NEXT: [[VPMIN2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fminp.v4f16(<4 x half> [[A]], <4 x half> [[B]])
// CHECK-NEXT: ret <4 x half> [[VPMIN2_I]]
//
@@ -1239,8 +1112,6 @@ float16x4_t test_vpmin_f16(float16x4_t a, float16x4_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vpminq_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
// CHECK-NEXT: [[VPMIN2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fminp.v8f16(<8 x half> [[A]], <8 x half> [[B]])
// CHECK-NEXT: ret <8 x half> [[VPMIN2_I]]
//
@@ -1251,8 +1122,6 @@ float16x8_t test_vpminq_f16(float16x8_t a, float16x8_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vpminnm_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
// CHECK-NEXT: [[VPMINNM2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fminnmp.v4f16(<4 x half> [[A]], <4 x half> [[B]])
// CHECK-NEXT: ret <4 x half> [[VPMINNM2_I]]
//
@@ -1263,8 +1132,6 @@ float16x4_t test_vpminnm_f16(float16x4_t a, float16x4_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vpminnmq_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
// CHECK-NEXT: [[VPMINNM2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fminnmp.v8f16(<8 x half> [[A]], <8 x half> [[B]])
// CHECK-NEXT: ret <8 x half> [[VPMINNM2_I]]
//
@@ -1275,10 +1142,7 @@ float16x8_t test_vpminnmq_f16(float16x8_t a, float16x8_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vrecps_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
// CHECK-NEXT: [[VRECPS_V2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.frecps.v4f16(<4 x half> [[A]], <4 x half> [[B]])
-// CHECK-NEXT: [[VRECPS_V3_I:%.*]] = bitcast <4 x half> [[VRECPS_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <4 x half> [[VRECPS_V2_I]]
//
float16x4_t test_vrecps_f16(float16x4_t a, float16x4_t b) {
@@ -1288,10 +1152,7 @@ float16x4_t test_vrecps_f16(float16x4_t a, float16x4_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vrecpsq_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
// CHECK-NEXT: [[VRECPSQ_V2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.frecps.v8f16(<8 x half> [[A]], <8 x half> [[B]])
-// CHECK-NEXT: [[VRECPSQ_V3_I:%.*]] = bitcast <8 x half> [[VRECPSQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <8 x half> [[VRECPSQ_V2_I]]
//
float16x8_t test_vrecpsq_f16(float16x8_t a, float16x8_t b) {
@@ -1301,10 +1162,7 @@ float16x8_t test_vrecpsq_f16(float16x8_t a, float16x8_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vrsqrts_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
// CHECK-NEXT: [[VRSQRTS_V2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.frsqrts.v4f16(<4 x half> [[A]], <4 x half> [[B]])
-// CHECK-NEXT: [[VRSQRTS_V3_I:%.*]] = bitcast <4 x half> [[VRSQRTS_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <4 x half> [[VRSQRTS_V2_I]]
//
float16x4_t test_vrsqrts_f16(float16x4_t a, float16x4_t b) {
@@ -1314,10 +1172,7 @@ float16x4_t test_vrsqrts_f16(float16x4_t a, float16x4_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vrsqrtsq_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
// CHECK-NEXT: [[VRSQRTSQ_V2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.frsqrts.v8f16(<8 x half> [[A]], <8 x half> [[B]])
-// CHECK-NEXT: [[VRSQRTSQ_V3_I:%.*]] = bitcast <8 x half> [[VRSQRTSQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <8 x half> [[VRSQRTSQ_V2_I]]
//
float16x8_t test_vrsqrtsq_f16(float16x8_t a, float16x8_t b) {
@@ -1347,11 +1202,8 @@ float16x8_t test_vsubq_f16(float16x8_t a, float16x8_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vfma_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[C]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[B]], <4 x half> [[C]], <4 x half> [[A]])
-// CHECK-NEXT: ret <4 x half> [[TMP3]]
+// CHECK-NEXT: [[TMP0:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[B]], <4 x half> [[C]], <4 x half> [[A]])
+// CHECK-NEXT: ret <4 x half> [[TMP0]]
//
float16x4_t test_vfma_f16(float16x4_t a, float16x4_t b, float16x4_t c) {
return vfma_f16(a, b, c);
@@ -1360,11 +1212,8 @@ float16x4_t test_vfma_f16(float16x4_t a, float16x4_t b, float16x4_t c) {
// CHECK-LABEL: define {{[^@]+}}@test_vfmaq_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[C]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[B]], <8 x half> [[C]], <8 x half> [[A]])
-// CHECK-NEXT: ret <8 x half> [[TMP3]]
+// CHECK-NEXT: [[TMP0:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[B]], <8 x half> [[C]], <8 x half> [[A]])
+// CHECK-NEXT: ret <8 x half> [[TMP0]]
//
float16x8_t test_vfmaq_f16(float16x8_t a, float16x8_t b, float16x8_t c) {
return vfmaq_f16(a, b, c);
@@ -1374,11 +1223,8 @@ float16x8_t test_vfmaq_f16(float16x8_t a, float16x8_t b, float16x8_t c) {
// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[FNEG_I:%.*]] = fneg <4 x half> [[B]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[FNEG_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[C]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[FNEG_I]], <4 x half> [[C]], <4 x half> [[A]])
-// CHECK-NEXT: ret <4 x half> [[TMP3]]
+// CHECK-NEXT: [[TMP0:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[FNEG_I]], <4 x half> [[C]], <4 x half> [[A]])
+// CHECK-NEXT: ret <4 x half> [[TMP0]]
//
float16x4_t test_vfms_f16(float16x4_t a, float16x4_t b, float16x4_t c) {
return vfms_f16(a, b, c);
@@ -1388,11 +1234,8 @@ float16x4_t test_vfms_f16(float16x4_t a, float16x4_t b, float16x4_t c) {
// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[FNEG_I:%.*]] = fneg <8 x half> [[B]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[FNEG_I]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[C]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[FNEG_I]], <8 x half> [[C]], <8 x half> [[A]])
-// CHECK-NEXT: ret <8 x half> [[TMP3]]
+// CHECK-NEXT: [[TMP0:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[FNEG_I]], <8 x half> [[C]], <8 x half> [[A]])
+// CHECK-NEXT: ret <8 x half> [[TMP0]]
//
float16x8_t test_vfmsq_f16(float16x8_t a, float16x8_t b, float16x8_t c) {
return vfmsq_f16(a, b, c);
@@ -1401,14 +1244,8 @@ float16x8_t test_vfmsq_f16(float16x8_t a, float16x8_t b, float16x8_t c) {
// CHECK-LABEL: define {{[^@]+}}@test_vfma_lane_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[C]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x half>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[TMP3]], <4 x half> [[TMP3]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
-// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
-// CHECK-NEXT: [[FMLA2:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[FMLA]], <4 x half> [[LANE]], <4 x half> [[FMLA1]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[C]], <4 x half> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[FMLA2:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[B]], <4 x half> [[LANE]], <4 x half> [[A]])
// CHECK-NEXT: ret <4 x half> [[FMLA2]]
//
float16x4_t test_vfma_lane_f16(float16x4_t a, float16x4_t b, float16x4_t c) {
@@ -1418,14 +1255,8 @@ float16x4_t test_vfma_lane_f16(float16x4_t a, float16x4_t b, float16x4_t c) {
// CHECK-LABEL: define {{[^@]+}}@test_vfmaq_lane_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[C]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x half>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[TMP3]], <4 x half> [[TMP3]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT: [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
-// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
-// CHECK-NEXT: [[FMLA2:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[FMLA]], <8 x half> [[LANE]], <8 x half> [[FMLA1]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[C]], <4 x half> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[FMLA2:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[B]], <8 x half> [[LANE]], <8 x half> [[A]])
// CHECK-NEXT: ret <8 x half> [[FMLA2]]
//
float16x8_t test_vfmaq_lane_f16(float16x8_t a, float16x8_t b, float16x4_t c) {
@@ -1435,15 +1266,9 @@ float16x8_t test_vfmaq_lane_f16(float16x8_t a, float16x8_t b, float16x4_t c) {
// CHECK-LABEL: define {{[^@]+}}@test_vfma_laneq_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[C]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x half>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[TMP5]], <8 x half> [[TMP5]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
-// CHECK-NEXT: [[TMP6:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[LANE]], <4 x half> [[TMP4]], <4 x half> [[TMP3]])
-// CHECK-NEXT: ret <4 x half> [[TMP6]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[C]], <8 x half> poison, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT: [[TMP0:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[LANE]], <4 x half> [[B]], <4 x half> [[A]])
+// CHECK-NEXT: ret <4 x half> [[TMP0]]
//
float16x4_t test_vfma_laneq_f16(float16x4_t a, float16x4_t b, float16x8_t c) {
return vfma_laneq_f16(a, b, c, 7);
@@ -1452,15 +1277,9 @@ float16x4_t test_vfma_laneq_f16(float16x4_t a, float16x4_t b, float16x8_t c) {
// CHECK-LABEL: define {{[^@]+}}@test_vfmaq_laneq_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[C]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x half>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[TMP5]], <8 x half> [[TMP5]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-// CHECK-NEXT: [[TMP6:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[LANE]], <8 x half> [[TMP4]], <8 x half> [[TMP3]])
-// CHECK-NEXT: ret <8 x half> [[TMP6]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[C]], <8 x half> poison, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT: [[TMP0:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[LANE]], <8 x half> [[B]], <8 x half> [[A]])
+// CHECK-NEXT: ret <8 x half> [[TMP0]]
//
float16x8_t test_vfmaq_laneq_f16(float16x8_t a, float16x8_t b, float16x8_t c) {
return vfmaq_laneq_f16(a, b, c, 7);
@@ -1469,15 +1288,10 @@ float16x8_t test_vfmaq_laneq_f16(float16x8_t a, float16x8_t b, float16x8_t c) {
// CHECK-LABEL: define {{[^@]+}}@test_vfma_n_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], half noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[C]], i32 0
-// CHECK-NEXT: [[VECINIT1:%.*]] = insertelement <4 x half> [[VECINIT]], half [[C]], i32 1
-// CHECK-NEXT: [[VECINIT2:%.*]] = insertelement <4 x half> [[VECINIT1]], half [[C]], i32 2
-// CHECK-NEXT: [[VECINIT3:%.*]] = insertelement <4 x half> [[VECINIT2]], half [[C]], i32 3
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[VECINIT3]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[B]], <4 x half> [[VECINIT3]], <4 x half> [[A]])
-// CHECK-NEXT: ret <4 x half> [[TMP3]]
+// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[C]], i64 0
+// CHECK-NEXT: [[VECINIT3:%.*]] = shufflevector <4 x half> [[VECINIT]], <4 x half> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[B]], <4 x half> [[VECINIT3]], <4 x half> [[A]])
+// CHECK-NEXT: ret <4 x half> [[TMP0]]
//
float16x4_t test_vfma_n_f16(float16x4_t a, float16x4_t b, float16_t c) {
return vfma_n_f16(a, b, c);
@@ -1486,19 +1300,10 @@ float16x4_t test_vfma_n_f16(float16x4_t a, float16x4_t b, float16_t c) {
// CHECK-LABEL: define {{[^@]+}}@test_vfmaq_n_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], half noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[C]], i32 0
-// CHECK-NEXT: [[VECINIT1:%.*]] = insertelement <8 x half> [[VECINIT]], half [[C]], i32 1
-// CHECK-NEXT: [[VECINIT2:%.*]] = insertelement <8 x half> [[VECINIT1]], half [[C]], i32 2
-// CHECK-NEXT: [[VECINIT3:%.*]] = insertelement <8 x half> [[VECINIT2]], half [[C]], i32 3
-// CHECK-NEXT: [[VECINIT4:%.*]] = insertelement <8 x half> [[VECINIT3]], half [[C]], i32 4
-// CHECK-NEXT: [[VECINIT5:%.*]] = insertelement <8 x half> [[VECINIT4]], half [[C]], i32 5
-// CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <8 x half> [[VECINIT5]], half [[C]], i32 6
-// CHECK-NEXT: [[VECINIT7:%.*]] = insertelement <8 x half> [[VECINIT6]], half [[C]], i32 7
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[VECINIT7]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[B]], <8 x half> [[VECINIT7]], <8 x half> [[A]])
-// CHECK-NEXT: ret <8 x half> [[TMP3]]
+// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[C]], i64 0
+// CHECK-NEXT: [[VECINIT7:%.*]] = shufflevector <8 x half> [[VECINIT]], <8 x half> poison, <8 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[B]], <8 x half> [[VECINIT7]], <8 x half> [[A]])
+// CHECK-NEXT: ret <8 x half> [[TMP0]]
//
float16x8_t test_vfmaq_n_f16(float16x8_t a, float16x8_t b, float16_t c) {
return vfmaq_n_f16(a, b, c);
@@ -1507,7 +1312,7 @@ float16x8_t test_vfmaq_n_f16(float16x8_t a, float16x8_t b, float16_t c) {
// CHECK-LABEL: define {{[^@]+}}@test_vfmah_lane_f16
// CHECK-SAME: (half noundef [[A:%.*]], half noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <4 x half> [[C]], i32 3
+// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <4 x half> [[C]], i64 3
// CHECK-NEXT: [[TMP0:%.*]] = call half @llvm.fma.f16(half [[B]], half [[EXTRACT]], half [[A]])
// CHECK-NEXT: ret half [[TMP0]]
//
@@ -1518,7 +1323,7 @@ float16_t test_vfmah_lane_f16(float16_t a, float16_t b, float16x4_t c) {
// CHECK-LABEL: define {{[^@]+}}@test_vfmah_laneq_f16
// CHECK-SAME: (half noundef [[A:%.*]], half noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <8 x half> [[C]], i32 7
+// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <8 x half> [[C]], i64 7
// CHECK-NEXT: [[TMP0:%.*]] = call half @llvm.fma.f16(half [[B]], half [[EXTRACT]], half [[A]])
// CHECK-NEXT: ret half [[TMP0]]
//
@@ -1530,14 +1335,8 @@ float16_t test_vfmah_laneq_f16(float16_t a, float16_t b, float16x8_t c) {
// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[FNEG:%.*]] = fneg <4 x half> [[B]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[FNEG]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[C]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x half>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[TMP3]], <4 x half> [[TMP3]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
-// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
-// CHECK-NEXT: [[FMLA2:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[FMLA]], <4 x half> [[LANE]], <4 x half> [[FMLA1]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[C]], <4 x half> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[FMLA2:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[FNEG]], <4 x half> [[LANE]], <4 x half> [[A]])
// CHECK-NEXT: ret <4 x half> [[FMLA2]]
//
float16x4_t test_vfms_lane_f16(float16x4_t a, float16x4_t b, float16x4_t c) {
@@ -1548,14 +1347,8 @@ float16x4_t test_vfms_lane_f16(float16x4_t a, float16x4_t b, float16x4_t c) {
// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[FNEG:%.*]] = fneg <8 x half> [[B]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[FNEG]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[C]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x half>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[TMP3]], <4 x half> [[TMP3]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT: [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
-// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
-// CHECK-NEXT: [[FMLA2:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[FMLA]], <8 x half> [[LANE]], <8 x half> [[FMLA1]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[C]], <4 x half> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[FMLA2:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[FNEG]], <8 x half> [[LANE]], <8 x half> [[A]])
// CHECK-NEXT: ret <8 x half> [[FMLA2]]
//
float16x8_t test_vfmsq_lane_f16(float16x8_t a, float16x8_t b, float16x4_t c) {
@@ -1566,15 +1359,9 @@ float16x8_t test_vfmsq_lane_f16(float16x8_t a, float16x8_t b, float16x4_t c) {
// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[FNEG:%.*]] = fneg <4 x half> [[B]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[FNEG]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[C]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x half>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[TMP5]], <8 x half> [[TMP5]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
-// CHECK-NEXT: [[TMP6:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[LANE]], <4 x half> [[TMP4]], <4 x half> [[TMP3]])
-// CHECK-NEXT: ret <4 x half> [[TMP6]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[C]], <8 x half> poison, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT: [[TMP0:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[LANE]], <4 x half> [[FNEG]], <4 x half> [[A]])
+// CHECK-NEXT: ret <4 x half> [[TMP0]]
//
float16x4_t test_vfms_laneq_f16(float16x4_t a, float16x4_t b, float16x8_t c) {
return vfms_laneq_f16(a, b, c, 7);
@@ -1584,15 +1371,9 @@ float16x4_t test_vfms_laneq_f16(float16x4_t a, float16x4_t b, float16x8_t c) {
// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[FNEG:%.*]] = fneg <8 x half> [[B]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[FNEG]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[C]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x half>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[TMP5]], <8 x half> [[TMP5]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-// CHECK-NEXT: [[TMP6:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[LANE]], <8 x half> [[TMP4]], <8 x half> [[TMP3]])
-// CHECK-NEXT: ret <8 x half> [[TMP6]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[C]], <8 x half> poison, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT: [[TMP0:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[LANE]], <8 x half> [[FNEG]], <8 x half> [[A]])
+// CHECK-NEXT: ret <8 x half> [[TMP0]]
//
float16x8_t test_vfmsq_laneq_f16(float16x8_t a, float16x8_t b, float16x8_t c) {
return vfmsq_laneq_f16(a, b, c, 7);
@@ -1602,15 +1383,10 @@ float16x8_t test_vfmsq_laneq_f16(float16x8_t a, float16x8_t b, float16x8_t c) {
// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], half noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[FNEG:%.*]] = fneg <4 x half> [[B]]
-// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[C]], i32 0
-// CHECK-NEXT: [[VECINIT1:%.*]] = insertelement <4 x half> [[VECINIT]], half [[C]], i32 1
-// CHECK-NEXT: [[VECINIT2:%.*]] = insertelement <4 x half> [[VECINIT1]], half [[C]], i32 2
-// CHECK-NEXT: [[VECINIT3:%.*]] = insertelement <4 x half> [[VECINIT2]], half [[C]], i32 3
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[FNEG]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[VECINIT3]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[FNEG]], <4 x half> [[VECINIT3]], <4 x half> [[A]])
-// CHECK-NEXT: ret <4 x half> [[TMP3]]
+// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[C]], i64 0
+// CHECK-NEXT: [[VECINIT3:%.*]] = shufflevector <4 x half> [[VECINIT]], <4 x half> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[FNEG]], <4 x half> [[VECINIT3]], <4 x half> [[A]])
+// CHECK-NEXT: ret <4 x half> [[TMP0]]
//
float16x4_t test_vfms_n_f16(float16x4_t a, float16x4_t b, float16_t c) {
return vfms_n_f16(a, b, c);
@@ -1620,19 +1396,10 @@ float16x4_t test_vfms_n_f16(float16x4_t a, float16x4_t b, float16_t c) {
// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], half noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[FNEG:%.*]] = fneg <8 x half> [[B]]
-// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[C]], i32 0
-// CHECK-NEXT: [[VECINIT1:%.*]] = insertelement <8 x half> [[VECINIT]], half [[C]], i32 1
-// CHECK-NEXT: [[VECINIT2:%.*]] = insertelement <8 x half> [[VECINIT1]], half [[C]], i32 2
-// CHECK-NEXT: [[VECINIT3:%.*]] = insertelement <8 x half> [[VECINIT2]], half [[C]], i32 3
-// CHECK-NEXT: [[VECINIT4:%.*]] = insertelement <8 x half> [[VECINIT3]], half [[C]], i32 4
-// CHECK-NEXT: [[VECINIT5:%.*]] = insertelement <8 x half> [[VECINIT4]], half [[C]], i32 5
-// CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <8 x half> [[VECINIT5]], half [[C]], i32 6
-// CHECK-NEXT: [[VECINIT7:%.*]] = insertelement <8 x half> [[VECINIT6]], half [[C]], i32 7
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[FNEG]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[VECINIT7]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[FNEG]], <8 x half> [[VECINIT7]], <8 x half> [[A]])
-// CHECK-NEXT: ret <8 x half> [[TMP3]]
+// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[C]], i64 0
+// CHECK-NEXT: [[VECINIT7:%.*]] = shufflevector <8 x half> [[VECINIT]], <8 x half> poison, <8 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[FNEG]], <8 x half> [[VECINIT7]], <8 x half> [[A]])
+// CHECK-NEXT: ret <8 x half> [[TMP0]]
//
float16x8_t test_vfmsq_n_f16(float16x8_t a, float16x8_t b, float16_t c) {
return vfmsq_n_f16(a, b, c);
@@ -1641,10 +1408,8 @@ float16x8_t test_vfmsq_n_f16(float16x8_t a, float16x8_t b, float16_t c) {
// CHECK-LABEL: define {{[^@]+}}@test_vfmsh_lane_f16
// CHECK-SAME: (half noundef [[A:%.*]], half noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[CONV:%.*]] = fpext half [[B]] to float
-// CHECK-NEXT: [[FNEG:%.*]] = fneg float [[CONV]]
-// CHECK-NEXT: [[TMP0:%.*]] = fptrunc float [[FNEG]] to half
-// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <4 x half> [[C]], i32 3
+// CHECK-NEXT: [[TMP0:%.*]] = fneg half [[B]]
+// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <4 x half> [[C]], i64 3
// CHECK-NEXT: [[TMP1:%.*]] = call half @llvm.fma.f16(half [[TMP0]], half [[EXTRACT]], half [[A]])
// CHECK-NEXT: ret half [[TMP1]]
//
@@ -1655,10 +1420,8 @@ float16_t test_vfmsh_lane_f16(float16_t a, float16_t b, float16x4_t c) {
// CHECK-LABEL: define {{[^@]+}}@test_vfmsh_laneq_f16
// CHECK-SAME: (half noundef [[A:%.*]], half noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[CONV:%.*]] = fpext half [[B]] to float
-// CHECK-NEXT: [[FNEG:%.*]] = fneg float [[CONV]]
-// CHECK-NEXT: [[TMP0:%.*]] = fptrunc float [[FNEG]] to half
-// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <8 x half> [[C]], i32 7
+// CHECK-NEXT: [[TMP0:%.*]] = fneg half [[B]]
+// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <8 x half> [[C]], i64 7
// CHECK-NEXT: [[TMP1:%.*]] = call half @llvm.fma.f16(half [[TMP0]], half [[EXTRACT]], half [[A]])
// CHECK-NEXT: ret half [[TMP1]]
//
@@ -1669,9 +1432,7 @@ float16_t test_vfmsh_laneq_f16(float16_t a, float16_t b, float16x8_t c) {
// CHECK-LABEL: define {{[^@]+}}@test_vmul_lane_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[B]], <4 x half> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[MUL:%.*]] = fmul <4 x half> [[A]], [[LANE]]
// CHECK-NEXT: ret <4 x half> [[MUL]]
//
@@ -1682,9 +1443,7 @@ float16x4_t test_vmul_lane_f16(float16x4_t a, float16x4_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vmulq_lane_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[B]], <4 x half> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[MUL:%.*]] = fmul <8 x half> [[A]], [[LANE]]
// CHECK-NEXT: ret <8 x half> [[MUL]]
//
@@ -1695,9 +1454,7 @@ float16x8_t test_vmulq_lane_f16(float16x8_t a, float16x4_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vmul_laneq_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[B]], <8 x half> poison, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[MUL:%.*]] = fmul <4 x half> [[A]], [[LANE]]
// CHECK-NEXT: ret <4 x half> [[MUL]]
//
@@ -1708,9 +1465,7 @@ float16x4_t test_vmul_laneq_f16(float16x4_t a, float16x8_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vmulq_laneq_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> [[TMP1]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[B]], <8 x half> poison, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[MUL:%.*]] = fmul <8 x half> [[A]], [[LANE]]
// CHECK-NEXT: ret <8 x half> [[MUL]]
//
@@ -1721,10 +1476,8 @@ float16x8_t test_vmulq_laneq_f16(float16x8_t a, float16x8_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vmul_n_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]], half noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[B]], i32 0
-// CHECK-NEXT: [[VECINIT1:%.*]] = insertelement <4 x half> [[VECINIT]], half [[B]], i32 1
-// CHECK-NEXT: [[VECINIT2:%.*]] = insertelement <4 x half> [[VECINIT1]], half [[B]], i32 2
-// CHECK-NEXT: [[VECINIT3:%.*]] = insertelement <4 x half> [[VECINIT2]], half [[B]], i32 3
+// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[B]], i64 0
+// CHECK-NEXT: [[VECINIT3:%.*]] = shufflevector <4 x half> [[VECINIT]], <4 x half> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = fmul <4 x half> [[A]], [[VECINIT3]]
// CHECK-NEXT: ret <4 x half> [[MUL]]
//
@@ -1735,14 +1488,8 @@ float16x4_t test_vmul_n_f16(float16x4_t a, float16_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vmulq_n_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]], half noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[B]], i32 0
-// CHECK-NEXT: [[VECINIT1:%.*]] = insertelement <8 x half> [[VECINIT]], half [[B]], i32 1
-// CHECK-NEXT: [[VECINIT2:%.*]] = insertelement <8 x half> [[VECINIT1]], half [[B]], i32 2
-// CHECK-NEXT: [[VECINIT3:%.*]] = insertelement <8 x half> [[VECINIT2]], half [[B]], i32 3
-// CHECK-NEXT: [[VECINIT4:%.*]] = insertelement <8 x half> [[VECINIT3]], half [[B]], i32 4
-// CHECK-NEXT: [[VECINIT5:%.*]] = insertelement <8 x half> [[VECINIT4]], half [[B]], i32 5
-// CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <8 x half> [[VECINIT5]], half [[B]], i32 6
-// CHECK-NEXT: [[VECINIT7:%.*]] = insertelement <8 x half> [[VECINIT6]], half [[B]], i32 7
+// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[B]], i64 0
+// CHECK-NEXT: [[VECINIT7:%.*]] = shufflevector <8 x half> [[VECINIT]], <8 x half> poison, <8 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = fmul <8 x half> [[A]], [[VECINIT7]]
// CHECK-NEXT: ret <8 x half> [[MUL]]
//
@@ -1754,18 +1501,9 @@ float16x8_t test_vmulq_n_f16(float16x8_t a, float16_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vmulh_lane_f16
// CHECK-SAME: (half noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[__REINT_847:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT: [[__REINT1_847:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[CONV:%.*]] = fpext half [[A]] to float
-// CHECK-NEXT: store <4 x half> [[B]], ptr [[__REINT_847]], align 8
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__REINT_847]], align 8
-// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP0]], i32 3
-// CHECK-NEXT: store i16 [[VGET_LANE]], ptr [[__REINT1_847]], align 2
-// CHECK-NEXT: [[TMP1:%.*]] = load half, ptr [[__REINT1_847]], align 2
-// CHECK-NEXT: [[CONV2:%.*]] = fpext half [[TMP1]] to float
-// CHECK-NEXT: [[MUL:%.*]] = fmul float [[CONV]], [[CONV2]]
-// CHECK-NEXT: [[TMP2:%.*]] = fptrunc float [[MUL]] to half
-// CHECK-NEXT: ret half [[TMP2]]
+// CHECK-NEXT: [[DOTCAST1:%.*]] = extractelement <4 x half> [[B]], i64 3
+// CHECK-NEXT: [[TMP0:%.*]] = fmul half [[A]], [[DOTCAST1]]
+// CHECK-NEXT: ret half [[TMP0]]
//
float16_t test_vmulh_lane_f16(float16_t a, float16x4_t b) {
return vmulh_lane_f16(a, b, 3);
@@ -1774,18 +1512,9 @@ float16_t test_vmulh_lane_f16(float16_t a, float16x4_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vmulh_laneq_f16
// CHECK-SAME: (half noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[__REINT_850:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT: [[__REINT1_850:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[CONV:%.*]] = fpext half [[A]] to float
-// CHECK-NEXT: store <8 x half> [[B]], ptr [[__REINT_850]], align 16
-// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[__REINT_850]], align 16
-// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP0]], i32 7
-// CHECK-NEXT: store i16 [[VGETQ_LANE]], ptr [[__REINT1_850]], align 2
-// CHECK-NEXT: [[TMP1:%.*]] = load half, ptr [[__REINT1_850]], align 2
-// CHECK-NEXT: [[CONV2:%.*]] = fpext half [[TMP1]] to float
-// CHECK-NEXT: [[MUL:%.*]] = fmul float [[CONV]], [[CONV2]]
-// CHECK-NEXT: [[TMP2:%.*]] = fptrunc float [[MUL]] to half
-// CHECK-NEXT: ret half [[TMP2]]
+// CHECK-NEXT: [[DOTCAST1:%.*]] = extractelement <8 x half> [[B]], i64 7
+// CHECK-NEXT: [[TMP0:%.*]] = fmul half [[A]], [[DOTCAST1]]
+// CHECK-NEXT: ret half [[TMP0]]
//
float16_t test_vmulh_laneq_f16(float16_t a, float16x8_t b) {
return vmulh_laneq_f16(a, b, 7);
@@ -1794,11 +1523,7 @@ float16_t test_vmulh_laneq_f16(float16_t a, float16x8_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vmulx_lane_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x half> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[B]], <4 x half> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmulx.v4f16(<4 x half> [[A]], <4 x half> [[LANE]])
// CHECK-NEXT: ret <4 x half> [[VMULX2_I]]
//
@@ -1809,11 +1534,7 @@ float16x4_t test_vmulx_lane_f16(float16x4_t a, float16x4_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vmulxq_lane_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x half> [[LANE]] to <16 x i8>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[B]], <4 x half> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[VMULX2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmulx.v8f16(<8 x half> [[A]], <8 x half> [[LANE]])
// CHECK-NEXT: ret <8 x half> [[VMULX2_I]]
//
@@ -1824,11 +1545,7 @@ float16x8_t test_vmulxq_lane_f16(float16x8_t a, float16x4_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vmulx_laneq_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x half> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[B]], <8 x half> poison, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmulx.v4f16(<4 x half> [[A]], <4 x half> [[LANE]])
// CHECK-NEXT: ret <4 x half> [[VMULX2_I]]
//
@@ -1839,11 +1556,7 @@ float16x4_t test_vmulx_laneq_f16(float16x4_t a, float16x8_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vmulxq_laneq_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> [[TMP1]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x half> [[LANE]] to <16 x i8>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[B]], <8 x half> poison, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[VMULX2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmulx.v8f16(<8 x half> [[A]], <8 x half> [[LANE]])
// CHECK-NEXT: ret <8 x half> [[VMULX2_I]]
//
@@ -1854,12 +1567,8 @@ float16x8_t test_vmulxq_laneq_f16(float16x8_t a, float16x8_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vmulx_n_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]], half noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[B]], i32 0
-// CHECK-NEXT: [[VECINIT1:%.*]] = insertelement <4 x half> [[VECINIT]], half [[B]], i32 1
-// CHECK-NEXT: [[VECINIT2:%.*]] = insertelement <4 x half> [[VECINIT1]], half [[B]], i32 2
-// CHECK-NEXT: [[VECINIT3:%.*]] = insertelement <4 x half> [[VECINIT2]], half [[B]], i32 3
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[VECINIT3]] to <8 x i8>
+// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[B]], i64 0
+// CHECK-NEXT: [[VECINIT3:%.*]] = shufflevector <4 x half> [[VECINIT]], <4 x half> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmulx.v4f16(<4 x half> [[A]], <4 x half> [[VECINIT3]])
// CHECK-NEXT: ret <4 x half> [[VMULX2_I]]
//
@@ -1870,16 +1579,8 @@ float16x4_t test_vmulx_n_f16(float16x4_t a, float16_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vmulxq_n_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]], half noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[B]], i32 0
-// CHECK-NEXT: [[VECINIT1:%.*]] = insertelement <8 x half> [[VECINIT]], half [[B]], i32 1
-// CHECK-NEXT: [[VECINIT2:%.*]] = insertelement <8 x half> [[VECINIT1]], half [[B]], i32 2
-// CHECK-NEXT: [[VECINIT3:%.*]] = insertelement <8 x half> [[VECINIT2]], half [[B]], i32 3
-// CHECK-NEXT: [[VECINIT4:%.*]] = insertelement <8 x half> [[VECINIT3]], half [[B]], i32 4
-// CHECK-NEXT: [[VECINIT5:%.*]] = insertelement <8 x half> [[VECINIT4]], half [[B]], i32 5
-// CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <8 x half> [[VECINIT5]], half [[B]], i32 6
-// CHECK-NEXT: [[VECINIT7:%.*]] = insertelement <8 x half> [[VECINIT6]], half [[B]], i32 7
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[VECINIT7]] to <16 x i8>
+// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[B]], i64 0
+// CHECK-NEXT: [[VECINIT7:%.*]] = shufflevector <8 x half> [[VECINIT]], <8 x half> poison, <8 x i32> zeroinitializer
// CHECK-NEXT: [[VMULX2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmulx.v8f16(<8 x half> [[A]], <8 x half> [[VECINIT7]])
// CHECK-NEXT: ret <8 x half> [[VMULX2_I]]
//
@@ -1890,7 +1591,7 @@ float16x8_t test_vmulxq_n_f16(float16x8_t a, float16_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vmulxh_lane_f16
// CHECK-SAME: (half noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <4 x half> [[B]], i32 3
+// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <4 x half> [[B]], i64 3
// CHECK-NEXT: [[VMULX:%.*]] = call half @llvm.aarch64.neon.fmulx.f16(half [[A]], half [[EXTRACT]])
// CHECK-NEXT: ret half [[VMULX]]
//
@@ -1901,7 +1602,7 @@ float16_t test_vmulxh_lane_f16(float16_t a, float16x4_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vmulxh_laneq_f16
// CHECK-SAME: (half noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <8 x half> [[B]], i32 7
+// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <8 x half> [[B]], i64 7
// CHECK-NEXT: [[VMULX:%.*]] = call half @llvm.aarch64.neon.fmulx.f16(half [[A]], half [[EXTRACT]])
// CHECK-NEXT: ret half [[VMULX]]
//
@@ -1912,9 +1613,7 @@ float16_t test_vmulxh_laneq_f16(float16_t a, float16x8_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vmaxv_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[VMAXV:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
-// CHECK-NEXT: [[VMAXV1:%.*]] = call half @llvm.aarch64.neon.fmaxv.f16.v4f16(<4 x half> [[VMAXV]])
+// CHECK-NEXT: [[VMAXV1:%.*]] = call half @llvm.aarch64.neon.fmaxv.f16.v4f16(<4 x half> [[A]])
// CHECK-NEXT: ret half [[VMAXV1]]
//
float16_t test_vmaxv_f16(float16x4_t a) {
@@ -1924,9 +1623,7 @@ float16_t test_vmaxv_f16(float16x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vmaxvq_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VMAXV:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
-// CHECK-NEXT: [[VMAXV1:%.*]] = call half @llvm.aarch64.neon.fmaxv.f16.v8f16(<8 x half> [[VMAXV]])
+// CHECK-NEXT: [[VMAXV1:%.*]] = call half @llvm.aarch64.neon.fmaxv.f16.v8f16(<8 x half> [[A]])
// CHECK-NEXT: ret half [[VMAXV1]]
//
float16_t test_vmaxvq_f16(float16x8_t a) {
@@ -1936,9 +1633,7 @@ float16_t test_vmaxvq_f16(float16x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vminv_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[VMINV:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
-// CHECK-NEXT: [[VMINV1:%.*]] = call half @llvm.aarch64.neon.fminv.f16.v4f16(<4 x half> [[VMINV]])
+// CHECK-NEXT: [[VMINV1:%.*]] = call half @llvm.aarch64.neon.fminv.f16.v4f16(<4 x half> [[A]])
// CHECK-NEXT: ret half [[VMINV1]]
//
float16_t test_vminv_f16(float16x4_t a) {
@@ -1948,9 +1643,7 @@ float16_t test_vminv_f16(float16x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vminvq_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VMINV:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
-// CHECK-NEXT: [[VMINV1:%.*]] = call half @llvm.aarch64.neon.fminv.f16.v8f16(<8 x half> [[VMINV]])
+// CHECK-NEXT: [[VMINV1:%.*]] = call half @llvm.aarch64.neon.fminv.f16.v8f16(<8 x half> [[A]])
// CHECK-NEXT: ret half [[VMINV1]]
//
float16_t test_vminvq_f16(float16x8_t a) {
@@ -1960,9 +1653,7 @@ float16_t test_vminvq_f16(float16x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vmaxnmv_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[VMAXNMV:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
-// CHECK-NEXT: [[VMAXNMV1:%.*]] = call half @llvm.aarch64.neon.fmaxnmv.f16.v4f16(<4 x half> [[VMAXNMV]])
+// CHECK-NEXT: [[VMAXNMV1:%.*]] = call half @llvm.aarch64.neon.fmaxnmv.f16.v4f16(<4 x half> [[A]])
// CHECK-NEXT: ret half [[VMAXNMV1]]
//
float16_t test_vmaxnmv_f16(float16x4_t a) {
@@ -1972,9 +1663,7 @@ float16_t test_vmaxnmv_f16(float16x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vmaxnmvq_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VMAXNMV:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
-// CHECK-NEXT: [[VMAXNMV1:%.*]] = call half @llvm.aarch64.neon.fmaxnmv.f16.v8f16(<8 x half> [[VMAXNMV]])
+// CHECK-NEXT: [[VMAXNMV1:%.*]] = call half @llvm.aarch64.neon.fmaxnmv.f16.v8f16(<8 x half> [[A]])
// CHECK-NEXT: ret half [[VMAXNMV1]]
//
float16_t test_vmaxnmvq_f16(float16x8_t a) {
@@ -1984,9 +1673,7 @@ float16_t test_vmaxnmvq_f16(float16x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vminnmv_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[VMINNMV:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
-// CHECK-NEXT: [[VMINNMV1:%.*]] = call half @llvm.aarch64.neon.fminnmv.f16.v4f16(<4 x half> [[VMINNMV]])
+// CHECK-NEXT: [[VMINNMV1:%.*]] = call half @llvm.aarch64.neon.fminnmv.f16.v4f16(<4 x half> [[A]])
// CHECK-NEXT: ret half [[VMINNMV1]]
//
float16_t test_vminnmv_f16(float16x4_t a) {
@@ -1996,9 +1683,7 @@ float16_t test_vminnmv_f16(float16x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vminnmvq_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VMINNMV:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
-// CHECK-NEXT: [[VMINNMV1:%.*]] = call half @llvm.aarch64.neon.fminnmv.f16.v8f16(<8 x half> [[VMINNMV]])
+// CHECK-NEXT: [[VMINNMV1:%.*]] = call half @llvm.aarch64.neon.fminnmv.f16.v8f16(<8 x half> [[A]])
// CHECK-NEXT: ret half [[VMINNMV1]]
//
float16_t test_vminnmvq_f16(float16x8_t a) {
diff --git a/clang/test/CodeGen/AArch64/v8.5a-neon-frint3264-intrinsic.c b/clang/test/CodeGen/AArch64/v8.5a-neon-frint3264-intrinsic.c
index c44dd333c9754e..6ddf298d0dc6b8 100644
--- a/clang/test/CodeGen/AArch64/v8.5a-neon-frint3264-intrinsic.c
+++ b/clang/test/CodeGen/AArch64/v8.5a-neon-frint3264-intrinsic.c
@@ -1,120 +1,169 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +v8.5a\
// RUN: -flax-vector-conversions=none -disable-O0-optnone -emit-llvm -o - %s \
-// RUN: | opt -S -passes=mem2reg \
+// RUN: | opt -S -passes=mem2reg,instcombine \
// RUN: | FileCheck %s
// REQUIRES: aarch64-registered-target
#include <arm_neon.h>
-// CHECK-LABEL: test_vrnd32x_f32
-// CHECK: [[RND:%.*]] = call <2 x float> @llvm.aarch64.neon.frint32x.v2f32(<2 x float> %a)
-// CHECK: ret <2 x float> [[RND]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vrnd32x_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRND32X_F321_I:%.*]] = call <2 x float> @llvm.aarch64.neon.frint32x.v2f32(<2 x float> [[A]])
+// CHECK-NEXT: ret <2 x float> [[VRND32X_F321_I]]
+//
float32x2_t test_vrnd32x_f32(float32x2_t a) {
return vrnd32x_f32(a);
}
-// CHECK-LABEL: test_vrnd32xq_f32
-// CHECK: [[RND:%.*]] = call <4 x float> @llvm.aarch64.neon.frint32x.v4f32(<4 x float> %a)
-// CHECK: ret <4 x float> [[RND]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vrnd32xq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRND32XQ_F321_I:%.*]] = call <4 x float> @llvm.aarch64.neon.frint32x.v4f32(<4 x float> [[A]])
+// CHECK-NEXT: ret <4 x float> [[VRND32XQ_F321_I]]
+//
float32x4_t test_vrnd32xq_f32(float32x4_t a) {
return vrnd32xq_f32(a);
}
-// CHECK-LABEL: test_vrnd32z_f32
-// CHECK: [[RND:%.*]] = call <2 x float> @llvm.aarch64.neon.frint32z.v2f32(<2 x float> %a)
-// CHECK: ret <2 x float> [[RND]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vrnd32z_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRND32Z_F321_I:%.*]] = call <2 x float> @llvm.aarch64.neon.frint32z.v2f32(<2 x float> [[A]])
+// CHECK-NEXT: ret <2 x float> [[VRND32Z_F321_I]]
+//
float32x2_t test_vrnd32z_f32(float32x2_t a) {
return vrnd32z_f32(a);
}
-// CHECK-LABEL: test_vrnd32zq_f32
-// CHECK: [[RND:%.*]] = call <4 x float> @llvm.aarch64.neon.frint32z.v4f32(<4 x float> %a)
-// CHECK: ret <4 x float> [[RND]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vrnd32zq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRND32ZQ_F321_I:%.*]] = call <4 x float> @llvm.aarch64.neon.frint32z.v4f32(<4 x float> [[A]])
+// CHECK-NEXT: ret <4 x float> [[VRND32ZQ_F321_I]]
+//
float32x4_t test_vrnd32zq_f32(float32x4_t a) {
return vrnd32zq_f32(a);
}
-// CHECK-LABEL: test_vrnd64x_f32
-// CHECK: [[RND:%.*]] = call <2 x float> @llvm.aarch64.neon.frint64x.v2f32(<2 x float> %a)
-// CHECK: ret <2 x float> [[RND]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vrnd64x_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRND64X_F321_I:%.*]] = call <2 x float> @llvm.aarch64.neon.frint64x.v2f32(<2 x float> [[A]])
+// CHECK-NEXT: ret <2 x float> [[VRND64X_F321_I]]
+//
float32x2_t test_vrnd64x_f32(float32x2_t a) {
return vrnd64x_f32(a);
}
-// CHECK-LABEL: test_vrnd64xq_f32
-// CHECK: [[RND:%.*]] = call <4 x float> @llvm.aarch64.neon.frint64x.v4f32(<4 x float> %a)
-// CHECK: ret <4 x float> [[RND]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vrnd64xq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRND64XQ_F321_I:%.*]] = call <4 x float> @llvm.aarch64.neon.frint64x.v4f32(<4 x float> [[A]])
+// CHECK-NEXT: ret <4 x float> [[VRND64XQ_F321_I]]
+//
float32x4_t test_vrnd64xq_f32(float32x4_t a) {
return vrnd64xq_f32(a);
}
-// CHECK-LABEL: test_vrnd64z_f32
-// CHECK: [[RND:%.*]] = call <2 x float> @llvm.aarch64.neon.frint64z.v2f32(<2 x float> %a)
-// CHECK: ret <2 x float> [[RND]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vrnd64z_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRND64Z_F321_I:%.*]] = call <2 x float> @llvm.aarch64.neon.frint64z.v2f32(<2 x float> [[A]])
+// CHECK-NEXT: ret <2 x float> [[VRND64Z_F321_I]]
+//
float32x2_t test_vrnd64z_f32(float32x2_t a) {
return vrnd64z_f32(a);
}
-// CHECK-LABEL: test_vrnd64zq_f32
-// CHECK: [[RND:%.*]] = call <4 x float> @llvm.aarch64.neon.frint64z.v4f32(<4 x float> %a)
-// CHECK: ret <4 x float> [[RND]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vrnd64zq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRND64ZQ_F321_I:%.*]] = call <4 x float> @llvm.aarch64.neon.frint64z.v4f32(<4 x float> [[A]])
+// CHECK-NEXT: ret <4 x float> [[VRND64ZQ_F321_I]]
+//
float32x4_t test_vrnd64zq_f32(float32x4_t a) {
return vrnd64zq_f32(a);
}
-// CHECK-LABEL: test_vrnd32x_f64
-// CHECK: [[RND:%.*]] = call <1 x double> @llvm.aarch64.neon.frint32x.v1f64(<1 x double> %a)
-// CHECK: ret <1 x double> [[RND]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vrnd32x_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRND32X_F641_I:%.*]] = call <1 x double> @llvm.aarch64.neon.frint32x.v1f64(<1 x double> [[A]])
+// CHECK-NEXT: ret <1 x double> [[VRND32X_F641_I]]
+//
float64x1_t test_vrnd32x_f64(float64x1_t a) {
return vrnd32x_f64(a);
}
-// CHECK-LABEL: test_vrnd32xq_f64
-// CHECK: [[RND:%.*]] = call <2 x double> @llvm.aarch64.neon.frint32x.v2f64(<2 x double> %a)
-// CHECK: ret <2 x double> [[RND]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vrnd32xq_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRND32XQ_F641_I:%.*]] = call <2 x double> @llvm.aarch64.neon.frint32x.v2f64(<2 x double> [[A]])
+// CHECK-NEXT: ret <2 x double> [[VRND32XQ_F641_I]]
+//
float64x2_t test_vrnd32xq_f64(float64x2_t a) {
return vrnd32xq_f64(a);
}
-// CHECK-LABEL: test_vrnd32z_f64
-// CHECK: [[RND:%.*]] = call <1 x double> @llvm.aarch64.neon.frint32z.v1f64(<1 x double> %a)
-// CHECK: ret <1 x double> [[RND]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vrnd32z_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRND32Z_F641_I:%.*]] = call <1 x double> @llvm.aarch64.neon.frint32z.v1f64(<1 x double> [[A]])
+// CHECK-NEXT: ret <1 x double> [[VRND32Z_F641_I]]
+//
float64x1_t test_vrnd32z_f64(float64x1_t a) {
return vrnd32z_f64(a);
}
-// CHECK-LABEL: test_vrnd32zq_f64
-// CHECK: [[RND:%.*]] = call <2 x double> @llvm.aarch64.neon.frint32z.v2f64(<2 x double> %a)
-// CHECK: ret <2 x double> [[RND]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vrnd32zq_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRND32ZQ_F641_I:%.*]] = call <2 x double> @llvm.aarch64.neon.frint32z.v2f64(<2 x double> [[A]])
+// CHECK-NEXT: ret <2 x double> [[VRND32ZQ_F641_I]]
+//
float64x2_t test_vrnd32zq_f64(float64x2_t a) {
return vrnd32zq_f64(a);
}
-// CHECK-LABEL: test_vrnd64x_f64
-// CHECK: [[RND:%.*]] = call <1 x double> @llvm.aarch64.neon.frint64x.v1f64(<1 x double> %a)
-// CHECK: ret <1 x double> [[RND]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vrnd64x_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRND64X_F641_I:%.*]] = call <1 x double> @llvm.aarch64.neon.frint64x.v1f64(<1 x double> [[A]])
+// CHECK-NEXT: ret <1 x double> [[VRND64X_F641_I]]
+//
float64x1_t test_vrnd64x_f64(float64x1_t a) {
return vrnd64x_f64(a);
}
-// CHECK-LABEL: test_vrnd64xq_f64
-// CHECK: [[RND:%.*]] = call <2 x double> @llvm.aarch64.neon.frint64x.v2f64(<2 x double> %a)
-// CHECK: ret <2 x double> [[RND]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vrnd64xq_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRND64XQ_F641_I:%.*]] = call <2 x double> @llvm.aarch64.neon.frint64x.v2f64(<2 x double> [[A]])
+// CHECK-NEXT: ret <2 x double> [[VRND64XQ_F641_I]]
+//
float64x2_t test_vrnd64xq_f64(float64x2_t a) {
return vrnd64xq_f64(a);
}
-// CHECK-LABEL: test_vrnd64z_f64
-// CHECK: [[RND:%.*]] = call <1 x double> @llvm.aarch64.neon.frint64z.v1f64(<1 x double> %a)
-// CHECK: ret <1 x double> [[RND]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vrnd64z_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRND64Z_F641_I:%.*]] = call <1 x double> @llvm.aarch64.neon.frint64z.v1f64(<1 x double> [[A]])
+// CHECK-NEXT: ret <1 x double> [[VRND64Z_F641_I]]
+//
float64x1_t test_vrnd64z_f64(float64x1_t a) {
return vrnd64z_f64(a);
}
-// CHECK-LABEL: test_vrnd64zq_f64
-// CHECK: [[RND:%.*]] = call <2 x double> @llvm.aarch64.neon.frint64z.v2f64(<2 x double> %a)
-// CHECK: ret <2 x double> [[RND]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vrnd64zq_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRND64ZQ_F641_I:%.*]] = call <2 x double> @llvm.aarch64.neon.frint64z.v2f64(<2 x double> [[A]])
+// CHECK-NEXT: ret <2 x double> [[VRND64ZQ_F641_I]]
+//
float64x2_t test_vrnd64zq_f64(float64x2_t a) {
return vrnd64zq_f64(a);
}
diff --git a/clang/test/CodeGen/AArch64/v8.6a-neon-intrinsics.c b/clang/test/CodeGen/AArch64/v8.6a-neon-intrinsics.c
index 7bfeb7939edb27..8bfe2cffff58dd 100644
--- a/clang/test/CodeGen/AArch64/v8.6a-neon-intrinsics.c
+++ b/clang/test/CodeGen/AArch64/v8.6a-neon-intrinsics.c
@@ -1,147 +1,163 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +fullfp16 -target-feature +v8.6a -target-feature +i8mm \
// RUN: -disable-O0-optnone -emit-llvm -o - %s \
-// RUN: | opt -S -passes=mem2reg,sroa \
+// RUN: | opt -S -passes=mem2reg,instcombine,sroa \
// RUN: | FileCheck %s
// REQUIRES: aarch64-registered-target
#include <arm_neon.h>
-// CHECK-LABEL: test_vmmlaq_s32
-// CHECK: [[VAL:%.*]] = call <4 x i32> @llvm.aarch64.neon.smmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <4 x i32> [[VAL]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vmmlaq_s32(
+// CHECK-SAME: <4 x i32> noundef [[R:%.*]], <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMMLA1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smmla.v4i32.v16i8(<4 x i32> [[R]], <16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <4 x i32> [[VMMLA1_I]]
+//
int32x4_t test_vmmlaq_s32(int32x4_t r, int8x16_t a, int8x16_t b) {
return vmmlaq_s32(r, a, b);
}
-// CHECK-LABEL: test_vmmlaq_u32
-// CHECK: [[VAL:%.*]] = call <4 x i32> @llvm.aarch64.neon.ummla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <4 x i32> [[VAL]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vmmlaq_u32(
+// CHECK-SAME: <4 x i32> noundef [[R:%.*]], <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMMLA1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.ummla.v4i32.v16i8(<4 x i32> [[R]], <16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <4 x i32> [[VMMLA1_I]]
+//
uint32x4_t test_vmmlaq_u32(uint32x4_t r, uint8x16_t a, uint8x16_t b) {
return vmmlaq_u32(r, a, b);
}
-// CHECK-LABEL: test_vusmmlaq_s32
-// CHECK: [[VAL:%.*]] = call <4 x i32> @llvm.aarch64.neon.usmmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <4 x i32> [[VAL]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vusmmlaq_s32(
+// CHECK-SAME: <4 x i32> noundef [[R:%.*]], <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VUSMMLA1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.usmmla.v4i32.v16i8(<4 x i32> [[R]], <16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <4 x i32> [[VUSMMLA1_I]]
+//
int32x4_t test_vusmmlaq_s32(int32x4_t r, uint8x16_t a, int8x16_t b) {
return vusmmlaq_s32(r, a, b);
}
-// CHECK-LABEL: test_vusdot_s32
-// CHECK: [[VAL:%.*]] = call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <2 x i32> [[VAL]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vusdot_s32(
+// CHECK-SAME: <2 x i32> noundef [[R:%.*]], <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VUSDOT1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> [[R]], <8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <2 x i32> [[VUSDOT1_I]]
+//
int32x2_t test_vusdot_s32(int32x2_t r, uint8x8_t a, int8x8_t b) {
return vusdot_s32(r, a, b);
}
-// CHECK-LABEL: test_vusdot_lane_s32
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %b to <2 x i32>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <2 x i32> zeroinitializer
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> %r to <8 x i8>
-// CHECK: [[OP:%.*]] = call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> [[TMP4]])
-// CHECK: ret <2 x i32> [[OP]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vusdot_lane_s32(
+// CHECK-SAME: <2 x i32> noundef [[R:%.*]], <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x i8> [[B]] to <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[DOTCAST]], <2 x i32> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VUSDOT1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> [[R]], <8 x i8> [[A]], <8 x i8> [[TMP0]])
+// CHECK-NEXT: ret <2 x i32> [[VUSDOT1_I]]
+//
int32x2_t test_vusdot_lane_s32(int32x2_t r, uint8x8_t a, int8x8_t b) {
return vusdot_lane_s32(r, a, b, 0);
}
-// CHECK-LABEL: test_vsudot_lane_s32
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %b to <2 x i32>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %0 to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> %1 to <2 x i32>
-// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <2 x i32> zeroinitializer
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> %r to <8 x i8>
-// CHECK: [[OP:%.*]] = call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> [[TMP4]], <8 x i8> %a)
-// CHECK: ret <2 x i32> [[OP]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vsudot_lane_s32(
+// CHECK-SAME: <2 x i32> noundef [[R:%.*]], <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x i8> [[B]] to <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[DOTCAST]], <2 x i32> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VUSDOT1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> [[R]], <8 x i8> [[TMP0]], <8 x i8> [[A]])
+// CHECK-NEXT: ret <2 x i32> [[VUSDOT1_I]]
+//
int32x2_t test_vsudot_lane_s32(int32x2_t r, int8x8_t a, uint8x8_t b) {
return vsudot_lane_s32(r, a, b, 0);
}
-// CHECK-LABEL: test_vusdot_laneq_s32
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %b to <4 x i32>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP2]], <2 x i32> zeroinitializer
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> %r to <8 x i8>
-// CHECK: [[OP:%.*]] = call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> [[TMP4]])
-// CHECK: ret <2 x i32> [[OP]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vusdot_laneq_s32(
+// CHECK-SAME: <2 x i32> noundef [[R:%.*]], <8 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <16 x i8> [[B]] to <4 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[DOTCAST]], <4 x i32> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VUSDOT1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> [[R]], <8 x i8> [[A]], <8 x i8> [[TMP0]])
+// CHECK-NEXT: ret <2 x i32> [[VUSDOT1_I]]
+//
int32x2_t test_vusdot_laneq_s32(int32x2_t r, uint8x8_t a, int8x16_t b) {
return vusdot_laneq_s32(r, a, b, 0);
}
-// CHECK-LABEL: test_vsudot_laneq_s32
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %b to <4 x i32>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP2]], <2 x i32> zeroinitializer
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> %r to <8 x i8>
-// CHECK: [[OP:%.*]] = call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> [[TMP4]], <8 x i8> %a)
-// CHECK: ret <2 x i32> [[OP]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vsudot_laneq_s32(
+// CHECK-SAME: <2 x i32> noundef [[R:%.*]], <8 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <16 x i8> [[B]] to <4 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[DOTCAST]], <4 x i32> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VUSDOT1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> [[R]], <8 x i8> [[TMP0]], <8 x i8> [[A]])
+// CHECK-NEXT: ret <2 x i32> [[VUSDOT1_I]]
+//
int32x2_t test_vsudot_laneq_s32(int32x2_t r, int8x8_t a, uint8x16_t b) {
return vsudot_laneq_s32(r, a, b, 0);
}
-// CHECK-LABEL: test_vusdotq_s32
-// CHECK: [[VAL:%.*]] = call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <4 x i32> [[VAL]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vusdotq_s32(
+// CHECK-SAME: <4 x i32> noundef [[R:%.*]], <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VUSDOT1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> [[R]], <16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <4 x i32> [[VUSDOT1_I]]
+//
int32x4_t test_vusdotq_s32(int32x4_t r, uint8x16_t a, int8x16_t b) {
return vusdotq_s32(r, a, b);
}
-// CHECK-LABEL: test_vusdotq_lane_s32
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %b to <2 x i32>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <4 x i32> zeroinitializer
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8>
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> %r to <16 x i8>
-// CHECK: [[OP:%.*]] = call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> [[TMP4]])
-// CHECK: ret <4 x i32> [[OP]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vusdotq_lane_s32(
+// CHECK-SAME: <4 x i32> noundef [[R:%.*]], <16 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x i8> [[B]] to <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[DOTCAST]], <2 x i32> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8>
+// CHECK-NEXT: [[VUSDOT1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> [[R]], <16 x i8> [[A]], <16 x i8> [[TMP0]])
+// CHECK-NEXT: ret <4 x i32> [[VUSDOT1_I]]
+//
int32x4_t test_vusdotq_lane_s32(int32x4_t r, uint8x16_t a, int8x8_t b) {
return vusdotq_lane_s32(r, a, b, 0);
}
-// CHECK-LABEL: test_vsudotq_lane_s32
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %b to <2 x i32>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <4 x i32> zeroinitializer
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8>
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> %r to <16 x i8>
-// CHECK: [[OP:%.*]] = call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> [[TMP4]], <16 x i8> %a)
-// CHECK: ret <4 x i32> [[OP]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vsudotq_lane_s32(
+// CHECK-SAME: <4 x i32> noundef [[R:%.*]], <16 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x i8> [[B]] to <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[DOTCAST]], <2 x i32> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8>
+// CHECK-NEXT: [[VUSDOT1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> [[R]], <16 x i8> [[TMP0]], <16 x i8> [[A]])
+// CHECK-NEXT: ret <4 x i32> [[VUSDOT1_I]]
+//
int32x4_t test_vsudotq_lane_s32(int32x4_t r, int8x16_t a, uint8x8_t b) {
return vsudotq_lane_s32(r, a, b, 0);
}
-// CHECK-LABEL: test_vusdotq_laneq_s32
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %b to <4 x i32>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP2]], <4 x i32> zeroinitializer
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8>
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> %r to <16 x i8>
-// CHECK: [[OP:%.*]] = call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> [[TMP4]])
-// CHECK: ret <4 x i32> [[OP]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vusdotq_laneq_s32(
+// CHECK-SAME: <4 x i32> noundef [[R:%.*]], <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <16 x i8> [[B]] to <4 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[DOTCAST]], <4 x i32> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8>
+// CHECK-NEXT: [[VUSDOT1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> [[R]], <16 x i8> [[A]], <16 x i8> [[TMP0]])
+// CHECK-NEXT: ret <4 x i32> [[VUSDOT1_I]]
+//
int32x4_t test_vusdotq_laneq_s32(int32x4_t r, uint8x16_t a, int8x16_t b) {
return vusdotq_laneq_s32(r, a, b, 0);
}
-// CHECK-LABEL: test_vsudotq_laneq_s32
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %b to <4 x i32>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP2]], <4 x i32> zeroinitializer
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8>
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> %r to <16 x i8>
-// CHECK: [[OP:%.*]] = call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> [[TMP4]], <16 x i8> %a)
-// CHECK: ret <4 x i32> [[OP]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vsudotq_laneq_s32(
+// CHECK-SAME: <4 x i32> noundef [[R:%.*]], <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <16 x i8> [[B]] to <4 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[DOTCAST]], <4 x i32> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8>
+// CHECK-NEXT: [[VUSDOT1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> [[R]], <16 x i8> [[TMP0]], <16 x i8> [[A]])
+// CHECK-NEXT: ret <4 x i32> [[VUSDOT1_I]]
+//
int32x4_t test_vsudotq_laneq_s32(int32x4_t r, int8x16_t a, uint8x16_t b) {
return vsudotq_laneq_s32(r, a, b, 0);
}
diff --git a/clang/test/CodeGen/arm-bf16-convert-intrinsics.c b/clang/test/CodeGen/arm-bf16-convert-intrinsics.c
index 51aa5aa758f0c3..e6e486efbbb693 100644
--- a/clang/test/CodeGen/arm-bf16-convert-intrinsics.c
+++ b/clang/test/CodeGen/arm-bf16-convert-intrinsics.c
@@ -2,19 +2,19 @@
// RUN: %clang_cc1 \
// RUN: -triple aarch64 -target-feature +neon -target-feature +bf16 \
// RUN: -disable-O0-optnone -emit-llvm -o - %s \
-// RUN: | opt -S -passes=mem2reg \
+// RUN: | opt -S -passes=mem2reg,instcombine \
// RUN: | FileCheck --check-prefixes=CHECK,CHECK-A64 %s
// RUN: %clang_cc1 \
// RUN: -triple armv8.6a-arm-none-eabi -target-feature +neon \
// RUN: -target-feature +bf16 -mfloat-abi hard \
// RUN: -disable-O0-optnone -emit-llvm -o - %s \
-// RUN: | opt -S -passes=mem2reg \
+// RUN: | opt -S -passes=mem2reg,instcombine \
// RUN: | FileCheck --check-prefixes=CHECK,CHECK-A32-HARDFP %s
// RUN: %clang_cc1 \
// RUN: -triple armv8.6a-arm-none-eabi -target-feature +neon \
// RUN: -target-feature +bf16 -mfloat-abi softfp \
// RUN: -disable-O0-optnone -emit-llvm -o - %s \
-// RUN: | opt -S -passes=mem2reg \
+// RUN: | opt -S -passes=mem2reg,instcombine \
// RUN: | FileCheck --check-prefixes=CHECK,CHECK-A32-SOFTFP %s
// REQUIRES: arm-registered-target
@@ -24,51 +24,27 @@
// CHECK-A64-LABEL: @test_vcvt_f32_bf16(
// CHECK-A64-NEXT: entry:
-// CHECK-A64-NEXT: [[__REINT_808_I:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A64-NEXT: [[__REINT1_808_I:%.*]] = alloca <4 x i32>, align 16
-// CHECK-A64-NEXT: store <4 x bfloat> [[A:%.*]], ptr [[__REINT_808_I]], align 8
-// CHECK-A64-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__REINT_808_I]], align 8
-// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK-A64-NEXT: [[TMP2:%.*]] = zext <4 x i16> [[TMP0]] to <4 x i32>
-// CHECK-A64-NEXT: [[VSHLL_N_I:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 16)
-// CHECK-A64-NEXT: store <4 x i32> [[VSHLL_N_I]], ptr [[__REINT1_808_I]], align 16
-// CHECK-A64-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[__REINT1_808_I]], align 16
-// CHECK-A64-NEXT: ret <4 x float> [[TMP3]]
+// CHECK-A64-NEXT: [[DOTCAST:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <4 x i16>
+// CHECK-A64-NEXT: [[TMP0:%.*]] = zext <4 x i16> [[DOTCAST]] to <4 x i32>
+// CHECK-A64-NEXT: [[VSHLL_N_I:%.*]] = shl nuw <4 x i32> [[TMP0]], splat (i32 16)
+// CHECK-A64-NEXT: [[DOTCAST1:%.*]] = bitcast <4 x i32> [[VSHLL_N_I]] to <4 x float>
+// CHECK-A64-NEXT: ret <4 x float> [[DOTCAST1]]
//
// CHECK-A32-HARDFP-LABEL: @test_vcvt_f32_bf16(
// CHECK-A32-HARDFP-NEXT: entry:
-// CHECK-A32-HARDFP-NEXT: [[__REINT_808_I:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A32-HARDFP-NEXT: [[__REINT1_808_I:%.*]] = alloca <4 x i32>, align 8
-// CHECK-A32-HARDFP-NEXT: store <4 x bfloat> [[A:%.*]], ptr [[__REINT_808_I]], align 8
-// CHECK-A32-HARDFP-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__REINT_808_I]], align 8
-// CHECK-A32-HARDFP-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK-A32-HARDFP-NEXT: [[TMP2:%.*]] = zext <4 x i16> [[TMP0]] to <4 x i32>
-// CHECK-A32-HARDFP-NEXT: [[VSHLL_N_I:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 16)
-// CHECK-A32-HARDFP-NEXT: store <4 x i32> [[VSHLL_N_I]], ptr [[__REINT1_808_I]], align 8
-// CHECK-A32-HARDFP-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[__REINT1_808_I]], align 8
-// CHECK-A32-HARDFP-NEXT: ret <4 x float> [[TMP3]]
+// CHECK-A32-HARDFP-NEXT: [[DOTCAST:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <4 x i16>
+// CHECK-A32-HARDFP-NEXT: [[TMP0:%.*]] = zext <4 x i16> [[DOTCAST]] to <4 x i32>
+// CHECK-A32-HARDFP-NEXT: [[VSHLL_N_I:%.*]] = shl nuw <4 x i32> [[TMP0]], splat (i32 16)
+// CHECK-A32-HARDFP-NEXT: [[DOTCAST1:%.*]] = bitcast <4 x i32> [[VSHLL_N_I]] to <4 x float>
+// CHECK-A32-HARDFP-NEXT: ret <4 x float> [[DOTCAST1]]
//
// CHECK-A32-SOFTFP-LABEL: @test_vcvt_f32_bf16(
// CHECK-A32-SOFTFP-NEXT: entry:
-// CHECK-A32-SOFTFP-NEXT: [[__P0_808_I:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[__REINT_808_I:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[__REINT1_808_I:%.*]] = alloca <4 x i32>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[A:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[COERCE:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: store <2 x i32> [[A_COERCE:%.*]], ptr [[A]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[A1:%.*]] = load <4 x bfloat>, ptr [[A]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> [[A1]], ptr [[COERCE]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[COERCE]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <2 x i32> [[TMP0]], ptr [[__P0_808_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[__P0_8081_I:%.*]] = load <4 x bfloat>, ptr [[__P0_808_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> [[__P0_8081_I]], ptr [[__REINT_808_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[__REINT_808_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8>
-// CHECK-A32-SOFTFP-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
-// CHECK-A32-SOFTFP-NEXT: [[VSHLL_N_I:%.*]] = shl <4 x i32> [[TMP3]], splat (i32 16)
-// CHECK-A32-SOFTFP-NEXT: store <4 x i32> [[VSHLL_N_I]], ptr [[__REINT1_808_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP4:%.*]] = load <4 x float>, ptr [[__REINT1_808_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: ret <4 x float> [[TMP4]]
+// CHECK-A32-SOFTFP-NEXT: [[DOTCAST:%.*]] = bitcast <2 x i32> [[A_COERCE:%.*]] to <4 x i16>
+// CHECK-A32-SOFTFP-NEXT: [[TMP0:%.*]] = zext <4 x i16> [[DOTCAST]] to <4 x i32>
+// CHECK-A32-SOFTFP-NEXT: [[VSHLL_N_I:%.*]] = shl nuw <4 x i32> [[TMP0]], splat (i32 16)
+// CHECK-A32-SOFTFP-NEXT: [[DOTCAST1:%.*]] = bitcast <4 x i32> [[VSHLL_N_I]] to <4 x float>
+// CHECK-A32-SOFTFP-NEXT: ret <4 x float> [[DOTCAST1]]
//
float32x4_t test_vcvt_f32_bf16(bfloat16x4_t a) {
return vcvt_f32_bf16(a);
@@ -76,72 +52,31 @@ float32x4_t test_vcvt_f32_bf16(bfloat16x4_t a) {
// CHECK-A64-LABEL: @test_vcvtq_low_f32_bf16(
// CHECK-A64-NEXT: entry:
-// CHECK-A64-NEXT: [[__REINT_808_I_I:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A64-NEXT: [[__REINT1_808_I_I:%.*]] = alloca <4 x i32>, align 16
-// CHECK-A64-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[A:%.*]], <8 x bfloat> [[A]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK-A64-NEXT: store <4 x bfloat> [[SHUFFLE_I]], ptr [[__REINT_808_I_I]], align 8
-// CHECK-A64-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__REINT_808_I_I]], align 8
-// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK-A64-NEXT: [[TMP2:%.*]] = zext <4 x i16> [[TMP0]] to <4 x i32>
-// CHECK-A64-NEXT: [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 16)
-// CHECK-A64-NEXT: store <4 x i32> [[VSHLL_N_I_I]], ptr [[__REINT1_808_I_I]], align 16
-// CHECK-A64-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[__REINT1_808_I_I]], align 16
-// CHECK-A64-NEXT: ret <4 x float> [[TMP3]]
+// CHECK-A64-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[A:%.*]], <8 x bfloat> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-A64-NEXT: [[DOTCAST:%.*]] = bitcast <4 x bfloat> [[SHUFFLE_I]] to <4 x i16>
+// CHECK-A64-NEXT: [[TMP0:%.*]] = zext <4 x i16> [[DOTCAST]] to <4 x i32>
+// CHECK-A64-NEXT: [[VSHLL_N_I_I:%.*]] = shl nuw <4 x i32> [[TMP0]], splat (i32 16)
+// CHECK-A64-NEXT: [[DOTCAST1:%.*]] = bitcast <4 x i32> [[VSHLL_N_I_I]] to <4 x float>
+// CHECK-A64-NEXT: ret <4 x float> [[DOTCAST1]]
//
// CHECK-A32-HARDFP-LABEL: @test_vcvtq_low_f32_bf16(
// CHECK-A32-HARDFP-NEXT: entry:
-// CHECK-A32-HARDFP-NEXT: [[__REINT_808_I_I:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A32-HARDFP-NEXT: [[__REINT1_808_I_I:%.*]] = alloca <4 x i32>, align 8
-// CHECK-A32-HARDFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[A:%.*]], <8 x bfloat> [[A]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK-A32-HARDFP-NEXT: store <4 x bfloat> [[SHUFFLE_I]], ptr [[__REINT_808_I_I]], align 8
-// CHECK-A32-HARDFP-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__REINT_808_I_I]], align 8
-// CHECK-A32-HARDFP-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK-A32-HARDFP-NEXT: [[TMP2:%.*]] = zext <4 x i16> [[TMP0]] to <4 x i32>
-// CHECK-A32-HARDFP-NEXT: [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 16)
-// CHECK-A32-HARDFP-NEXT: store <4 x i32> [[VSHLL_N_I_I]], ptr [[__REINT1_808_I_I]], align 8
-// CHECK-A32-HARDFP-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[__REINT1_808_I_I]], align 8
-// CHECK-A32-HARDFP-NEXT: ret <4 x float> [[TMP3]]
+// CHECK-A32-HARDFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[A:%.*]], <8 x bfloat> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-A32-HARDFP-NEXT: [[DOTCAST:%.*]] = bitcast <4 x bfloat> [[SHUFFLE_I]] to <4 x i16>
+// CHECK-A32-HARDFP-NEXT: [[TMP0:%.*]] = zext <4 x i16> [[DOTCAST]] to <4 x i32>
+// CHECK-A32-HARDFP-NEXT: [[VSHLL_N_I_I:%.*]] = shl nuw <4 x i32> [[TMP0]], splat (i32 16)
+// CHECK-A32-HARDFP-NEXT: [[DOTCAST1:%.*]] = bitcast <4 x i32> [[VSHLL_N_I_I]] to <4 x float>
+// CHECK-A32-HARDFP-NEXT: ret <4 x float> [[DOTCAST1]]
//
// CHECK-A32-SOFTFP-LABEL: @test_vcvtq_low_f32_bf16(
// CHECK-A32-SOFTFP-NEXT: entry:
-// CHECK-A32-SOFTFP-NEXT: [[RETVAL_I:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[__P0_I2:%.*]] = alloca <8 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[__P0_808_I_I:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[__REINT_808_I_I:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[__REINT1_808_I_I:%.*]] = alloca <4 x i32>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[__P0_I:%.*]] = alloca <8 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[COERCE_I:%.*]] = alloca <8 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[COERCE2_I:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[COERCE3_I:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[A:%.*]] = alloca <8 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[COERCE:%.*]] = alloca <8 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: store <4 x i32> [[A_COERCE:%.*]], ptr [[A]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[A1:%.*]] = load <8 x bfloat>, ptr [[A]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <8 x bfloat> [[A1]], ptr [[COERCE]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[COERCE]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <4 x i32> [[TMP0]], ptr [[__P0_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[__P01_I:%.*]] = load <8 x bfloat>, ptr [[__P0_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <8 x bfloat> [[__P01_I]], ptr [[COERCE_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[COERCE_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <4 x i32> [[TMP1]], ptr [[__P0_I2]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[__P01_I5:%.*]] = load <8 x bfloat>, ptr [[__P0_I2]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[__P01_I5]], <8 x bfloat> [[__P01_I5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> [[SHUFFLE_I]], ptr [[RETVAL_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[RETVAL_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <2 x i32> [[TMP2]], ptr [[COERCE2_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP3:%.*]] = load <4 x bfloat>, ptr [[COERCE2_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> [[TMP3]], ptr [[COERCE3_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr [[COERCE3_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <2 x i32> [[TMP4]], ptr [[__P0_808_I_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[__P0_8081_I_I:%.*]] = load <4 x bfloat>, ptr [[__P0_808_I_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> [[__P0_8081_I_I]], ptr [[__REINT_808_I_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP5:%.*]] = load <4 x i16>, ptr [[__REINT_808_I_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
-// CHECK-A32-SOFTFP-NEXT: [[TMP7:%.*]] = zext <4 x i16> [[TMP5]] to <4 x i32>
-// CHECK-A32-SOFTFP-NEXT: [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP7]], splat (i32 16)
-// CHECK-A32-SOFTFP-NEXT: store <4 x i32> [[VSHLL_N_I_I]], ptr [[__REINT1_808_I_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP8:%.*]] = load <4 x float>, ptr [[__REINT1_808_I_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: ret <4 x float> [[TMP8]]
+// CHECK-A32-SOFTFP-NEXT: [[__P01_I5_CAST:%.*]] = bitcast <4 x i32> [[A_COERCE:%.*]] to <8 x bfloat>
+// CHECK-A32-SOFTFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[__P01_I5_CAST]], <8 x bfloat> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-A32-SOFTFP-NEXT: [[DOTCAST:%.*]] = bitcast <4 x bfloat> [[SHUFFLE_I]] to <4 x i16>
+// CHECK-A32-SOFTFP-NEXT: [[TMP0:%.*]] = zext <4 x i16> [[DOTCAST]] to <4 x i32>
+// CHECK-A32-SOFTFP-NEXT: [[VSHLL_N_I_I:%.*]] = shl nuw <4 x i32> [[TMP0]], splat (i32 16)
+// CHECK-A32-SOFTFP-NEXT: [[DOTCAST1:%.*]] = bitcast <4 x i32> [[VSHLL_N_I_I]] to <4 x float>
+// CHECK-A32-SOFTFP-NEXT: ret <4 x float> [[DOTCAST1]]
//
float32x4_t test_vcvtq_low_f32_bf16(bfloat16x8_t a) {
return vcvtq_low_f32_bf16(a);
@@ -149,72 +84,31 @@ float32x4_t test_vcvtq_low_f32_bf16(bfloat16x8_t a) {
// CHECK-A64-LABEL: @test_vcvtq_high_f32_bf16(
// CHECK-A64-NEXT: entry:
-// CHECK-A64-NEXT: [[__REINT_808_I_I:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A64-NEXT: [[__REINT1_808_I_I:%.*]] = alloca <4 x i32>, align 16
-// CHECK-A64-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[A:%.*]], <8 x bfloat> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-A64-NEXT: store <4 x bfloat> [[SHUFFLE_I]], ptr [[__REINT_808_I_I]], align 8
-// CHECK-A64-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__REINT_808_I_I]], align 8
-// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK-A64-NEXT: [[TMP2:%.*]] = zext <4 x i16> [[TMP0]] to <4 x i32>
-// CHECK-A64-NEXT: [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 16)
-// CHECK-A64-NEXT: store <4 x i32> [[VSHLL_N_I_I]], ptr [[__REINT1_808_I_I]], align 16
-// CHECK-A64-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[__REINT1_808_I_I]], align 16
-// CHECK-A64-NEXT: ret <4 x float> [[TMP3]]
+// CHECK-A64-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[A:%.*]], <8 x bfloat> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-A64-NEXT: [[DOTCAST:%.*]] = bitcast <4 x bfloat> [[SHUFFLE_I]] to <4 x i16>
+// CHECK-A64-NEXT: [[TMP0:%.*]] = zext <4 x i16> [[DOTCAST]] to <4 x i32>
+// CHECK-A64-NEXT: [[VSHLL_N_I_I:%.*]] = shl nuw <4 x i32> [[TMP0]], splat (i32 16)
+// CHECK-A64-NEXT: [[DOTCAST1:%.*]] = bitcast <4 x i32> [[VSHLL_N_I_I]] to <4 x float>
+// CHECK-A64-NEXT: ret <4 x float> [[DOTCAST1]]
//
// CHECK-A32-HARDFP-LABEL: @test_vcvtq_high_f32_bf16(
// CHECK-A32-HARDFP-NEXT: entry:
-// CHECK-A32-HARDFP-NEXT: [[__REINT_808_I_I:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A32-HARDFP-NEXT: [[__REINT1_808_I_I:%.*]] = alloca <4 x i32>, align 8
-// CHECK-A32-HARDFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[A:%.*]], <8 x bfloat> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-A32-HARDFP-NEXT: store <4 x bfloat> [[SHUFFLE_I]], ptr [[__REINT_808_I_I]], align 8
-// CHECK-A32-HARDFP-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__REINT_808_I_I]], align 8
-// CHECK-A32-HARDFP-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK-A32-HARDFP-NEXT: [[TMP2:%.*]] = zext <4 x i16> [[TMP0]] to <4 x i32>
-// CHECK-A32-HARDFP-NEXT: [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 16)
-// CHECK-A32-HARDFP-NEXT: store <4 x i32> [[VSHLL_N_I_I]], ptr [[__REINT1_808_I_I]], align 8
-// CHECK-A32-HARDFP-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[__REINT1_808_I_I]], align 8
-// CHECK-A32-HARDFP-NEXT: ret <4 x float> [[TMP3]]
+// CHECK-A32-HARDFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[A:%.*]], <8 x bfloat> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-A32-HARDFP-NEXT: [[DOTCAST:%.*]] = bitcast <4 x bfloat> [[SHUFFLE_I]] to <4 x i16>
+// CHECK-A32-HARDFP-NEXT: [[TMP0:%.*]] = zext <4 x i16> [[DOTCAST]] to <4 x i32>
+// CHECK-A32-HARDFP-NEXT: [[VSHLL_N_I_I:%.*]] = shl nuw <4 x i32> [[TMP0]], splat (i32 16)
+// CHECK-A32-HARDFP-NEXT: [[DOTCAST1:%.*]] = bitcast <4 x i32> [[VSHLL_N_I_I]] to <4 x float>
+// CHECK-A32-HARDFP-NEXT: ret <4 x float> [[DOTCAST1]]
//
// CHECK-A32-SOFTFP-LABEL: @test_vcvtq_high_f32_bf16(
// CHECK-A32-SOFTFP-NEXT: entry:
-// CHECK-A32-SOFTFP-NEXT: [[RETVAL_I:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[__P0_I2:%.*]] = alloca <8 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[__P0_808_I_I:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[__REINT_808_I_I:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[__REINT1_808_I_I:%.*]] = alloca <4 x i32>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[__P0_I:%.*]] = alloca <8 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[COERCE_I:%.*]] = alloca <8 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[COERCE2_I:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[COERCE3_I:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[A:%.*]] = alloca <8 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[COERCE:%.*]] = alloca <8 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: store <4 x i32> [[A_COERCE:%.*]], ptr [[A]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[A1:%.*]] = load <8 x bfloat>, ptr [[A]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <8 x bfloat> [[A1]], ptr [[COERCE]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[COERCE]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <4 x i32> [[TMP0]], ptr [[__P0_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[__P01_I:%.*]] = load <8 x bfloat>, ptr [[__P0_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <8 x bfloat> [[__P01_I]], ptr [[COERCE_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[COERCE_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <4 x i32> [[TMP1]], ptr [[__P0_I2]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[__P01_I5:%.*]] = load <8 x bfloat>, ptr [[__P0_I2]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[__P01_I5]], <8 x bfloat> [[__P01_I5]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> [[SHUFFLE_I]], ptr [[RETVAL_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[RETVAL_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <2 x i32> [[TMP2]], ptr [[COERCE2_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP3:%.*]] = load <4 x bfloat>, ptr [[COERCE2_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> [[TMP3]], ptr [[COERCE3_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr [[COERCE3_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <2 x i32> [[TMP4]], ptr [[__P0_808_I_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[__P0_8081_I_I:%.*]] = load <4 x bfloat>, ptr [[__P0_808_I_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> [[__P0_8081_I_I]], ptr [[__REINT_808_I_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP5:%.*]] = load <4 x i16>, ptr [[__REINT_808_I_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
-// CHECK-A32-SOFTFP-NEXT: [[TMP7:%.*]] = zext <4 x i16> [[TMP5]] to <4 x i32>
-// CHECK-A32-SOFTFP-NEXT: [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP7]], splat (i32 16)
-// CHECK-A32-SOFTFP-NEXT: store <4 x i32> [[VSHLL_N_I_I]], ptr [[__REINT1_808_I_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP8:%.*]] = load <4 x float>, ptr [[__REINT1_808_I_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: ret <4 x float> [[TMP8]]
+// CHECK-A32-SOFTFP-NEXT: [[__P01_I5_CAST:%.*]] = bitcast <4 x i32> [[A_COERCE:%.*]] to <8 x bfloat>
+// CHECK-A32-SOFTFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[__P01_I5_CAST]], <8 x bfloat> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-A32-SOFTFP-NEXT: [[DOTCAST:%.*]] = bitcast <4 x bfloat> [[SHUFFLE_I]] to <4 x i16>
+// CHECK-A32-SOFTFP-NEXT: [[TMP0:%.*]] = zext <4 x i16> [[DOTCAST]] to <4 x i32>
+// CHECK-A32-SOFTFP-NEXT: [[VSHLL_N_I_I:%.*]] = shl nuw <4 x i32> [[TMP0]], splat (i32 16)
+// CHECK-A32-SOFTFP-NEXT: [[DOTCAST1:%.*]] = bitcast <4 x i32> [[VSHLL_N_I_I]] to <4 x float>
+// CHECK-A32-SOFTFP-NEXT: ret <4 x float> [[DOTCAST1]]
//
float32x4_t test_vcvtq_high_f32_bf16(bfloat16x8_t a) {
return vcvtq_high_f32_bf16(a);
@@ -222,39 +116,20 @@ float32x4_t test_vcvtq_high_f32_bf16(bfloat16x8_t a) {
// CHECK-A64-LABEL: @test_vcvt_bf16_f32(
// CHECK-A64-NEXT: entry:
-// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-A64-NEXT: [[__A64_VCVTQ_LOW_BF16_F321_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.bfcvtn(<4 x float> [[A]])
-// CHECK-A64-NEXT: [[__A64_VCVTQ_LOW_BF16_F322_I:%.*]] = bitcast <8 x bfloat> [[__A64_VCVTQ_LOW_BF16_F321_I]] to <16 x i8>
-// CHECK-A64-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[__A64_VCVTQ_LOW_BF16_F321_I]], <8 x bfloat> [[__A64_VCVTQ_LOW_BF16_F321_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-A64-NEXT: [[__A64_VCVTQ_LOW_BF16_F321_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.bfcvtn(<4 x float> [[A:%.*]])
+// CHECK-A64-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[__A64_VCVTQ_LOW_BF16_F321_I]], <8 x bfloat> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK-A64-NEXT: ret <4 x bfloat> [[SHUFFLE_I]]
//
// CHECK-A32-HARDFP-LABEL: @test_vcvt_bf16_f32(
// CHECK-A32-HARDFP-NEXT: entry:
-// CHECK-A32-HARDFP-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-A32-HARDFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x bfloat> @llvm.arm.neon.vcvtfp2bf.v4bf16(<4 x float> [[A]])
+// CHECK-A32-HARDFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x bfloat> @llvm.arm.neon.vcvtfp2bf.v4bf16(<4 x float> [[A:%.*]])
// CHECK-A32-HARDFP-NEXT: ret <4 x bfloat> [[VCVTFP2BF1_I]]
//
// CHECK-A32-SOFTFP-LABEL: @test_vcvt_bf16_f32(
// CHECK-A32-SOFTFP-NEXT: entry:
-// CHECK-A32-SOFTFP-NEXT: [[RETVAL_I1:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[RETVAL_I:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[COERCE_I:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[RETVAL:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[COERCE:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-A32-SOFTFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16(<4 x float> [[A]])
-// CHECK-A32-SOFTFP-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VCVTFP2BF1_I]] to <4 x bfloat>
-// CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> [[TMP1]], ptr [[RETVAL_I1]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[RETVAL_I1]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <2 x i32> [[TMP2]], ptr [[COERCE_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP3:%.*]] = load <4 x bfloat>, ptr [[COERCE_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> [[TMP3]], ptr [[RETVAL_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr [[RETVAL_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <2 x i32> [[TMP4]], ptr [[COERCE]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP5:%.*]] = load <4 x bfloat>, ptr [[COERCE]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> [[TMP5]], ptr [[RETVAL]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr [[RETVAL]], align 8
-// CHECK-A32-SOFTFP-NEXT: ret <2 x i32> [[TMP6]]
+// CHECK-A32-SOFTFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16(<4 x float> [[A:%.*]])
+// CHECK-A32-SOFTFP-NEXT: [[DOTCAST:%.*]] = bitcast <4 x i16> [[VCVTFP2BF1_I]] to <2 x i32>
+// CHECK-A32-SOFTFP-NEXT: ret <2 x i32> [[DOTCAST]]
//
bfloat16x4_t test_vcvt_bf16_f32(float32x4_t a) {
return vcvt_bf16_f32(a);
@@ -262,58 +137,22 @@ bfloat16x4_t test_vcvt_bf16_f32(float32x4_t a) {
// CHECK-A64-LABEL: @test_vcvtq_low_bf16_f32(
// CHECK-A64-NEXT: entry:
-// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-A64-NEXT: [[__A64_VCVTQ_LOW_BF16_F321_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.bfcvtn(<4 x float> [[A]])
-// CHECK-A64-NEXT: [[__A64_VCVTQ_LOW_BF16_F322_I:%.*]] = bitcast <8 x bfloat> [[__A64_VCVTQ_LOW_BF16_F321_I]] to <16 x i8>
+// CHECK-A64-NEXT: [[__A64_VCVTQ_LOW_BF16_F321_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.bfcvtn(<4 x float> [[A:%.*]])
// CHECK-A64-NEXT: ret <8 x bfloat> [[__A64_VCVTQ_LOW_BF16_F321_I]]
//
// CHECK-A32-HARDFP-LABEL: @test_vcvtq_low_bf16_f32(
// CHECK-A32-HARDFP-NEXT: entry:
-// CHECK-A32-HARDFP-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-A32-HARDFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x bfloat> @llvm.arm.neon.vcvtfp2bf.v4bf16(<4 x float> [[A]])
+// CHECK-A32-HARDFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x bfloat> @llvm.arm.neon.vcvtfp2bf.v4bf16(<4 x float> [[A:%.*]])
// CHECK-A32-HARDFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x bfloat> zeroinitializer, <4 x bfloat> [[VCVTFP2BF1_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
// CHECK-A32-HARDFP-NEXT: ret <8 x bfloat> [[SHUFFLE_I]]
//
// CHECK-A32-SOFTFP-LABEL: @test_vcvtq_low_bf16_f32(
// CHECK-A32-SOFTFP-NEXT: entry:
-// CHECK-A32-SOFTFP-NEXT: [[RETVAL_I4:%.*]] = alloca <8 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[__P0_I:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[__P1_I:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[RETVAL_I1:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[RETVAL_I:%.*]] = alloca <8 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[COERCE_I:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[COERCE1_I:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[COERCE2_I:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[COERCE4_I:%.*]] = alloca <8 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[RETVAL:%.*]] = alloca <8 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[COERCE:%.*]] = alloca <8 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-A32-SOFTFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16(<4 x float> [[A]])
-// CHECK-A32-SOFTFP-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VCVTFP2BF1_I]] to <4 x bfloat>
-// CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> [[TMP1]], ptr [[RETVAL_I1]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[RETVAL_I1]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <2 x i32> [[TMP2]], ptr [[COERCE_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP3:%.*]] = load <4 x bfloat>, ptr [[COERCE_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> zeroinitializer, ptr [[COERCE1_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr [[COERCE1_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> [[TMP3]], ptr [[COERCE2_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr [[COERCE2_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <2 x i32> [[TMP4]], ptr [[__P0_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[__P01_I:%.*]] = load <4 x bfloat>, ptr [[__P0_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <2 x i32> [[TMP5]], ptr [[__P1_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[__P12_I:%.*]] = load <4 x bfloat>, ptr [[__P1_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x bfloat> [[__P01_I]], <4 x bfloat> [[__P12_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-// CHECK-A32-SOFTFP-NEXT: store <8 x bfloat> [[SHUFFLE_I]], ptr [[RETVAL_I4]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr [[RETVAL_I4]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <4 x i32> [[TMP6]], ptr [[COERCE4_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP7:%.*]] = load <8 x bfloat>, ptr [[COERCE4_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <8 x bfloat> [[TMP7]], ptr [[RETVAL_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP8:%.*]] = load <4 x i32>, ptr [[RETVAL_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <4 x i32> [[TMP8]], ptr [[COERCE]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP9:%.*]] = load <8 x bfloat>, ptr [[COERCE]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <8 x bfloat> [[TMP9]], ptr [[RETVAL]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP10:%.*]] = load <4 x i32>, ptr [[RETVAL]], align 8
-// CHECK-A32-SOFTFP-NEXT: ret <4 x i32> [[TMP10]]
+// CHECK-A32-SOFTFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16(<4 x float> [[A:%.*]])
+// CHECK-A32-SOFTFP-NEXT: [[__P12_I_CAST:%.*]] = bitcast <4 x i16> [[VCVTFP2BF1_I]] to <4 x bfloat>
+// CHECK-A32-SOFTFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x bfloat> zeroinitializer, <4 x bfloat> [[__P12_I_CAST]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-A32-SOFTFP-NEXT: [[DOTCAST:%.*]] = bitcast <8 x bfloat> [[SHUFFLE_I]] to <4 x i32>
+// CHECK-A32-SOFTFP-NEXT: ret <4 x i32> [[DOTCAST]]
//
bfloat16x8_t test_vcvtq_low_bf16_f32(float32x4_t a) {
return vcvtq_low_bf16_f32(a);
@@ -321,82 +160,25 @@ bfloat16x8_t test_vcvtq_low_bf16_f32(float32x4_t a) {
// CHECK-A64-LABEL: @test_vcvtq_high_bf16_f32(
// CHECK-A64-NEXT: entry:
-// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[INACTIVE:%.*]] to <16 x i8>
-// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-A64-NEXT: [[VCVTQ_HIGH_BF16_F322_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.bfcvtn2(<8 x bfloat> [[INACTIVE]], <4 x float> [[A]])
-// CHECK-A64-NEXT: [[VCVTQ_HIGH_BF16_F323_I:%.*]] = bitcast <8 x bfloat> [[VCVTQ_HIGH_BF16_F322_I]] to <16 x i8>
+// CHECK-A64-NEXT: [[VCVTQ_HIGH_BF16_F322_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.bfcvtn2(<8 x bfloat> [[INACTIVE:%.*]], <4 x float> [[A:%.*]])
// CHECK-A64-NEXT: ret <8 x bfloat> [[VCVTQ_HIGH_BF16_F322_I]]
//
// CHECK-A32-HARDFP-LABEL: @test_vcvtq_high_bf16_f32(
// CHECK-A32-HARDFP-NEXT: entry:
-// CHECK-A32-HARDFP-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-A32-HARDFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x bfloat> @llvm.arm.neon.vcvtfp2bf.v4bf16(<4 x float> [[A]])
-// CHECK-A32-HARDFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[INACTIVE:%.*]], <8 x bfloat> [[INACTIVE]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-A32-HARDFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x bfloat> @llvm.arm.neon.vcvtfp2bf.v4bf16(<4 x float> [[A:%.*]])
+// CHECK-A32-HARDFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[INACTIVE:%.*]], <8 x bfloat> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK-A32-HARDFP-NEXT: [[SHUFFLE_I8:%.*]] = shufflevector <4 x bfloat> [[VCVTFP2BF1_I]], <4 x bfloat> [[SHUFFLE_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
// CHECK-A32-HARDFP-NEXT: ret <8 x bfloat> [[SHUFFLE_I8]]
//
// CHECK-A32-SOFTFP-LABEL: @test_vcvtq_high_bf16_f32(
// CHECK-A32-SOFTFP-NEXT: entry:
-// CHECK-A32-SOFTFP-NEXT: [[RETVAL_I11:%.*]] = alloca <8 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[__P0_I12:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[__P1_I:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[RETVAL_I8:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[RETVAL_I3:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[__P0_I4:%.*]] = alloca <8 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[RETVAL_I:%.*]] = alloca <8 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[__P0_I:%.*]] = alloca <8 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[COERCE_I:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[COERCE2_I:%.*]] = alloca <8 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[COERCE4_I:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[COERCE5_I:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[COERCE6_I:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[COERCE8_I:%.*]] = alloca <8 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[RETVAL:%.*]] = alloca <8 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[INACTIVE:%.*]] = alloca <8 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[COERCE:%.*]] = alloca <8 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[COERCE2:%.*]] = alloca <8 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: store <4 x i32> [[INACTIVE_COERCE:%.*]], ptr [[INACTIVE]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[INACTIVE1:%.*]] = load <8 x bfloat>, ptr [[INACTIVE]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <8 x bfloat> [[INACTIVE1]], ptr [[COERCE]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[COERCE]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <4 x i32> [[TMP0]], ptr [[__P0_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[__P01_I:%.*]] = load <8 x bfloat>, ptr [[__P0_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-A32-SOFTFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16(<4 x float> [[A]])
-// CHECK-A32-SOFTFP-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[VCVTFP2BF1_I]] to <4 x bfloat>
-// CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> [[TMP2]], ptr [[RETVAL_I8]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr [[RETVAL_I8]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <2 x i32> [[TMP3]], ptr [[COERCE_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP4:%.*]] = load <4 x bfloat>, ptr [[COERCE_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <8 x bfloat> [[__P01_I]], ptr [[COERCE2_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr [[COERCE2_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <4 x i32> [[TMP5]], ptr [[__P0_I4]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[__P01_I7:%.*]] = load <8 x bfloat>, ptr [[__P0_I4]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[__P01_I7]], <8 x bfloat> [[__P01_I7]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> [[SHUFFLE_I]], ptr [[RETVAL_I3]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr [[RETVAL_I3]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <2 x i32> [[TMP6]], ptr [[COERCE4_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP7:%.*]] = load <4 x bfloat>, ptr [[COERCE4_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> [[TMP4]], ptr [[COERCE5_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP8:%.*]] = load <2 x i32>, ptr [[COERCE5_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> [[TMP7]], ptr [[COERCE6_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP9:%.*]] = load <2 x i32>, ptr [[COERCE6_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <2 x i32> [[TMP8]], ptr [[__P0_I12]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[__P01_I16:%.*]] = load <4 x bfloat>, ptr [[__P0_I12]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <2 x i32> [[TMP9]], ptr [[__P1_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[__P12_I:%.*]] = load <4 x bfloat>, ptr [[__P1_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[SHUFFLE_I17:%.*]] = shufflevector <4 x bfloat> [[__P01_I16]], <4 x bfloat> [[__P12_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-// CHECK-A32-SOFTFP-NEXT: store <8 x bfloat> [[SHUFFLE_I17]], ptr [[RETVAL_I11]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP10:%.*]] = load <4 x i32>, ptr [[RETVAL_I11]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <4 x i32> [[TMP10]], ptr [[COERCE8_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP11:%.*]] = load <8 x bfloat>, ptr [[COERCE8_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <8 x bfloat> [[TMP11]], ptr [[RETVAL_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP12:%.*]] = load <4 x i32>, ptr [[RETVAL_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <4 x i32> [[TMP12]], ptr [[COERCE2]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP13:%.*]] = load <8 x bfloat>, ptr [[COERCE2]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <8 x bfloat> [[TMP13]], ptr [[RETVAL]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP14:%.*]] = load <4 x i32>, ptr [[RETVAL]], align 8
-// CHECK-A32-SOFTFP-NEXT: ret <4 x i32> [[TMP14]]
+// CHECK-A32-SOFTFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16(<4 x float> [[A:%.*]])
+// CHECK-A32-SOFTFP-NEXT: [[__P01_I7_CAST:%.*]] = bitcast <4 x i32> [[INACTIVE_COERCE:%.*]] to <8 x bfloat>
+// CHECK-A32-SOFTFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[__P01_I7_CAST]], <8 x bfloat> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-A32-SOFTFP-NEXT: [[__P01_I16_CAST:%.*]] = bitcast <4 x i16> [[VCVTFP2BF1_I]] to <4 x bfloat>
+// CHECK-A32-SOFTFP-NEXT: [[SHUFFLE_I17:%.*]] = shufflevector <4 x bfloat> [[__P01_I16_CAST]], <4 x bfloat> [[SHUFFLE_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-A32-SOFTFP-NEXT: [[DOTCAST:%.*]] = bitcast <8 x bfloat> [[SHUFFLE_I17]] to <4 x i32>
+// CHECK-A32-SOFTFP-NEXT: ret <4 x i32> [[DOTCAST]]
//
bfloat16x8_t test_vcvtq_high_bf16_f32(bfloat16x8_t inactive, float32x4_t a) {
return vcvtq_high_bf16_f32(inactive, a);
@@ -423,15 +205,11 @@ bfloat16_t test_vcvth_bf16_f32(float32_t a) {
// CHECK-LABEL: @test_vcvtah_f32_bf16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[__REINT_I:%.*]] = alloca bfloat, align 2
-// CHECK-NEXT: [[__REINT1_I:%.*]] = alloca i32, align 4
-// CHECK-NEXT: store bfloat [[A:%.*]], ptr [[__REINT_I]], align 2
-// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[__REINT_I]], align 2
-// CHECK-NEXT: [[CONV_I:%.*]] = zext i16 [[TMP0]] to i32
-// CHECK-NEXT: [[SHL_I:%.*]] = shl i32 [[CONV_I]], 16
-// CHECK-NEXT: store i32 [[SHL_I]], ptr [[__REINT1_I]], align 4
-// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[__REINT1_I]], align 4
-// CHECK-NEXT: ret float [[TMP1]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast bfloat [[A:%.*]] to i16
+// CHECK-NEXT: [[CONV_I:%.*]] = zext i16 [[DOTCAST]] to i32
+// CHECK-NEXT: [[SHL_I:%.*]] = shl nuw i32 [[CONV_I]], 16
+// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast i32 [[SHL_I]] to float
+// CHECK-NEXT: ret float [[DOTCAST1]]
//
float32_t test_vcvtah_f32_bf16(bfloat16_t a) {
return vcvtah_f32_bf16(a);
diff --git a/clang/test/CodeGen/arm-bf16-dotprod-intrinsics.c b/clang/test/CodeGen/arm-bf16-dotprod-intrinsics.c
index c4f0b78fc6a575..b357042843162c 100644
--- a/clang/test/CodeGen/arm-bf16-dotprod-intrinsics.c
+++ b/clang/test/CodeGen/arm-bf16-dotprod-intrinsics.c
@@ -2,11 +2,11 @@
// RUN: %clang_cc1 -triple armv8-arm-none-eabi \
// RUN: -target-feature +neon -target-feature +bf16 -mfloat-abi soft \
// RUN: -disable-O0-optnone -emit-llvm -o - %s \
-// RUN: | opt -S -passes=mem2reg | FileCheck %s
+// RUN: | opt -S -passes=mem2reg,instcombine | FileCheck %s
// RUN: %clang_cc1 -triple armv8-arm-none-eabi \
// RUN: -target-feature +neon -target-feature +bf16 -mfloat-abi hard \
// RUN: -disable-O0-optnone -emit-llvm -o - %s \
-// RUN: | opt -S -passes=mem2reg | FileCheck %s
+// RUN: | opt -S -passes=mem2reg,instcombine | FileCheck %s
// REQUIRES: aarch64-registered-target || arm-registered-target
@@ -14,10 +14,7 @@
// CHECK-LABEL: @test_vbfdot_f32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[R:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x bfloat> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float> [[R]], <4 x bfloat> [[A]], <4 x bfloat> [[B]])
+// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float> [[R:%.*]], <4 x bfloat> [[A:%.*]], <4 x bfloat> [[B:%.*]])
// CHECK-NEXT: ret <2 x float> [[VBFDOT3_I]]
//
float32x2_t test_vbfdot_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b) {
@@ -26,10 +23,7 @@ float32x2_t test_vbfdot_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b) {
// CHECK-LABEL: @test_vbfdotq_f32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <16 x i8>
-// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[B]])
+// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[B:%.*]])
// CHECK-NEXT: ret <4 x float> [[VBFDOT3_I]]
//
float32x4_t test_vbfdotq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b){
@@ -38,19 +32,10 @@ float32x4_t test_vbfdotq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b){
// CHECK-LABEL: @test_vbfdot_lane_f32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[__REINT_128:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-NEXT: [[__REINT1_128:%.*]] = alloca <2 x float>, align 8
-// CHECK-NEXT: store <4 x bfloat> [[B:%.*]], ptr [[__REINT_128]], align 8
-// CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[__REINT_128]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[TMP0]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP2]], <2 x i32> zeroinitializer
-// CHECK-NEXT: store <2 x float> [[LANE]], ptr [[__REINT1_128]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = load <4 x bfloat>, ptr [[__REINT1_128]], align 8
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x float> [[R:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x bfloat> [[TMP3]] to <8 x i8>
-// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float> [[R]], <4 x bfloat> [[A]], <4 x bfloat> [[TMP3]])
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x bfloat> [[B:%.*]] to <2 x float>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[DOTCAST]], <2 x float> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <2 x float> [[LANE]] to <4 x bfloat>
+// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float> [[R:%.*]], <4 x bfloat> [[A:%.*]], <4 x bfloat> [[DOTCAST1]])
// CHECK-NEXT: ret <2 x float> [[VBFDOT3_I]]
//
float32x2_t test_vbfdot_lane_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b){
@@ -59,19 +44,10 @@ float32x2_t test_vbfdot_lane_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b){
// CHECK-LABEL: @test_vbfdotq_laneq_f32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[__REINT_130:%.*]] = alloca <8 x bfloat>, align 8
-// CHECK-NEXT: [[__REINT1_130:%.*]] = alloca <4 x float>, align 8
-// CHECK-NEXT: store <8 x bfloat> [[B:%.*]], ptr [[__REINT_130]], align 8
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[__REINT_130]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[TMP0]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP2]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT: store <4 x float> [[LANE]], ptr [[__REINT1_130]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = load <8 x bfloat>, ptr [[__REINT1_130]], align 8
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x bfloat> [[TMP3]] to <16 x i8>
-// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[TMP3]])
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <4 x float>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[DOTCAST]], <4 x float> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <4 x float> [[LANE]] to <8 x bfloat>
+// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[DOTCAST1]])
// CHECK-NEXT: ret <4 x float> [[VBFDOT3_I]]
//
float32x4_t test_vbfdotq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
@@ -80,19 +56,10 @@ float32x4_t test_vbfdotq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b
// CHECK-LABEL: @test_vbfdot_laneq_f32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[__REINT_132:%.*]] = alloca <8 x bfloat>, align 8
-// CHECK-NEXT: [[__REINT1_132:%.*]] = alloca <2 x float>, align 8
-// CHECK-NEXT: store <8 x bfloat> [[B:%.*]], ptr [[__REINT_132]], align 8
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[__REINT_132]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[TMP0]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP2]], <2 x i32> <i32 3, i32 3>
-// CHECK-NEXT: store <2 x float> [[LANE]], ptr [[__REINT1_132]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = load <4 x bfloat>, ptr [[__REINT1_132]], align 8
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x float> [[R:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x bfloat> [[TMP3]] to <8 x i8>
-// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float> [[R]], <4 x bfloat> [[A]], <4 x bfloat> [[TMP3]])
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <4 x float>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[DOTCAST]], <4 x float> poison, <2 x i32> <i32 3, i32 3>
+// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <2 x float> [[LANE]] to <4 x bfloat>
+// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float> [[R:%.*]], <4 x bfloat> [[A:%.*]], <4 x bfloat> [[DOTCAST1]])
// CHECK-NEXT: ret <2 x float> [[VBFDOT3_I]]
//
float32x2_t test_vbfdot_laneq_f32(float32x2_t r, bfloat16x4_t a, bfloat16x8_t b) {
@@ -101,19 +68,10 @@ float32x2_t test_vbfdot_laneq_f32(float32x2_t r, bfloat16x4_t a, bfloat16x8_t b)
// CHECK-LABEL: @test_vbfdotq_lane_f32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[__REINT_126:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-NEXT: [[__REINT1_126:%.*]] = alloca <4 x float>, align 8
-// CHECK-NEXT: store <4 x bfloat> [[B:%.*]], ptr [[__REINT_126]], align 8
-// CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[__REINT_126]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[TMP0]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP2]], <4 x i32> zeroinitializer
-// CHECK-NEXT: store <4 x float> [[LANE]], ptr [[__REINT1_126]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = load <8 x bfloat>, ptr [[__REINT1_126]], align 8
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x bfloat> [[TMP3]] to <16 x i8>
-// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[TMP3]])
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x bfloat> [[B:%.*]] to <2 x float>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[DOTCAST]], <2 x float> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <4 x float> [[LANE]] to <8 x bfloat>
+// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[DOTCAST1]])
// CHECK-NEXT: ret <4 x float> [[VBFDOT3_I]]
//
float32x4_t test_vbfdotq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) {
@@ -122,11 +80,7 @@ float32x4_t test_vbfdotq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
// CHECK-LABEL: @test_vbfmmlaq_f32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <16 x i8>
-// CHECK-NEXT: [[VBFMMLAQ_F323_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmmla(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[B]])
-// CHECK-NEXT: [[VBFMMLAQ_F324_I:%.*]] = bitcast <4 x float> [[VBFMMLAQ_F323_I]] to <16 x i8>
+// CHECK-NEXT: [[VBFMMLAQ_F323_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmmla(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[B:%.*]])
// CHECK-NEXT: ret <4 x float> [[VBFMMLAQ_F323_I]]
//
float32x4_t test_vbfmmlaq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
@@ -135,11 +89,7 @@ float32x4_t test_vbfmmlaq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
// CHECK-LABEL: @test_vbfmlalbq_f32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <16 x i8>
-// CHECK-NEXT: [[VBFMLALBQ_F323_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalb(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[B]])
-// CHECK-NEXT: [[VBFMLALBQ_F324_I:%.*]] = bitcast <4 x float> [[VBFMLALBQ_F323_I]] to <16 x i8>
+// CHECK-NEXT: [[VBFMLALBQ_F323_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalb(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[B:%.*]])
// CHECK-NEXT: ret <4 x float> [[VBFMLALBQ_F323_I]]
//
float32x4_t test_vbfmlalbq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
@@ -148,11 +98,7 @@ float32x4_t test_vbfmlalbq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
// CHECK-LABEL: @test_vbfmlaltq_f32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <16 x i8>
-// CHECK-NEXT: [[VBFMLALTQ_F323_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[B]])
-// CHECK-NEXT: [[VBFMLALTQ_F324_I:%.*]] = bitcast <4 x float> [[VBFMLALTQ_F323_I]] to <16 x i8>
+// CHECK-NEXT: [[VBFMLALTQ_F323_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[B:%.*]])
// CHECK-NEXT: ret <4 x float> [[VBFMLALTQ_F323_I]]
//
float32x4_t test_vbfmlaltq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
@@ -161,27 +107,8 @@ float32x4_t test_vbfmlaltq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
// CHECK-LABEL: @test_vbfmlalbq_lane_f32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x bfloat> [[B:%.*]], i32 0
-// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x bfloat> poison, bfloat [[VGET_LANE]], i32 0
-// CHECK-NEXT: [[VGET_LANE3:%.*]] = extractelement <4 x bfloat> [[B]], i32 0
-// CHECK-NEXT: [[VECINIT5:%.*]] = insertelement <8 x bfloat> [[VECINIT]], bfloat [[VGET_LANE3]], i32 1
-// CHECK-NEXT: [[VGET_LANE8:%.*]] = extractelement <4 x bfloat> [[B]], i32 0
-// CHECK-NEXT: [[VECINIT10:%.*]] = insertelement <8 x bfloat> [[VECINIT5]], bfloat [[VGET_LANE8]], i32 2
-// CHECK-NEXT: [[VGET_LANE13:%.*]] = extractelement <4 x bfloat> [[B]], i32 0
-// CHECK-NEXT: [[VECINIT15:%.*]] = insertelement <8 x bfloat> [[VECINIT10]], bfloat [[VGET_LANE13]], i32 3
-// CHECK-NEXT: [[VGET_LANE18:%.*]] = extractelement <4 x bfloat> [[B]], i32 0
-// CHECK-NEXT: [[VECINIT20:%.*]] = insertelement <8 x bfloat> [[VECINIT15]], bfloat [[VGET_LANE18]], i32 4
-// CHECK-NEXT: [[VGET_LANE23:%.*]] = extractelement <4 x bfloat> [[B]], i32 0
-// CHECK-NEXT: [[VECINIT25:%.*]] = insertelement <8 x bfloat> [[VECINIT20]], bfloat [[VGET_LANE23]], i32 5
-// CHECK-NEXT: [[VGET_LANE28:%.*]] = extractelement <4 x bfloat> [[B]], i32 0
-// CHECK-NEXT: [[VECINIT30:%.*]] = insertelement <8 x bfloat> [[VECINIT25]], bfloat [[VGET_LANE28]], i32 6
-// CHECK-NEXT: [[VGET_LANE33:%.*]] = extractelement <4 x bfloat> [[B]], i32 0
-// CHECK-NEXT: [[VECINIT35:%.*]] = insertelement <8 x bfloat> [[VECINIT30]], bfloat [[VGET_LANE33]], i32 7
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[VECINIT35]] to <16 x i8>
-// CHECK-NEXT: [[VBFMLALBQ_F323_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalb(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[VECINIT35]])
-// CHECK-NEXT: [[VBFMLALBQ_F324_I:%.*]] = bitcast <4 x float> [[VBFMLALBQ_F323_I]] to <16 x i8>
+// CHECK-NEXT: [[VECINIT35:%.*]] = shufflevector <4 x bfloat> [[B:%.*]], <4 x bfloat> poison, <8 x i32> zeroinitializer
+// CHECK-NEXT: [[VBFMLALBQ_F323_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalb(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[VECINIT35]])
// CHECK-NEXT: ret <4 x float> [[VBFMLALBQ_F323_I]]
//
float32x4_t test_vbfmlalbq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) {
@@ -190,27 +117,8 @@ float32x4_t test_vbfmlalbq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t
// CHECK-LABEL: @test_vbfmlalbq_laneq_f32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <8 x bfloat> [[B:%.*]], i32 3
-// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x bfloat> poison, bfloat [[VGET_LANE]], i32 0
-// CHECK-NEXT: [[VGET_LANE3:%.*]] = extractelement <8 x bfloat> [[B]], i32 3
-// CHECK-NEXT: [[VECINIT5:%.*]] = insertelement <8 x bfloat> [[VECINIT]], bfloat [[VGET_LANE3]], i32 1
-// CHECK-NEXT: [[VGET_LANE8:%.*]] = extractelement <8 x bfloat> [[B]], i32 3
-// CHECK-NEXT: [[VECINIT10:%.*]] = insertelement <8 x bfloat> [[VECINIT5]], bfloat [[VGET_LANE8]], i32 2
-// CHECK-NEXT: [[VGET_LANE13:%.*]] = extractelement <8 x bfloat> [[B]], i32 3
-// CHECK-NEXT: [[VECINIT15:%.*]] = insertelement <8 x bfloat> [[VECINIT10]], bfloat [[VGET_LANE13]], i32 3
-// CHECK-NEXT: [[VGET_LANE18:%.*]] = extractelement <8 x bfloat> [[B]], i32 3
-// CHECK-NEXT: [[VECINIT20:%.*]] = insertelement <8 x bfloat> [[VECINIT15]], bfloat [[VGET_LANE18]], i32 4
-// CHECK-NEXT: [[VGET_LANE23:%.*]] = extractelement <8 x bfloat> [[B]], i32 3
-// CHECK-NEXT: [[VECINIT25:%.*]] = insertelement <8 x bfloat> [[VECINIT20]], bfloat [[VGET_LANE23]], i32 5
-// CHECK-NEXT: [[VGET_LANE28:%.*]] = extractelement <8 x bfloat> [[B]], i32 3
-// CHECK-NEXT: [[VECINIT30:%.*]] = insertelement <8 x bfloat> [[VECINIT25]], bfloat [[VGET_LANE28]], i32 6
-// CHECK-NEXT: [[VGET_LANE33:%.*]] = extractelement <8 x bfloat> [[B]], i32 3
-// CHECK-NEXT: [[VECINIT35:%.*]] = insertelement <8 x bfloat> [[VECINIT30]], bfloat [[VGET_LANE33]], i32 7
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[VECINIT35]] to <16 x i8>
-// CHECK-NEXT: [[VBFMLALBQ_F323_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalb(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[VECINIT35]])
-// CHECK-NEXT: [[VBFMLALBQ_F324_I:%.*]] = bitcast <4 x float> [[VBFMLALBQ_F323_I]] to <16 x i8>
+// CHECK-NEXT: [[VECINIT35:%.*]] = shufflevector <8 x bfloat> [[B:%.*]], <8 x bfloat> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[VBFMLALBQ_F323_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalb(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[VECINIT35]])
// CHECK-NEXT: ret <4 x float> [[VBFMLALBQ_F323_I]]
//
float32x4_t test_vbfmlalbq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
@@ -219,27 +127,8 @@ float32x4_t test_vbfmlalbq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t
// CHECK-LABEL: @test_vbfmlaltq_lane_f32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x bfloat> [[B:%.*]], i32 0
-// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x bfloat> poison, bfloat [[VGET_LANE]], i32 0
-// CHECK-NEXT: [[VGET_LANE3:%.*]] = extractelement <4 x bfloat> [[B]], i32 0
-// CHECK-NEXT: [[VECINIT5:%.*]] = insertelement <8 x bfloat> [[VECINIT]], bfloat [[VGET_LANE3]], i32 1
-// CHECK-NEXT: [[VGET_LANE8:%.*]] = extractelement <4 x bfloat> [[B]], i32 0
-// CHECK-NEXT: [[VECINIT10:%.*]] = insertelement <8 x bfloat> [[VECINIT5]], bfloat [[VGET_LANE8]], i32 2
-// CHECK-NEXT: [[VGET_LANE13:%.*]] = extractelement <4 x bfloat> [[B]], i32 0
-// CHECK-NEXT: [[VECINIT15:%.*]] = insertelement <8 x bfloat> [[VECINIT10]], bfloat [[VGET_LANE13]], i32 3
-// CHECK-NEXT: [[VGET_LANE18:%.*]] = extractelement <4 x bfloat> [[B]], i32 0
-// CHECK-NEXT: [[VECINIT20:%.*]] = insertelement <8 x bfloat> [[VECINIT15]], bfloat [[VGET_LANE18]], i32 4
-// CHECK-NEXT: [[VGET_LANE23:%.*]] = extractelement <4 x bfloat> [[B]], i32 0
-// CHECK-NEXT: [[VECINIT25:%.*]] = insertelement <8 x bfloat> [[VECINIT20]], bfloat [[VGET_LANE23]], i32 5
-// CHECK-NEXT: [[VGET_LANE28:%.*]] = extractelement <4 x bfloat> [[B]], i32 0
-// CHECK-NEXT: [[VECINIT30:%.*]] = insertelement <8 x bfloat> [[VECINIT25]], bfloat [[VGET_LANE28]], i32 6
-// CHECK-NEXT: [[VGET_LANE33:%.*]] = extractelement <4 x bfloat> [[B]], i32 0
-// CHECK-NEXT: [[VECINIT35:%.*]] = insertelement <8 x bfloat> [[VECINIT30]], bfloat [[VGET_LANE33]], i32 7
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[VECINIT35]] to <16 x i8>
-// CHECK-NEXT: [[VBFMLALTQ_F323_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[VECINIT35]])
-// CHECK-NEXT: [[VBFMLALTQ_F324_I:%.*]] = bitcast <4 x float> [[VBFMLALTQ_F323_I]] to <16 x i8>
+// CHECK-NEXT: [[VECINIT35:%.*]] = shufflevector <4 x bfloat> [[B:%.*]], <4 x bfloat> poison, <8 x i32> zeroinitializer
+// CHECK-NEXT: [[VBFMLALTQ_F323_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[VECINIT35]])
// CHECK-NEXT: ret <4 x float> [[VBFMLALTQ_F323_I]]
//
float32x4_t test_vbfmlaltq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) {
@@ -248,27 +137,8 @@ float32x4_t test_vbfmlaltq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t
// CHECK-LABEL: @test_vbfmlaltq_laneq_f32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <8 x bfloat> [[B:%.*]], i32 3
-// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x bfloat> poison, bfloat [[VGET_LANE]], i32 0
-// CHECK-NEXT: [[VGET_LANE3:%.*]] = extractelement <8 x bfloat> [[B]], i32 3
-// CHECK-NEXT: [[VECINIT5:%.*]] = insertelement <8 x bfloat> [[VECINIT]], bfloat [[VGET_LANE3]], i32 1
-// CHECK-NEXT: [[VGET_LANE8:%.*]] = extractelement <8 x bfloat> [[B]], i32 3
-// CHECK-NEXT: [[VECINIT10:%.*]] = insertelement <8 x bfloat> [[VECINIT5]], bfloat [[VGET_LANE8]], i32 2
-// CHECK-NEXT: [[VGET_LANE13:%.*]] = extractelement <8 x bfloat> [[B]], i32 3
-// CHECK-NEXT: [[VECINIT15:%.*]] = insertelement <8 x bfloat> [[VECINIT10]], bfloat [[VGET_LANE13]], i32 3
-// CHECK-NEXT: [[VGET_LANE18:%.*]] = extractelement <8 x bfloat> [[B]], i32 3
-// CHECK-NEXT: [[VECINIT20:%.*]] = insertelement <8 x bfloat> [[VECINIT15]], bfloat [[VGET_LANE18]], i32 4
-// CHECK-NEXT: [[VGET_LANE23:%.*]] = extractelement <8 x bfloat> [[B]], i32 3
-// CHECK-NEXT: [[VECINIT25:%.*]] = insertelement <8 x bfloat> [[VECINIT20]], bfloat [[VGET_LANE23]], i32 5
-// CHECK-NEXT: [[VGET_LANE28:%.*]] = extractelement <8 x bfloat> [[B]], i32 3
-// CHECK-NEXT: [[VECINIT30:%.*]] = insertelement <8 x bfloat> [[VECINIT25]], bfloat [[VGET_LANE28]], i32 6
-// CHECK-NEXT: [[VGET_LANE33:%.*]] = extractelement <8 x bfloat> [[B]], i32 3
-// CHECK-NEXT: [[VECINIT35:%.*]] = insertelement <8 x bfloat> [[VECINIT30]], bfloat [[VGET_LANE33]], i32 7
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[VECINIT35]] to <16 x i8>
-// CHECK-NEXT: [[VBFMLALTQ_F323_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[VECINIT35]])
-// CHECK-NEXT: [[VBFMLALTQ_F324_I:%.*]] = bitcast <4 x float> [[VBFMLALTQ_F323_I]] to <16 x i8>
+// CHECK-NEXT: [[VECINIT35:%.*]] = shufflevector <8 x bfloat> [[B:%.*]], <8 x bfloat> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[VBFMLALTQ_F323_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[VECINIT35]])
// CHECK-NEXT: ret <4 x float> [[VBFMLALTQ_F323_I]]
//
float32x4_t test_vbfmlaltq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
diff --git a/clang/test/CodeGen/arm-bf16-getset-intrinsics.c b/clang/test/CodeGen/arm-bf16-getset-intrinsics.c
index b87d0e8eb68bb4..6ef05790b02178 100644
--- a/clang/test/CodeGen/arm-bf16-getset-intrinsics.c
+++ b/clang/test/CodeGen/arm-bf16-getset-intrinsics.c
@@ -1,8 +1,8 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
// RUN: %clang_cc1 -triple armv8.6a-arm-none-eabi -target-feature +neon -target-feature +bf16 -mfloat-abi hard \
-// RUN: -disable-O0-optnone -emit-llvm %s -o - | opt -S -passes=mem2reg | FileCheck %s
+// RUN: -disable-O0-optnone -emit-llvm %s -o - | opt -S -passes=mem2reg,instcombine | FileCheck %s
// RUN: %clang_cc1 -triple armv8.6a-arm-none-eabi -target-feature +neon -target-feature +bf16 -mfloat-abi soft \
-// RUN: -disable-O0-optnone -emit-llvm %s -o - | opt -S -passes=mem2reg | FileCheck %s
+// RUN: -disable-O0-optnone -emit-llvm %s -o - | opt -S -passes=mem2reg,instcombine | FileCheck %s
// REQUIRES: aarch64-registered-target || arm-registered-target
@@ -19,10 +19,8 @@ bfloat16x4_t test_vcreate_bf16(uint64_t a) {
// CHECK-LABEL: @test_vdup_n_bf16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x bfloat> poison, bfloat [[V:%.*]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x bfloat> [[VECINIT_I]], bfloat [[V]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x bfloat> [[VECINIT1_I]], bfloat [[V]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x bfloat> [[VECINIT2_I]], bfloat [[V]], i32 3
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x bfloat> poison, bfloat [[V:%.*]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x bfloat> [[VECINIT_I]], <4 x bfloat> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: ret <4 x bfloat> [[VECINIT3_I]]
//
bfloat16x4_t test_vdup_n_bf16(bfloat16_t v) {
@@ -31,14 +29,8 @@ bfloat16x4_t test_vdup_n_bf16(bfloat16_t v) {
// CHECK-LABEL: @test_vdupq_n_bf16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x bfloat> poison, bfloat [[V:%.*]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x bfloat> [[VECINIT_I]], bfloat [[V]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x bfloat> [[VECINIT1_I]], bfloat [[V]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x bfloat> [[VECINIT2_I]], bfloat [[V]], i32 3
-// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x bfloat> [[VECINIT3_I]], bfloat [[V]], i32 4
-// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x bfloat> [[VECINIT4_I]], bfloat [[V]], i32 5
-// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x bfloat> [[VECINIT5_I]], bfloat [[V]], i32 6
-// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x bfloat> [[VECINIT6_I]], bfloat [[V]], i32 7
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x bfloat> poison, bfloat [[V:%.*]], i64 0
+// CHECK-NEXT: [[VECINIT7_I:%.*]] = shufflevector <8 x bfloat> [[VECINIT_I]], <8 x bfloat> poison, <8 x i32> zeroinitializer
// CHECK-NEXT: ret <8 x bfloat> [[VECINIT7_I]]
//
bfloat16x8_t test_vdupq_n_bf16(bfloat16_t v) {
@@ -47,9 +39,7 @@ bfloat16x8_t test_vdupq_n_bf16(bfloat16_t v) {
// CHECK-LABEL: @test_vdup_lane_bf16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x bfloat>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x bfloat> [[TMP1]], <4 x bfloat> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x bfloat> [[V:%.*]], <4 x bfloat> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK-NEXT: ret <4 x bfloat> [[LANE]]
//
bfloat16x4_t test_vdup_lane_bf16(bfloat16x4_t v) {
@@ -58,9 +48,7 @@ bfloat16x4_t test_vdup_lane_bf16(bfloat16x4_t v) {
// CHECK-LABEL: @test_vdupq_lane_bf16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x bfloat>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x bfloat> [[TMP1]], <4 x bfloat> [[TMP1]], <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x bfloat> [[V:%.*]], <4 x bfloat> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
// CHECK-NEXT: ret <8 x bfloat> [[LANE]]
//
bfloat16x8_t test_vdupq_lane_bf16(bfloat16x4_t v) {
@@ -69,9 +57,7 @@ bfloat16x8_t test_vdupq_lane_bf16(bfloat16x4_t v) {
// CHECK-LABEL: @test_vdup_laneq_bf16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x bfloat>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x bfloat> [[TMP1]], <8 x bfloat> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x bfloat> [[V:%.*]], <8 x bfloat> poison, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: ret <4 x bfloat> [[LANE]]
//
bfloat16x4_t test_vdup_laneq_bf16(bfloat16x8_t v) {
@@ -80,9 +66,7 @@ bfloat16x4_t test_vdup_laneq_bf16(bfloat16x8_t v) {
// CHECK-LABEL: @test_vdupq_laneq_bf16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x bfloat>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x bfloat> [[TMP1]], <8 x bfloat> [[TMP1]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x bfloat> [[V:%.*]], <8 x bfloat> poison, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: ret <8 x bfloat> [[LANE]]
//
bfloat16x8_t test_vdupq_laneq_bf16(bfloat16x8_t v) {
@@ -100,7 +84,7 @@ bfloat16x8_t test_vcombine_bf16(bfloat16x4_t low, bfloat16x4_t high) {
// CHECK-LABEL: @test_vget_high_bf16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[A:%.*]], <8 x bfloat> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[A:%.*]], <8 x bfloat> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT: ret <4 x bfloat> [[SHUFFLE_I]]
//
bfloat16x4_t test_vget_high_bf16(bfloat16x8_t a) {
@@ -109,7 +93,7 @@ bfloat16x4_t test_vget_high_bf16(bfloat16x8_t a) {
// CHECK-LABEL: @test_vget_low_bf16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[A:%.*]], <8 x bfloat> [[A]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[A:%.*]], <8 x bfloat> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK-NEXT: ret <4 x bfloat> [[SHUFFLE_I]]
//
bfloat16x4_t test_vget_low_bf16(bfloat16x8_t a) {
@@ -118,7 +102,7 @@ bfloat16x4_t test_vget_low_bf16(bfloat16x8_t a) {
// CHECK-LABEL: @test_vget_lane_bf16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x bfloat> [[V:%.*]], i32 1
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x bfloat> [[V:%.*]], i64 1
// CHECK-NEXT: ret bfloat [[VGET_LANE]]
//
bfloat16_t test_vget_lane_bf16(bfloat16x4_t v) {
@@ -127,7 +111,7 @@ bfloat16_t test_vget_lane_bf16(bfloat16x4_t v) {
// CHECK-LABEL: @test_vgetq_lane_bf16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <8 x bfloat> [[V:%.*]], i32 7
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <8 x bfloat> [[V:%.*]], i64 7
// CHECK-NEXT: ret bfloat [[VGET_LANE]]
//
bfloat16_t test_vgetq_lane_bf16(bfloat16x8_t v) {
@@ -136,7 +120,7 @@ bfloat16_t test_vgetq_lane_bf16(bfloat16x8_t v) {
// CHECK-LABEL: @test_vset_lane_bf16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <4 x bfloat> [[V:%.*]], bfloat [[A:%.*]], i32 1
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <4 x bfloat> [[V:%.*]], bfloat [[A:%.*]], i64 1
// CHECK-NEXT: ret <4 x bfloat> [[VSET_LANE]]
//
bfloat16x4_t test_vset_lane_bf16(bfloat16_t a, bfloat16x4_t v) {
@@ -145,7 +129,7 @@ bfloat16x4_t test_vset_lane_bf16(bfloat16_t a, bfloat16x4_t v) {
// CHECK-LABEL: @test_vsetq_lane_bf16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <8 x bfloat> [[V:%.*]], bfloat [[A:%.*]], i32 7
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <8 x bfloat> [[V:%.*]], bfloat [[A:%.*]], i64 7
// CHECK-NEXT: ret <8 x bfloat> [[VSET_LANE]]
//
bfloat16x8_t test_vsetq_lane_bf16(bfloat16_t a, bfloat16x8_t v) {
@@ -154,7 +138,7 @@ bfloat16x8_t test_vsetq_lane_bf16(bfloat16_t a, bfloat16x8_t v) {
// CHECK-LABEL: @test_vduph_lane_bf16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x bfloat> [[V:%.*]], i32 1
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x bfloat> [[V:%.*]], i64 1
// CHECK-NEXT: ret bfloat [[VGET_LANE]]
//
bfloat16_t test_vduph_lane_bf16(bfloat16x4_t v) {
@@ -163,7 +147,7 @@ bfloat16_t test_vduph_lane_bf16(bfloat16x4_t v) {
// CHECK-LABEL: @test_vduph_laneq_bf16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <8 x bfloat> [[V:%.*]], i32 7
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <8 x bfloat> [[V:%.*]], i64 7
// CHECK-NEXT: ret bfloat [[VGET_LANE]]
//
bfloat16_t test_vduph_laneq_bf16(bfloat16x8_t v) {
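(For context — a sketch, not part of the patch, assuming a target with +bf16: with instcombine in the RUN line, the lane-by-lane insertelement chain for a splat is canonicalized to one insertelement plus a zero-mask shufflevector, and the no-op bitcasts around the lane shuffles go away, which is all the check churn in this file amounts to.)

#include <arm_neon.h>

// Splat one scalar across all lanes; after mem2reg,instcombine this
// becomes a single insertelement followed by a shufflevector with a
// zeroinitializer mask, exactly as the updated CHECK lines expect.
bfloat16x4_t splat_bf16(bfloat16_t v) {
  return vdup_n_bf16(v);
}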
diff --git a/clang/test/CodeGen/arm-neon-directed-rounding-constrained.c b/clang/test/CodeGen/arm-neon-directed-rounding-constrained.c
index 11de8ba1dab7a6..45923d9343c3f5 100644
--- a/clang/test/CodeGen/arm-neon-directed-rounding-constrained.c
+++ b/clang/test/CodeGen/arm-neon-directed-rounding-constrained.c
@@ -1,63 +1,77 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
// RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu cortex-a57 \
// RUN: -ffreestanding -disable-O0-optnone -emit-llvm %s -o - | \
-// RUN: opt -S -passes=mem2reg | FileCheck -check-prefixes=COMMON,COMMONIR,UNCONSTRAINED %s
+// RUN: opt -S -passes=mem2reg,instcombine | FileCheck -check-prefixes=UNCONSTRAINED-A32 %s
// RUN: %clang_cc1 -triple arm64-linux-gnueabihf -target-feature +neon \
// RUN: -ffreestanding -disable-O0-optnone -emit-llvm %s -o - | \
-// RUN: opt -S -passes=mem2reg | FileCheck -check-prefixes=COMMON,COMMONIR,UNCONSTRAINED %s
+// RUN: opt -S -passes=mem2reg,instcombine | FileCheck -check-prefixes=UNCONSTRAINED-A64 %s
// RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu cortex-a57 \
// RUN: -ffp-exception-behavior=strict \
// RUN: -fexperimental-strict-floating-point \
// RUN: -ffreestanding -disable-O0-optnone -emit-llvm %s -o - | \
-// RUN: opt -S -passes=mem2reg | FileCheck -check-prefixes=COMMON,COMMONIR,CONSTRAINED %s
+// RUN: opt -S -passes=mem2reg,instcombine | FileCheck -check-prefixes=CONSTRAINED-A32 %s
// RUN: %clang_cc1 -triple arm64-linux-gnueabihf -target-feature +neon \
// RUN: -ffp-exception-behavior=strict \
// RUN: -ffreestanding -disable-O0-optnone -emit-llvm %s -o - | \
-// RUN: opt -S -passes=mem2reg | FileCheck -check-prefixes=COMMON,COMMONIR,CONSTRAINED %s
-
-// RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu cortex-a57 \
-// RUN: -ffreestanding -disable-O0-optnone -emit-llvm %s -o - | \
-// RUN: opt -S -passes=mem2reg | llc -o=- - | FileCheck -check-prefixes=COMMON,CHECK-ASM32 %s
-// RUN: %clang_cc1 -triple arm64-linux-gnueabihf -target-feature +neon \
-// RUN: -ffreestanding -disable-O0-optnone -emit-llvm %s -o - | \
-// RUN: opt -S -passes=mem2reg | llc -o=- - | FileCheck -check-prefixes=COMMON,CHECK-ASM64 %s
-
-// RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu cortex-a57 \
-// RUN: -ffp-exception-behavior=strict \
-// RUN: -fexperimental-strict-floating-point \
-// RUN: -ffreestanding -disable-O0-optnone -emit-llvm %s -o - | \
-// RUN: opt -S -passes=mem2reg | llc -o=- - | FileCheck -check-prefixes=COMMON,CHECK-ASM32 %s
-// RUN: %clang_cc1 -triple arm64-linux-gnueabihf -target-feature +neon \
-// RUN: -ffp-exception-behavior=strict \
-// RUN: -ffreestanding -disable-O0-optnone -emit-llvm %s -o - | \
-// RUN: opt -S -passes=mem2reg | llc -o=- - | FileCheck -check-prefixes=COMMON,CHECK-ASM64 %s
+// RUN: opt -S -passes=mem2reg,instcombine | FileCheck -check-prefixes=CONSTRAINED-A64 %s
// REQUIRES: arm-registered-target,aarch64-registered-target
#include <arm_neon.h>
-// COMMON-LABEL: test_vrndi_f32
-// COMMONIR: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// UNCONSTRAINED: [[VRNDI1_I:%.*]] = call <2 x float> @llvm.nearbyint.v2f32(<2 x float> %a)
-// CONSTRAINED: [[VRNDI1_I:%.*]] = call <2 x float> @llvm.experimental.constrained.nearbyint.v2f32(<2 x float> %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK-ASM32: vrintr.f32 s{{[0-9]+}}, s{{[0-9]+}}
-// CHECK-ASM32: vrintr.f32 s{{[0-9]+}}, s{{[0-9]+}}
-// CHECK-ASM64: frinti v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
-// COMMONIR: ret <2 x float> [[VRNDI1_I]]
+// UNCONSTRAINED-A32-LABEL: define dso_local <2 x float> @test_vrndi_f32(
+// UNCONSTRAINED-A32-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+// UNCONSTRAINED-A32-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-A32-NEXT: [[VRNDI_V1_I:%.*]] = call <2 x float> @llvm.nearbyint.v2f32(<2 x float> [[A]])
+// UNCONSTRAINED-A32-NEXT: ret <2 x float> [[VRNDI_V1_I]]
+//
+// UNCONSTRAINED-A64-LABEL: define dso_local <2 x float> @test_vrndi_f32(
+// UNCONSTRAINED-A64-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+// UNCONSTRAINED-A64-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-A64-NEXT: [[VRNDI_V1_I:%.*]] = call <2 x float> @llvm.nearbyint.v2f32(<2 x float> [[A]])
+// UNCONSTRAINED-A64-NEXT: ret <2 x float> [[VRNDI_V1_I]]
+//
+// CONSTRAINED-A32-LABEL: define dso_local <2 x float> @test_vrndi_f32(
+// CONSTRAINED-A32-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+// CONSTRAINED-A32-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-A32-NEXT: [[VRNDI_V1_I:%.*]] = call <2 x float> @llvm.experimental.constrained.nearbyint.v2f32(<2 x float> [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR2:[0-9]+]]
+// CONSTRAINED-A32-NEXT: ret <2 x float> [[VRNDI_V1_I]]
+//
+// CONSTRAINED-A64-LABEL: define dso_local <2 x float> @test_vrndi_f32(
+// CONSTRAINED-A64-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+// CONSTRAINED-A64-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-A64-NEXT: [[VRNDI_V1_I:%.*]] = call <2 x float> @llvm.experimental.constrained.nearbyint.v2f32(<2 x float> [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR2:[0-9]+]]
+// CONSTRAINED-A64-NEXT: ret <2 x float> [[VRNDI_V1_I]]
+//
float32x2_t test_vrndi_f32(float32x2_t a) {
return vrndi_f32(a);
}
-// COMMON-LABEL: test_vrndiq_f32
-// COMMONIR: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// UNCONSTRAINED: [[VRNDI1_I:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %a)
-// CONSTRAINED: [[VRNDI1_I:%.*]] = call <4 x float> @llvm.experimental.constrained.nearbyint.v4f32(<4 x float> %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK-ASM32: vrintr.f32 s{{[0-9]+}}, s{{[0-9]+}}
-// CHECK-ASM32: vrintr.f32 s{{[0-9]+}}, s{{[0-9]+}}
-// CHECK-ASM32: vrintr.f32 s{{[0-9]+}}, s{{[0-9]+}}
-// CHECK-ASM32: vrintr.f32 s{{[0-9]+}}, s{{[0-9]+}}
-// CHECK-ASM64: frinti v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
-// COMMONIR: ret <4 x float> [[VRNDI1_I]]
+// UNCONSTRAINED-A32-LABEL: define dso_local <4 x float> @test_vrndiq_f32(
+// UNCONSTRAINED-A32-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-A32-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-A32-NEXT: [[VRNDIQ_V1_I:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[A]])
+// UNCONSTRAINED-A32-NEXT: ret <4 x float> [[VRNDIQ_V1_I]]
+//
+// UNCONSTRAINED-A64-LABEL: define dso_local <4 x float> @test_vrndiq_f32(
+// UNCONSTRAINED-A64-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-A64-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-A64-NEXT: [[VRNDIQ_V1_I:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[A]])
+// UNCONSTRAINED-A64-NEXT: ret <4 x float> [[VRNDIQ_V1_I]]
+//
+// CONSTRAINED-A32-LABEL: define dso_local <4 x float> @test_vrndiq_f32(
+// CONSTRAINED-A32-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-A32-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-A32-NEXT: [[VRNDIQ_V1_I:%.*]] = call <4 x float> @llvm.experimental.constrained.nearbyint.v4f32(<4 x float> [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR2]]
+// CONSTRAINED-A32-NEXT: ret <4 x float> [[VRNDIQ_V1_I]]
+//
+// CONSTRAINED-A64-LABEL: define dso_local <4 x float> @test_vrndiq_f32(
+// CONSTRAINED-A64-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-A64-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-A64-NEXT: [[VRNDIQ_V1_I:%.*]] = call <4 x float> @llvm.experimental.constrained.nearbyint.v4f32(<4 x float> [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR2]]
+// CONSTRAINED-A64-NEXT: ret <4 x float> [[VRNDIQ_V1_I]]
+//
float32x4_t test_vrndiq_f32(float32x4_t a) {
return vrndiq_f32(a);
}
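(For context on the four prefixes above — a sketch assuming an Armv8 Neon target: vrndi maps to nearbyint, and under -ffp-exception-behavior=strict clang emits the constrained variant carrying explicit rounding-mode and exception-behavior metadata, which is the difference the CONSTRAINED-* checks encode.)

#include <arm_neon.h>

// Round to integral in the current rounding mode without raising the
// inexact exception. Strict FP mode lowers this to
// @llvm.experimental.constrained.nearbyint; default mode to
// @llvm.nearbyint.
float32x2_t round_quiet(float32x2_t a) {
  return vrndi_f32(a);
}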
diff --git a/clang/test/CodeGen/arm-neon-directed-rounding.c b/clang/test/CodeGen/arm-neon-directed-rounding.c
index 63ec016b49a0c8..578ca853114267 100644
--- a/clang/test/CodeGen/arm-neon-directed-rounding.c
+++ b/clang/test/CodeGen/arm-neon-directed-rounding.c
@@ -1,130 +1,239 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
// RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu cortex-a57 \
// RUN: -ffreestanding -disable-O0-optnone -emit-llvm %s -o - | \
-// RUN: opt -S -passes=mem2reg | FileCheck -check-prefixes=CHECK,CHECK-A32 %s
+// RUN: opt -S -passes=mem2reg,instcombine | FileCheck -check-prefixes=CHECK,CHECK-A32 %s
// RUN: %clang_cc1 -triple arm64-linux-gnueabihf -target-feature +neon \
// RUN: -ffreestanding -disable-O0-optnone -emit-llvm %s -o - | \
-// RUN: opt -S -passes=mem2reg | FileCheck -check-prefixes=CHECK,CHECK-A64 %s
+// RUN: opt -S -passes=mem2reg,instcombine | FileCheck -check-prefixes=CHECK,CHECK-A64 %s
// REQUIRES: aarch64-registered-target || arm-registered-target
#include <arm_neon.h>
-// CHECK-LABEL: define{{.*}} <2 x float> @test_vrnda_f32(<2 x float> noundef %a)
-// CHECK-A32: [[VRNDA_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrinta.v2f32(<2 x float> %a)
-// CHECK-A64: [[VRNDA_V1_I:%.*]] = call <2 x float> @llvm.round.v2f32(<2 x float> %a)
-// CHECK: ret <2 x float> [[VRNDA_V1_I]]
+// CHECK-A32-LABEL: define dso_local <2 x float> @test_vrnda_f32(
+// CHECK-A32-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-A32-NEXT: [[ENTRY:.*:]]
+// CHECK-A32-NEXT: [[VRNDA_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrinta.v2f32(<2 x float> [[A]])
+// CHECK-A32-NEXT: ret <2 x float> [[VRNDA_V1_I]]
+//
+// CHECK-A64-LABEL: define dso_local <2 x float> @test_vrnda_f32(
+// CHECK-A64-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-A64-NEXT: [[ENTRY:.*:]]
+// CHECK-A64-NEXT: [[VRNDA1_I:%.*]] = call <2 x float> @llvm.round.v2f32(<2 x float> [[A]])
+// CHECK-A64-NEXT: ret <2 x float> [[VRNDA1_I]]
+//
float32x2_t test_vrnda_f32(float32x2_t a) {
return vrnda_f32(a);
}
-// CHECK-LABEL: define{{.*}} <4 x float> @test_vrndaq_f32(<4 x float> noundef %a)
-// CHECK-A32: [[VRNDAQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrinta.v4f32(<4 x float> %a)
-// CHECK-A64: [[VRNDAQ_V1_I:%.*]] = call <4 x float> @llvm.round.v4f32(<4 x float> %a)
-// CHECK: ret <4 x float> [[VRNDAQ_V1_I]]
+// CHECK-A32-LABEL: define dso_local <4 x float> @test_vrndaq_f32(
+// CHECK-A32-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-A32-NEXT: [[ENTRY:.*:]]
+// CHECK-A32-NEXT: [[VRNDAQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrinta.v4f32(<4 x float> [[A]])
+// CHECK-A32-NEXT: ret <4 x float> [[VRNDAQ_V1_I]]
+//
+// CHECK-A64-LABEL: define dso_local <4 x float> @test_vrndaq_f32(
+// CHECK-A64-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-A64-NEXT: [[ENTRY:.*:]]
+// CHECK-A64-NEXT: [[VRNDA1_I:%.*]] = call <4 x float> @llvm.round.v4f32(<4 x float> [[A]])
+// CHECK-A64-NEXT: ret <4 x float> [[VRNDA1_I]]
+//
float32x4_t test_vrndaq_f32(float32x4_t a) {
return vrndaq_f32(a);
}
-// CHECK-LABEL: define{{.*}} <2 x float> @test_vrndm_f32(<2 x float> noundef %a)
-// CHECK-A32: [[VRNDM_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrintm.v2f32(<2 x float> %a)
-// CHECK-A64: [[VRNDM_V1_I:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> %a)
-// CHECK: ret <2 x float> [[VRNDM_V1_I]]
+// CHECK-A32-LABEL: define dso_local <2 x float> @test_vrndm_f32(
+// CHECK-A32-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-A32-NEXT: [[ENTRY:.*:]]
+// CHECK-A32-NEXT: [[VRNDM_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrintm.v2f32(<2 x float> [[A]])
+// CHECK-A32-NEXT: ret <2 x float> [[VRNDM_V1_I]]
+//
+// CHECK-A64-LABEL: define dso_local <2 x float> @test_vrndm_f32(
+// CHECK-A64-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-A64-NEXT: [[ENTRY:.*:]]
+// CHECK-A64-NEXT: [[VRNDM1_I:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[A]])
+// CHECK-A64-NEXT: ret <2 x float> [[VRNDM1_I]]
+//
float32x2_t test_vrndm_f32(float32x2_t a) {
return vrndm_f32(a);
}
-// CHECK-LABEL: define{{.*}} <4 x float> @test_vrndmq_f32(<4 x float> noundef %a)
-// CHECK-A32: [[VRNDMQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrintm.v4f32(<4 x float> %a)
-// CHECK-A64: [[VRNDMQ_V1_I:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> %a)
-// CHECK: ret <4 x float> [[VRNDMQ_V1_I]]
+// CHECK-A32-LABEL: define dso_local <4 x float> @test_vrndmq_f32(
+// CHECK-A32-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-A32-NEXT: [[ENTRY:.*:]]
+// CHECK-A32-NEXT: [[VRNDMQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrintm.v4f32(<4 x float> [[A]])
+// CHECK-A32-NEXT: ret <4 x float> [[VRNDMQ_V1_I]]
+//
+// CHECK-A64-LABEL: define dso_local <4 x float> @test_vrndmq_f32(
+// CHECK-A64-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-A64-NEXT: [[ENTRY:.*:]]
+// CHECK-A64-NEXT: [[VRNDM1_I:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[A]])
+// CHECK-A64-NEXT: ret <4 x float> [[VRNDM1_I]]
+//
float32x4_t test_vrndmq_f32(float32x4_t a) {
return vrndmq_f32(a);
}
-// CHECK-LABEL: define{{.*}} <2 x float> @test_vrndn_f32(<2 x float> noundef %a)
-// CHECK-A32: [[VRNDN_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrintn.v2f32(<2 x float> %a)
-// CHECK-A64: [[VRNDN_V1_I:%.*]] = call <2 x float> @llvm.roundeven.v2f32(<2 x float> %a)
-// CHECK: ret <2 x float> [[VRNDN_V1_I]]
+// CHECK-A32-LABEL: define dso_local <2 x float> @test_vrndn_f32(
+// CHECK-A32-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-A32-NEXT: [[ENTRY:.*:]]
+// CHECK-A32-NEXT: [[VRNDN_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrintn.v2f32(<2 x float> [[A]])
+// CHECK-A32-NEXT: ret <2 x float> [[VRNDN_V1_I]]
+//
+// CHECK-A64-LABEL: define dso_local <2 x float> @test_vrndn_f32(
+// CHECK-A64-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-A64-NEXT: [[ENTRY:.*:]]
+// CHECK-A64-NEXT: [[VRNDN1_I:%.*]] = call <2 x float> @llvm.roundeven.v2f32(<2 x float> [[A]])
+// CHECK-A64-NEXT: ret <2 x float> [[VRNDN1_I]]
+//
float32x2_t test_vrndn_f32(float32x2_t a) {
return vrndn_f32(a);
}
-// CHECK-LABEL: define{{.*}} <4 x float> @test_vrndnq_f32(<4 x float> noundef %a)
-// CHECK-A32: [[VRNDNQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrintn.v4f32(<4 x float> %a)
-// CHECK-A64: [[VRNDNQ_V1_I:%.*]] = call <4 x float> @llvm.roundeven.v4f32(<4 x float> %a)
-// CHECK: ret <4 x float> [[VRNDNQ_V1_I]]
+// CHECK-A32-LABEL: define dso_local <4 x float> @test_vrndnq_f32(
+// CHECK-A32-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-A32-NEXT: [[ENTRY:.*:]]
+// CHECK-A32-NEXT: [[VRNDNQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrintn.v4f32(<4 x float> [[A]])
+// CHECK-A32-NEXT: ret <4 x float> [[VRNDNQ_V1_I]]
+//
+// CHECK-A64-LABEL: define dso_local <4 x float> @test_vrndnq_f32(
+// CHECK-A64-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-A64-NEXT: [[ENTRY:.*:]]
+// CHECK-A64-NEXT: [[VRNDN1_I:%.*]] = call <4 x float> @llvm.roundeven.v4f32(<4 x float> [[A]])
+// CHECK-A64-NEXT: ret <4 x float> [[VRNDN1_I]]
+//
float32x4_t test_vrndnq_f32(float32x4_t a) {
return vrndnq_f32(a);
}
-// CHECK-LABEL: define{{.*}} <2 x float> @test_vrndp_f32(<2 x float> noundef %a)
-// CHECK-A32: [[VRNDP_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrintp.v2f32(<2 x float> %a)
-// CHECK-A64: [[VRNDP_V1_I:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> %a)
-// CHECK: ret <2 x float> [[VRNDP_V1_I]]
+// CHECK-A32-LABEL: define dso_local <2 x float> @test_vrndp_f32(
+// CHECK-A32-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-A32-NEXT: [[ENTRY:.*:]]
+// CHECK-A32-NEXT: [[VRNDP_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrintp.v2f32(<2 x float> [[A]])
+// CHECK-A32-NEXT: ret <2 x float> [[VRNDP_V1_I]]
+//
+// CHECK-A64-LABEL: define dso_local <2 x float> @test_vrndp_f32(
+// CHECK-A64-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-A64-NEXT: [[ENTRY:.*:]]
+// CHECK-A64-NEXT: [[VRNDP1_I:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[A]])
+// CHECK-A64-NEXT: ret <2 x float> [[VRNDP1_I]]
+//
float32x2_t test_vrndp_f32(float32x2_t a) {
return vrndp_f32(a);
}
-// CHECK-LABEL: define{{.*}} <4 x float> @test_vrndpq_f32(<4 x float> noundef %a)
-// CHECK-A32: [[VRNDPQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrintp.v4f32(<4 x float> %a)
-// CHECK-A64: [[VRNDPQ_V1_I:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> %a)
-// CHECK: ret <4 x float> [[VRNDPQ_V1_I]]
+// CHECK-A32-LABEL: define dso_local <4 x float> @test_vrndpq_f32(
+// CHECK-A32-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-A32-NEXT: [[ENTRY:.*:]]
+// CHECK-A32-NEXT: [[VRNDPQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrintp.v4f32(<4 x float> [[A]])
+// CHECK-A32-NEXT: ret <4 x float> [[VRNDPQ_V1_I]]
+//
+// CHECK-A64-LABEL: define dso_local <4 x float> @test_vrndpq_f32(
+// CHECK-A64-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-A64-NEXT: [[ENTRY:.*:]]
+// CHECK-A64-NEXT: [[VRNDP1_I:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[A]])
+// CHECK-A64-NEXT: ret <4 x float> [[VRNDP1_I]]
+//
float32x4_t test_vrndpq_f32(float32x4_t a) {
return vrndpq_f32(a);
}
-// CHECK-LABEL: define{{.*}} <2 x float> @test_vrndx_f32(<2 x float> noundef %a)
-// CHECK-A32: [[VRNDX_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrintx.v2f32(<2 x float> %a)
-// CHECK-A64: [[VRNDX_V1_I:%.*]] = call <2 x float> @llvm.rint.v2f32(<2 x float> %a)
-// CHECK: ret <2 x float> [[VRNDX_V1_I]]
+// CHECK-A32-LABEL: define dso_local <2 x float> @test_vrndx_f32(
+// CHECK-A32-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-A32-NEXT: [[ENTRY:.*:]]
+// CHECK-A32-NEXT: [[VRNDX_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrintx.v2f32(<2 x float> [[A]])
+// CHECK-A32-NEXT: ret <2 x float> [[VRNDX_V1_I]]
+//
+// CHECK-A64-LABEL: define dso_local <2 x float> @test_vrndx_f32(
+// CHECK-A64-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-A64-NEXT: [[ENTRY:.*:]]
+// CHECK-A64-NEXT: [[VRNDX1_I:%.*]] = call <2 x float> @llvm.rint.v2f32(<2 x float> [[A]])
+// CHECK-A64-NEXT: ret <2 x float> [[VRNDX1_I]]
+//
float32x2_t test_vrndx_f32(float32x2_t a) {
return vrndx_f32(a);
}
-// CHECK-LABEL: define{{.*}} <4 x float> @test_vrndxq_f32(<4 x float> noundef %a)
-// CHECK-A32: [[VRNDXQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrintx.v4f32(<4 x float> %a)
-// CHECK-A64: [[VRNDXQ_V1_I:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> %a)
-// CHECK: ret <4 x float> [[VRNDXQ_V1_I]]
+// CHECK-A32-LABEL: define dso_local <4 x float> @test_vrndxq_f32(
+// CHECK-A32-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-A32-NEXT: [[ENTRY:.*:]]
+// CHECK-A32-NEXT: [[VRNDXQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrintx.v4f32(<4 x float> [[A]])
+// CHECK-A32-NEXT: ret <4 x float> [[VRNDXQ_V1_I]]
+//
+// CHECK-A64-LABEL: define dso_local <4 x float> @test_vrndxq_f32(
+// CHECK-A64-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-A64-NEXT: [[ENTRY:.*:]]
+// CHECK-A64-NEXT: [[VRNDX1_I:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[A]])
+// CHECK-A64-NEXT: ret <4 x float> [[VRNDX1_I]]
+//
float32x4_t test_vrndxq_f32(float32x4_t a) {
return vrndxq_f32(a);
}
-// CHECK-LABEL: define{{.*}} <2 x float> @test_vrnd_f32(<2 x float> noundef %a)
-// CHECK-A32: [[VRND_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrintz.v2f32(<2 x float> %a)
-// CHECK-A64: [[VRND_V1_I:%.*]] = call <2 x float> @llvm.trunc.v2f32(<2 x float> %a)
-// CHECK: ret <2 x float> [[VRND_V1_I]]
+// CHECK-A32-LABEL: define dso_local <2 x float> @test_vrnd_f32(
+// CHECK-A32-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-A32-NEXT: [[ENTRY:.*:]]
+// CHECK-A32-NEXT: [[VRND_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrintz.v2f32(<2 x float> [[A]])
+// CHECK-A32-NEXT: ret <2 x float> [[VRND_V1_I]]
+//
+// CHECK-A64-LABEL: define dso_local <2 x float> @test_vrnd_f32(
+// CHECK-A64-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-A64-NEXT: [[ENTRY:.*:]]
+// CHECK-A64-NEXT: [[VRNDZ1_I:%.*]] = call <2 x float> @llvm.trunc.v2f32(<2 x float> [[A]])
+// CHECK-A64-NEXT: ret <2 x float> [[VRNDZ1_I]]
+//
float32x2_t test_vrnd_f32(float32x2_t a) {
return vrnd_f32(a);
}
-// CHECK-LABEL: define{{.*}} <4 x float> @test_vrndq_f32(<4 x float> noundef %a)
-// CHECK-A32: [[VRNDQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrintz.v4f32(<4 x float> %a)
-// CHECK-A64: [[VRNDQ_V1_I:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> %a)
-// CHECK: ret <4 x float> [[VRNDQ_V1_I]]
+// CHECK-A32-LABEL: define dso_local <4 x float> @test_vrndq_f32(
+// CHECK-A32-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-A32-NEXT: [[ENTRY:.*:]]
+// CHECK-A32-NEXT: [[VRNDQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrintz.v4f32(<4 x float> [[A]])
+// CHECK-A32-NEXT: ret <4 x float> [[VRNDQ_V1_I]]
+//
+// CHECK-A64-LABEL: define dso_local <4 x float> @test_vrndq_f32(
+// CHECK-A64-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-A64-NEXT: [[ENTRY:.*:]]
+// CHECK-A64-NEXT: [[VRNDZ1_I:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[A]])
+// CHECK-A64-NEXT: ret <4 x float> [[VRNDZ1_I]]
+//
float32x4_t test_vrndq_f32(float32x4_t a) {
return vrndq_f32(a);
}
-// CHECK-LABEL: define{{.*}} float @test_vrndns_f32(float noundef %a)
-// CHECK-A32: [[VRNDN_I:%.*]] = call float @llvm.arm.neon.vrintn.f32(float %a)
-// CHECK-A64: [[VRNDN_I:%.*]] = call float @llvm.roundeven.f32(float %a)
-// CHECK: ret float [[VRNDN_I]]
+// CHECK-A32-LABEL: define dso_local float @test_vrndns_f32(
+// CHECK-A32-SAME: float noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-A32-NEXT: [[ENTRY:.*:]]
+// CHECK-A32-NEXT: [[VRNDN_I:%.*]] = call float @llvm.arm.neon.vrintn.f32(float [[A]])
+// CHECK-A32-NEXT: ret float [[VRNDN_I]]
+//
+// CHECK-A64-LABEL: define dso_local float @test_vrndns_f32(
+// CHECK-A64-SAME: float noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-A64-NEXT: [[ENTRY:.*:]]
+// CHECK-A64-NEXT: [[VRNDN_I:%.*]] = call float @llvm.roundeven.f32(float [[A]])
+// CHECK-A64-NEXT: ret float [[VRNDN_I]]
+//
float32_t test_vrndns_f32(float32_t a) {
return vrndns_f32(a);
}
-// CHECK-LABEL: define{{.*}} <2 x float> @test_vrndi_f32(<2 x float> noundef %a)
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[VRNDI1_I:%.*]] = call <2 x float> @llvm.nearbyint.v2f32(<2 x float> %a)
-// CHECK: ret <2 x float> [[VRNDI1_I]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vrndi_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRNDI_V1_I:%.*]] = call <2 x float> @llvm.nearbyint.v2f32(<2 x float> [[A]])
+// CHECK-NEXT: ret <2 x float> [[VRNDI_V1_I]]
+//
float32x2_t test_vrndi_f32(float32x2_t a) {
return vrndi_f32(a);
}
-// CHECK-LABEL: define{{.*}} <4 x float> @test_vrndiq_f32(<4 x float> noundef %a)
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[VRNDI1_I:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %a)
-// CHECK: ret <4 x float> [[VRNDI1_I]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vrndiq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRNDIQ_V1_I:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[A]])
+// CHECK-NEXT: ret <4 x float> [[VRNDIQ_V1_I]]
+//
float32x4_t test_vrndiq_f32(float32x4_t a) {
return vrndiq_f32(a);
}
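(The split CHECK-A32/CHECK-A64 prefixes above encode the lowering divergence for the directed-rounding family; the mapping below is read directly off the CHECK lines, with a sketch assuming an Armv8 Neon target.)

#include <arm_neon.h>

// A32 lowers to target-specific llvm.arm.neon.vrint*; A64 to the
// generic intrinsic:
//   vrnda -> vrinta / llvm.round      (nearest, ties away from zero)
//   vrndm -> vrintm / llvm.floor      (toward -infinity)
//   vrndn -> vrintn / llvm.roundeven  (nearest, ties to even)
//   vrndp -> vrintp / llvm.ceil       (toward +infinity)
//   vrndx -> vrintx / llvm.rint       (current mode, signals inexact)
//   vrnd  -> vrintz / llvm.trunc      (toward zero)
float32x2_t round_away(float32x2_t a) {
  return vrnda_f32(a);
}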
diff --git a/clang/test/CodeGen/arm-neon-fma.c b/clang/test/CodeGen/arm-neon-fma.c
index 682eda9750c81a..64bc2ebbe9bdab 100644
--- a/clang/test/CodeGen/arm-neon-fma.c
+++ b/clang/test/CodeGen/arm-neon-fma.c
@@ -4,7 +4,7 @@
// RUN: -target-cpu cortex-a7 \
// RUN: -mfloat-abi hard \
// RUN: -ffreestanding \
-// RUN: -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine | FileCheck %s
// REQUIRES: aarch64-registered-target || arm-registered-target
@@ -13,11 +13,8 @@
// CHECK-LABEL: define {{[^@]+}}@test_fma_order
// CHECK-SAME: (<2 x float> noundef [[ACCUM:%.*]], <2 x float> noundef [[LHS:%.*]], <2 x float> noundef [[RHS:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[ACCUM]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[LHS]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[RHS]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LHS]], <2 x float> [[RHS]], <2 x float> [[ACCUM]])
-// CHECK-NEXT: ret <2 x float> [[TMP3]]
+// CHECK-NEXT: [[TMP0:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LHS]], <2 x float> [[RHS]], <2 x float> [[ACCUM]])
+// CHECK-NEXT: ret <2 x float> [[TMP0]]
//
float32x2_t test_fma_order(float32x2_t accum, float32x2_t lhs, float32x2_t rhs) {
return vfma_f32(accum, lhs, rhs);
@@ -26,11 +23,8 @@ float32x2_t test_fma_order(float32x2_t accum, float32x2_t lhs, float32x2_t rhs)
// CHECK-LABEL: define {{[^@]+}}@test_fmaq_order
// CHECK-SAME: (<4 x float> noundef [[ACCUM:%.*]], <4 x float> noundef [[LHS:%.*]], <4 x float> noundef [[RHS:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[ACCUM]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[LHS]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[RHS]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LHS]], <4 x float> [[RHS]], <4 x float> [[ACCUM]])
-// CHECK-NEXT: ret <4 x float> [[TMP3]]
+// CHECK-NEXT: [[TMP0:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LHS]], <4 x float> [[RHS]], <4 x float> [[ACCUM]])
+// CHECK-NEXT: ret <4 x float> [[TMP0]]
//
float32x4_t test_fmaq_order(float32x4_t accum, float32x4_t lhs, float32x4_t rhs) {
return vfmaq_f32(accum, lhs, rhs);
@@ -39,13 +33,10 @@ float32x4_t test_fmaq_order(float32x4_t accum, float32x4_t lhs, float32x4_t rhs)
// CHECK-LABEL: define {{[^@]+}}@test_vfma_n_f32
// CHECK-SAME: (<2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]], float noundef [[N:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x float> poison, float [[N]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float [[N]], i32 1
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[VECINIT1_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[B]], <2 x float> [[VECINIT1_I]], <2 x float> [[A]])
-// CHECK-NEXT: ret <2 x float> [[TMP3]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x float> poison, float [[N]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x float> [[VECINIT_I]], <2 x float> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[B]], <2 x float> [[VECINIT1_I]], <2 x float> [[A]])
+// CHECK-NEXT: ret <2 x float> [[TMP0]]
//
float32x2_t test_vfma_n_f32(float32x2_t a, float32x2_t b, float32_t n) {
return vfma_n_f32(a, b, n);
@@ -54,15 +45,10 @@ float32x2_t test_vfma_n_f32(float32x2_t a, float32x2_t b, float32_t n) {
// CHECK-LABEL: define {{[^@]+}}@test_vfmaq_n_f32
// CHECK-SAME: (<4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]], float noundef [[N:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float [[N]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float [[N]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float [[N]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float [[N]], i32 3
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[VECINIT3_I]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[B]], <4 x float> [[VECINIT3_I]], <4 x float> [[A]])
-// CHECK-NEXT: ret <4 x float> [[TMP3]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float [[N]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x float> [[VECINIT_I]], <4 x float> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[B]], <4 x float> [[VECINIT3_I]], <4 x float> [[A]])
+// CHECK-NEXT: ret <4 x float> [[TMP0]]
//
float32x4_t test_vfmaq_n_f32(float32x4_t a, float32x4_t b, float32_t n) {
return vfmaq_n_f32(a, b, n);
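(The vfma_n changes combine both patterns seen so far — a sketch, assuming an Arm target with Neon and FMA support: the scalar multiplier is splat via insertelement plus shufflevector, the dead <8 x i8>/<16 x i8> bitcasts are deleted, and the result feeds @llvm.fma directly.)

#include <arm_neon.h>

// Per-lane a + b * n; n is splat across the vector and the whole
// expression becomes one @llvm.fma call in the optimized IR.
float32x2_t fma_scalar(float32x2_t a, float32x2_t b, float32_t n) {
  return vfma_n_f32(a, b, n);
}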
diff --git a/clang/test/CodeGen/arm-neon-numeric-maxmin.c b/clang/test/CodeGen/arm-neon-numeric-maxmin.c
index d2d4fee10f079a..79d752574c780f 100644
--- a/clang/test/CodeGen/arm-neon-numeric-maxmin.c
+++ b/clang/test/CodeGen/arm-neon-numeric-maxmin.c
@@ -1,5 +1,5 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature
-// RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu cortex-a57 -ffreestanding -disable-O0-optnone -emit-llvm %s -o - | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu cortex-a57 -ffreestanding -disable-O0-optnone -emit-llvm %s -o - | opt -S -passes=mem2reg,instcombine | FileCheck %s
// REQUIRES: aarch64-registered-target || arm-registered-target
@@ -8,10 +8,7 @@
// CHECK-LABEL: define {{[^@]+}}@test_vmaxnm_f32
// CHECK-SAME: (<2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <8 x i8>
// CHECK-NEXT: [[VMAXNM_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vmaxnm.v2f32(<2 x float> [[A]], <2 x float> [[B]])
-// CHECK-NEXT: [[VMAXNM_V3_I:%.*]] = bitcast <2 x float> [[VMAXNM_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <2 x float> [[VMAXNM_V2_I]]
//
float32x2_t test_vmaxnm_f32(float32x2_t a, float32x2_t b) {
@@ -19,12 +16,9 @@ float32x2_t test_vmaxnm_f32(float32x2_t a, float32x2_t b) {
}
// CHECK-LABEL: define {{[^@]+}}@test_vmaxnmq_f32
-// CHECK-SAME: (<4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK-SAME: (<4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B]] to <16 x i8>
// CHECK-NEXT: [[VMAXNMQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vmaxnm.v4f32(<4 x float> [[A]], <4 x float> [[B]])
-// CHECK-NEXT: [[VMAXNMQ_V3_I:%.*]] = bitcast <4 x float> [[VMAXNMQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x float> [[VMAXNMQ_V2_I]]
//
float32x4_t test_vmaxnmq_f32(float32x4_t a, float32x4_t b) {
@@ -34,10 +28,7 @@ float32x4_t test_vmaxnmq_f32(float32x4_t a, float32x4_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vminnm_f32
// CHECK-SAME: (<2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <8 x i8>
// CHECK-NEXT: [[VMINNM_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vminnm.v2f32(<2 x float> [[A]], <2 x float> [[B]])
-// CHECK-NEXT: [[VMINNM_V3_I:%.*]] = bitcast <2 x float> [[VMINNM_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <2 x float> [[VMINNM_V2_I]]
//
float32x2_t test_vminnm_f32(float32x2_t a, float32x2_t b) {
@@ -45,12 +36,9 @@ float32x2_t test_vminnm_f32(float32x2_t a, float32x2_t b) {
}
// CHECK-LABEL: define {{[^@]+}}@test_vminnmq_f32
-// CHECK-SAME: (<4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK-SAME: (<4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B]] to <16 x i8>
// CHECK-NEXT: [[VMINNMQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vminnm.v4f32(<4 x float> [[A]], <4 x float> [[B]])
-// CHECK-NEXT: [[VMINNMQ_V3_I:%.*]] = bitcast <4 x float> [[VMINNMQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x float> [[VMINNMQ_V2_I]]
//
float32x4_t test_vminnmq_f32(float32x4_t a, float32x4_t b) {
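(The deletions above are the purely mechanical part of the churn: clang's generic Neon builtin path emits <8 x i8>/<16 x i8> bitcasts around the intrinsic call, and since only the typed result is used, instcombine removes them as dead code. Sketch, Armv8 Neon assumed.)

#include <arm_neon.h>

// IEEE-754 maxNum semantics (a NaN operand loses); lowers to a single
// llvm.arm.neon.vmaxnm call with no surrounding bitcasts.
float32x2_t max_num(float32x2_t a, float32x2_t b) {
  return vmaxnm_f32(a, b);
}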
diff --git a/clang/test/CodeGen/arm-neon-vcvtX.c b/clang/test/CodeGen/arm-neon-vcvtX.c
index c087b92102c5bb..e9493fa1cf8cfd 100644
--- a/clang/test/CodeGen/arm-neon-vcvtX.c
+++ b/clang/test/CodeGen/arm-neon-vcvtX.c
@@ -1,5 +1,5 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature
-// RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu cortex-a57 -ffreestanding -disable-O0-optnone -emit-llvm %s -o - | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu cortex-a57 -ffreestanding -disable-O0-optnone -emit-llvm %s -o - | opt -S -passes=mem2reg,instcombine | FileCheck %s
// REQUIRES: aarch64-registered-target || arm-registered-target
@@ -8,7 +8,6 @@
// CHECK-LABEL: define {{[^@]+}}@test_vcvta_s32_f32
// CHECK-SAME: (<2 x float> noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
// CHECK-NEXT: [[VCVTA_S32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtas.v2i32.v2f32(<2 x float> [[A]])
// CHECK-NEXT: ret <2 x i32> [[VCVTA_S32_V1_I]]
//
@@ -19,7 +18,6 @@ int32x2_t test_vcvta_s32_f32(float32x2_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvta_u32_f32
// CHECK-SAME: (<2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
// CHECK-NEXT: [[VCVTA_U32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtau.v2i32.v2f32(<2 x float> [[A]])
// CHECK-NEXT: ret <2 x i32> [[VCVTA_U32_V1_I]]
//
@@ -28,9 +26,8 @@ uint32x2_t test_vcvta_u32_f32(float32x2_t a) {
}
// CHECK-LABEL: define {{[^@]+}}@test_vcvtaq_s32_f32
-// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
// CHECK-NEXT: [[VCVTAQ_S32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtas.v4i32.v4f32(<4 x float> [[A]])
// CHECK-NEXT: ret <4 x i32> [[VCVTAQ_S32_V1_I]]
//
@@ -39,9 +36,8 @@ int32x4_t test_vcvtaq_s32_f32(float32x4_t a) {
}
// CHECK-LABEL: define {{[^@]+}}@test_vcvtaq_u32_f32
-// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
// CHECK-NEXT: [[VCVTAQ_U32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtau.v4i32.v4f32(<4 x float> [[A]])
// CHECK-NEXT: ret <4 x i32> [[VCVTAQ_U32_V1_I]]
//
@@ -52,7 +48,6 @@ uint32x4_t test_vcvtaq_u32_f32(float32x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvtn_s32_f32
// CHECK-SAME: (<2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
// CHECK-NEXT: [[VCVTN_S32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtns.v2i32.v2f32(<2 x float> [[A]])
// CHECK-NEXT: ret <2 x i32> [[VCVTN_S32_V1_I]]
//
@@ -63,7 +58,6 @@ int32x2_t test_vcvtn_s32_f32(float32x2_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvtn_u32_f32
// CHECK-SAME: (<2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
// CHECK-NEXT: [[VCVTN_U32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtnu.v2i32.v2f32(<2 x float> [[A]])
// CHECK-NEXT: ret <2 x i32> [[VCVTN_U32_V1_I]]
//
@@ -72,9 +66,8 @@ uint32x2_t test_vcvtn_u32_f32(float32x2_t a) {
}
// CHECK-LABEL: define {{[^@]+}}@test_vcvtnq_s32_f32
-// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
// CHECK-NEXT: [[VCVTNQ_S32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtns.v4i32.v4f32(<4 x float> [[A]])
// CHECK-NEXT: ret <4 x i32> [[VCVTNQ_S32_V1_I]]
//
@@ -83,9 +76,8 @@ int32x4_t test_vcvtnq_s32_f32(float32x4_t a) {
}
// CHECK-LABEL: define {{[^@]+}}@test_vcvtnq_u32_f32
-// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
// CHECK-NEXT: [[VCVTNQ_U32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtnu.v4i32.v4f32(<4 x float> [[A]])
// CHECK-NEXT: ret <4 x i32> [[VCVTNQ_U32_V1_I]]
//
@@ -96,7 +88,6 @@ uint32x4_t test_vcvtnq_u32_f32(float32x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvtp_s32_f32
// CHECK-SAME: (<2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
// CHECK-NEXT: [[VCVTP_S32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtps.v2i32.v2f32(<2 x float> [[A]])
// CHECK-NEXT: ret <2 x i32> [[VCVTP_S32_V1_I]]
//
@@ -107,7 +98,6 @@ int32x2_t test_vcvtp_s32_f32(float32x2_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvtp_u32_f32
// CHECK-SAME: (<2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
// CHECK-NEXT: [[VCVTP_U32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtpu.v2i32.v2f32(<2 x float> [[A]])
// CHECK-NEXT: ret <2 x i32> [[VCVTP_U32_V1_I]]
//
@@ -116,9 +106,8 @@ uint32x2_t test_vcvtp_u32_f32(float32x2_t a) {
}
// CHECK-LABEL: define {{[^@]+}}@test_vcvtpq_s32_f32
-// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
// CHECK-NEXT: [[VCVTPQ_S32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtps.v4i32.v4f32(<4 x float> [[A]])
// CHECK-NEXT: ret <4 x i32> [[VCVTPQ_S32_V1_I]]
//
@@ -127,9 +116,8 @@ int32x4_t test_vcvtpq_s32_f32(float32x4_t a) {
}
// CHECK-LABEL: define {{[^@]+}}@test_vcvtpq_u32_f32
-// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
// CHECK-NEXT: [[VCVTPQ_U32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtpu.v4i32.v4f32(<4 x float> [[A]])
// CHECK-NEXT: ret <4 x i32> [[VCVTPQ_U32_V1_I]]
//
@@ -140,7 +128,6 @@ uint32x4_t test_vcvtpq_u32_f32(float32x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvtm_s32_f32
// CHECK-SAME: (<2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
// CHECK-NEXT: [[VCVTM_S32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtms.v2i32.v2f32(<2 x float> [[A]])
// CHECK-NEXT: ret <2 x i32> [[VCVTM_S32_V1_I]]
//
@@ -151,7 +138,6 @@ int32x2_t test_vcvtm_s32_f32(float32x2_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvtm_u32_f32
// CHECK-SAME: (<2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
// CHECK-NEXT: [[VCVTM_U32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtmu.v2i32.v2f32(<2 x float> [[A]])
// CHECK-NEXT: ret <2 x i32> [[VCVTM_U32_V1_I]]
//
@@ -160,9 +146,8 @@ uint32x2_t test_vcvtm_u32_f32(float32x2_t a) {
}
// CHECK-LABEL: define {{[^@]+}}@test_vcvtmq_s32_f32
-// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
// CHECK-NEXT: [[VCVTMQ_S32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtms.v4i32.v4f32(<4 x float> [[A]])
// CHECK-NEXT: ret <4 x i32> [[VCVTMQ_S32_V1_I]]
//
@@ -171,9 +156,8 @@ int32x4_t test_vcvtmq_s32_f32(float32x4_t a) {
}
// CHECK-LABEL: define {{[^@]+}}@test_vcvtmq_u32_f32
-// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
// CHECK-NEXT: [[VCVTMQ_U32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtmu.v4i32.v4f32(<4 x float> [[A]])
// CHECK-NEXT: ret <4 x i32> [[VCVTMQ_U32_V1_I]]
//
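(All the vcvtX deletions follow the same dead-bitcast pattern; the intrinsics themselves differ only in the rounding mode applied before the float-to-int conversion. Sketch, Armv8 Neon assumed.)

#include <arm_neon.h>

// Convert with round-to-nearest, ties away from zero (vcvta).
// vcvtn/vcvtp/vcvtm are identical apart from rounding toward
// nearest-even, +infinity, and -infinity respectively.
int32x2_t to_int_ties_away(float32x2_t a) {
  return vcvta_s32_f32(a);
}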
diff --git a/clang/test/CodeGen/arm-poly-add.c b/clang/test/CodeGen/arm-poly-add.c
index 201a03a5bc8b6d..187ac6e8cd6a74 100644
--- a/clang/test/CodeGen/arm-poly-add.c
+++ b/clang/test/CodeGen/arm-poly-add.c
@@ -1,73 +1,68 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
// REQUIRES: arm-registered-target
// RUN: %clang_cc1 -triple armv8.2a-arm-none-eabi \
// RUN: -target-feature +neon \
// RUN: -mfloat-abi hard \
-// RUN: -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg \
+// RUN: -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine \
// RUN: | FileCheck %s
#include <arm_neon.h>
-// CHECK-LABEL: @test_vadd_p8(
-// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = xor <8 x i8> [[A:%.*]], [[B:%.*]]
+// CHECK-LABEL: define dso_local arm_aapcs_vfpcc <8 x i8> @test_vadd_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = xor <8 x i8> [[A]], [[B]]
// CHECK-NEXT: ret <8 x i8> [[TMP0]]
//
poly8x8_t test_vadd_p8(poly8x8_t a, poly8x8_t b) {
return vadd_p8 (a, b);
}
-// CHECK-LABEL: @test_vadd_p16(
-// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = xor <8 x i8> [[TMP0]], [[TMP1]]
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
-// CHECK-NEXT: ret <4 x i16> [[TMP3]]
+// CHECK-LABEL: define dso_local arm_aapcs_vfpcc <4 x i16> @test_vadd_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = xor <4 x i16> [[A]], [[B]]
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
//
poly16x4_t test_vadd_p16(poly16x4_t a, poly16x4_t b) {
return vadd_p16 (a, b);
}
-// CHECK-LABEL: @test_vadd_p64(
-// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = xor <8 x i8> [[TMP0]], [[TMP1]]
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x i64>
-// CHECK-NEXT: ret <1 x i64> [[TMP3]]
+// CHECK-LABEL: define dso_local arm_aapcs_vfpcc <1 x i64> @test_vadd_p64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = xor <1 x i64> [[A]], [[B]]
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
//
poly64x1_t test_vadd_p64(poly64x1_t a, poly64x1_t b) {
return vadd_p64(a, b);
}
-// CHECK-LABEL: @test_vaddq_p8(
-// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = xor <16 x i8> [[A:%.*]], [[B:%.*]]
+// CHECK-LABEL: define dso_local arm_aapcs_vfpcc <16 x i8> @test_vaddq_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = xor <16 x i8> [[A]], [[B]]
// CHECK-NEXT: ret <16 x i8> [[TMP0]]
//
poly8x16_t test_vaddq_p8(poly8x16_t a, poly8x16_t b){
return vaddq_p8(a, b);
}
-// CHECK-LABEL: @test_vaddq_p16(
-// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = xor <16 x i8> [[TMP0]], [[TMP1]]
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
-// CHECK-NEXT: ret <8 x i16> [[TMP3]]
+// CHECK-LABEL: define dso_local arm_aapcs_vfpcc <8 x i16> @test_vaddq_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = xor <8 x i16> [[A]], [[B]]
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
//
poly16x8_t test_vaddq_p16(poly16x8_t a, poly16x8_t b){
return vaddq_p16(a, b);
}
-// CHECK-LABEL: @test_vaddq_p64(
-// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = xor <16 x i8> [[TMP0]], [[TMP1]]
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64>
-// CHECK-NEXT: ret <2 x i64> [[TMP3]]
+// CHECK-LABEL: define dso_local arm_aapcs_vfpcc <2 x i64> @test_vaddq_p64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = xor <2 x i64> [[A]], [[B]]
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
//
poly64x2_t test_vaddq_p64(poly64x2_t a, poly64x2_t b){
return vaddq_p64(a, b);
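(Polynomial addition over GF(2) is carry-less, i.e. a lane-wise XOR, which is why every vadd_p*/vaddq_p* above reduces to a single xor; with instcombine the bitcast-to-bytes/xor/bitcast-back sequence folds into an xor on the original element type. A sketch, assuming the Armv8.2-A target from the RUN line.)

#include <arm_neon.h>

// Carry-less add: each lane is a ^ b, with no carries between bits.
poly16x4_t poly_add(poly16x4_t a, poly16x4_t b) {
  return vadd_p16(a, b);
}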
diff --git a/clang/test/CodeGen/arm-v8.1a-neon-intrinsics.c b/clang/test/CodeGen/arm-v8.1a-neon-intrinsics.c
index 555f8ccba7c3c0..a15c67ff46487b 100644
--- a/clang/test/CodeGen/arm-v8.1a-neon-intrinsics.c
+++ b/clang/test/CodeGen/arm-v8.1a-neon-intrinsics.c
@@ -1,10 +1,10 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
// RUN: %clang_cc1 -triple armv8.1a-linux-gnu -target-abi apcs-gnu -target-feature +neon \
-// RUN: -emit-llvm -o - %s -disable-O0-optnone | opt -passes=mem2reg,dce -S \
+// RUN: -emit-llvm -o - %s -disable-O0-optnone | opt -passes=mem2reg,instcombine,dce -S \
// RUN: | FileCheck %s --check-prefix=CHECK-ARM
// RUN: %clang_cc1 -triple aarch64-linux-gnu -target-feature +neon \
-// RUN: -target-feature +v8.1a -emit-llvm -o - %s -disable-O0-optnone | opt -passes=mem2reg,dce -S \
+// RUN: -target-feature +v8.1a -emit-llvm -o - %s -disable-O0-optnone | opt -passes=mem2reg,instcombine,dce -S \
// RUN: | FileCheck %s --check-prefix=CHECK-AARCH64
// REQUIRES: arm-registered-target,aarch64-registered-target
@@ -13,13 +13,13 @@
// CHECK-ARM-LABEL: @test_vqrdmlah_s16(
// CHECK-ARM-NEXT: entry:
-// CHECK-ARM-NEXT: [[VQRDMLAH_V3_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmlah.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[C:%.*]])
-// CHECK-ARM-NEXT: ret <4 x i16> [[VQRDMLAH_V3_I]]
+// CHECK-ARM-NEXT: [[VQRDMLAH_S163_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmlah.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[C:%.*]])
+// CHECK-ARM-NEXT: ret <4 x i16> [[VQRDMLAH_S163_I]]
//
// CHECK-AARCH64-LABEL: @test_vqrdmlah_s16(
// CHECK-AARCH64-NEXT: entry:
-// CHECK-AARCH64-NEXT: [[VQRDMLAH_V3_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlah.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[C:%.*]])
-// CHECK-AARCH64-NEXT: ret <4 x i16> [[VQRDMLAH_V3_I]]
+// CHECK-AARCH64-NEXT: [[VQRDMLAH_S163_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlah.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[C:%.*]])
+// CHECK-AARCH64-NEXT: ret <4 x i16> [[VQRDMLAH_S163_I]]
//
int16x4_t test_vqrdmlah_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
@@ -28,13 +28,13 @@ int16x4_t test_vqrdmlah_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
// CHECK-ARM-LABEL: @test_vqrdmlah_s32(
// CHECK-ARM-NEXT: entry:
-// CHECK-ARM-NEXT: [[VQRDMLAH_V3_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmlah.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[C:%.*]])
-// CHECK-ARM-NEXT: ret <2 x i32> [[VQRDMLAH_V3_I]]
+// CHECK-ARM-NEXT: [[VQRDMLAH_S323_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmlah.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[C:%.*]])
+// CHECK-ARM-NEXT: ret <2 x i32> [[VQRDMLAH_S323_I]]
//
// CHECK-AARCH64-LABEL: @test_vqrdmlah_s32(
// CHECK-AARCH64-NEXT: entry:
-// CHECK-AARCH64-NEXT: [[VQRDMLAH_V3_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmlah.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[C:%.*]])
-// CHECK-AARCH64-NEXT: ret <2 x i32> [[VQRDMLAH_V3_I]]
+// CHECK-AARCH64-NEXT: [[VQRDMLAH_S323_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmlah.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[C:%.*]])
+// CHECK-AARCH64-NEXT: ret <2 x i32> [[VQRDMLAH_S323_I]]
//
int32x2_t test_vqrdmlah_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
@@ -43,13 +43,13 @@ int32x2_t test_vqrdmlah_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
// CHECK-ARM-LABEL: @test_vqrdmlahq_s16(
// CHECK-ARM-NEXT: entry:
-// CHECK-ARM-NEXT: [[VQRDMLAHQ_V3_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmlah.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]])
-// CHECK-ARM-NEXT: ret <8 x i16> [[VQRDMLAHQ_V3_I]]
+// CHECK-ARM-NEXT: [[VQRDMLAHQ_S163_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmlah.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]])
+// CHECK-ARM-NEXT: ret <8 x i16> [[VQRDMLAHQ_S163_I]]
//
// CHECK-AARCH64-LABEL: @test_vqrdmlahq_s16(
// CHECK-AARCH64-NEXT: entry:
-// CHECK-AARCH64-NEXT: [[VQRDMLAHQ_V3_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmlah.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]])
-// CHECK-AARCH64-NEXT: ret <8 x i16> [[VQRDMLAHQ_V3_I]]
+// CHECK-AARCH64-NEXT: [[VQRDMLAHQ_S163_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmlah.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]])
+// CHECK-AARCH64-NEXT: ret <8 x i16> [[VQRDMLAHQ_S163_I]]
//
int16x8_t test_vqrdmlahq_s16(int16x8_t a, int16x8_t b, int16x8_t c) {
@@ -58,13 +58,13 @@ int16x8_t test_vqrdmlahq_s16(int16x8_t a, int16x8_t b, int16x8_t c) {
// CHECK-ARM-LABEL: @test_vqrdmlahq_s32(
// CHECK-ARM-NEXT: entry:
-// CHECK-ARM-NEXT: [[VQRDMLAHQ_V3_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmlah.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]])
-// CHECK-ARM-NEXT: ret <4 x i32> [[VQRDMLAHQ_V3_I]]
+// CHECK-ARM-NEXT: [[VQRDMLAHQ_S323_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmlah.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]])
+// CHECK-ARM-NEXT: ret <4 x i32> [[VQRDMLAHQ_S323_I]]
//
// CHECK-AARCH64-LABEL: @test_vqrdmlahq_s32(
// CHECK-AARCH64-NEXT: entry:
-// CHECK-AARCH64-NEXT: [[VQRDMLAHQ_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmlah.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]])
-// CHECK-AARCH64-NEXT: ret <4 x i32> [[VQRDMLAHQ_V3_I]]
+// CHECK-AARCH64-NEXT: [[VQRDMLAHQ_S323_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmlah.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]])
+// CHECK-AARCH64-NEXT: ret <4 x i32> [[VQRDMLAHQ_S323_I]]
//
int32x4_t test_vqrdmlahq_s32(int32x4_t a, int32x4_t b, int32x4_t c) {
@@ -73,19 +73,15 @@ int32x4_t test_vqrdmlahq_s32(int32x4_t a, int32x4_t b, int32x4_t c) {
// CHECK-ARM-LABEL: @test_vqrdmlah_lane_s16(
// CHECK-ARM-NEXT: entry:
-// CHECK-ARM-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
-// CHECK-ARM-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-ARM-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-ARM-NEXT: [[VQRDMLAH_V3_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmlah.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[LANE]])
-// CHECK-ARM-NEXT: ret <4 x i16> [[VQRDMLAH_V3_I]]
+// CHECK-ARM-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[C:%.*]], <4 x i16> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-ARM-NEXT: [[VQRDMLAH_S163_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmlah.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[LANE]])
+// CHECK-ARM-NEXT: ret <4 x i16> [[VQRDMLAH_S163_I]]
//
// CHECK-AARCH64-LABEL: @test_vqrdmlah_lane_s16(
// CHECK-AARCH64-NEXT: entry:
-// CHECK-AARCH64-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
-// CHECK-AARCH64-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-AARCH64-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-AARCH64-NEXT: [[VQRDMLAH_V3_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlah.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[LANE]])
-// CHECK-AARCH64-NEXT: ret <4 x i16> [[VQRDMLAH_V3_I]]
+// CHECK-AARCH64-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[C:%.*]], <4 x i16> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-AARCH64-NEXT: [[VQRDMLAH_S163_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlah.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[LANE]])
+// CHECK-AARCH64-NEXT: ret <4 x i16> [[VQRDMLAH_S163_I]]
//
int16x4_t test_vqrdmlah_lane_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
@@ -94,19 +90,15 @@ int16x4_t test_vqrdmlah_lane_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
// CHECK-ARM-LABEL: @test_vqrdmlah_lane_s32(
// CHECK-ARM-NEXT: entry:
-// CHECK-ARM-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
-// CHECK-ARM-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-ARM-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK-ARM-NEXT: [[VQRDMLAH_V3_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmlah.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[LANE]])
-// CHECK-ARM-NEXT: ret <2 x i32> [[VQRDMLAH_V3_I]]
+// CHECK-ARM-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[C:%.*]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
+// CHECK-ARM-NEXT: [[VQRDMLAH_S323_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmlah.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[LANE]])
+// CHECK-ARM-NEXT: ret <2 x i32> [[VQRDMLAH_S323_I]]
//
// CHECK-AARCH64-LABEL: @test_vqrdmlah_lane_s32(
// CHECK-AARCH64-NEXT: entry:
-// CHECK-AARCH64-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
-// CHECK-AARCH64-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-AARCH64-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK-AARCH64-NEXT: [[VQRDMLAH_V3_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmlah.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[LANE]])
-// CHECK-AARCH64-NEXT: ret <2 x i32> [[VQRDMLAH_V3_I]]
+// CHECK-AARCH64-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[C:%.*]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
+// CHECK-AARCH64-NEXT: [[VQRDMLAH_S323_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmlah.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[LANE]])
+// CHECK-AARCH64-NEXT: ret <2 x i32> [[VQRDMLAH_S323_I]]
//
int32x2_t test_vqrdmlah_lane_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
@@ -115,19 +107,15 @@ int32x2_t test_vqrdmlah_lane_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
// CHECK-ARM-LABEL: @test_vqrdmlahq_lane_s16(
// CHECK-ARM-NEXT: entry:
-// CHECK-ARM-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
-// CHECK-ARM-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-ARM-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-// CHECK-ARM-NEXT: [[VQRDMLAHQ_V3_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmlah.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[LANE]])
-// CHECK-ARM-NEXT: ret <8 x i16> [[VQRDMLAHQ_V3_I]]
+// CHECK-ARM-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[C:%.*]], <4 x i16> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK-ARM-NEXT: [[VQRDMLAHQ_S163_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmlah.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[LANE]])
+// CHECK-ARM-NEXT: ret <8 x i16> [[VQRDMLAHQ_S163_I]]
//
// CHECK-AARCH64-LABEL: @test_vqrdmlahq_lane_s16(
// CHECK-AARCH64-NEXT: entry:
-// CHECK-AARCH64-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
-// CHECK-AARCH64-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-AARCH64-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-// CHECK-AARCH64-NEXT: [[VQRDMLAHQ_V3_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmlah.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[LANE]])
-// CHECK-AARCH64-NEXT: ret <8 x i16> [[VQRDMLAHQ_V3_I]]
+// CHECK-AARCH64-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[C:%.*]], <4 x i16> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK-AARCH64-NEXT: [[VQRDMLAHQ_S163_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmlah.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[LANE]])
+// CHECK-AARCH64-NEXT: ret <8 x i16> [[VQRDMLAHQ_S163_I]]
//
int16x8_t test_vqrdmlahq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t c) {
@@ -136,19 +124,15 @@ int16x8_t test_vqrdmlahq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t c) {
// CHECK-ARM-LABEL: @test_vqrdmlahq_lane_s32(
// CHECK-ARM-NEXT: entry:
-// CHECK-ARM-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
-// CHECK-ARM-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-ARM-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-// CHECK-ARM-NEXT: [[VQRDMLAHQ_V3_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmlah.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[LANE]])
-// CHECK-ARM-NEXT: ret <4 x i32> [[VQRDMLAHQ_V3_I]]
+// CHECK-ARM-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[C:%.*]], <2 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK-ARM-NEXT: [[VQRDMLAHQ_S323_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmlah.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[LANE]])
+// CHECK-ARM-NEXT: ret <4 x i32> [[VQRDMLAHQ_S323_I]]
//
// CHECK-AARCH64-LABEL: @test_vqrdmlahq_lane_s32(
// CHECK-AARCH64-NEXT: entry:
-// CHECK-AARCH64-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
-// CHECK-AARCH64-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-AARCH64-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-// CHECK-AARCH64-NEXT: [[VQRDMLAHQ_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmlah.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[LANE]])
-// CHECK-AARCH64-NEXT: ret <4 x i32> [[VQRDMLAHQ_V3_I]]
+// CHECK-AARCH64-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[C:%.*]], <2 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK-AARCH64-NEXT: [[VQRDMLAHQ_S323_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmlah.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[LANE]])
+// CHECK-AARCH64-NEXT: ret <4 x i32> [[VQRDMLAHQ_S323_I]]
//
int32x4_t test_vqrdmlahq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t c) {
@@ -157,13 +141,13 @@ int32x4_t test_vqrdmlahq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t c) {
// CHECK-ARM-LABEL: @test_vqrdmlsh_s16(
// CHECK-ARM-NEXT: entry:
-// CHECK-ARM-NEXT: [[VQRDMLSH_V3_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmlsh.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[C:%.*]])
-// CHECK-ARM-NEXT: ret <4 x i16> [[VQRDMLSH_V3_I]]
+// CHECK-ARM-NEXT: [[VQRDMLSH_S163_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmlsh.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[C:%.*]])
+// CHECK-ARM-NEXT: ret <4 x i16> [[VQRDMLSH_S163_I]]
//
// CHECK-AARCH64-LABEL: @test_vqrdmlsh_s16(
// CHECK-AARCH64-NEXT: entry:
-// CHECK-AARCH64-NEXT: [[VQRDMLSH_V3_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlsh.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[C:%.*]])
-// CHECK-AARCH64-NEXT: ret <4 x i16> [[VQRDMLSH_V3_I]]
+// CHECK-AARCH64-NEXT: [[VQRDMLSH_S163_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlsh.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[C:%.*]])
+// CHECK-AARCH64-NEXT: ret <4 x i16> [[VQRDMLSH_S163_I]]
//
int16x4_t test_vqrdmlsh_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
@@ -172,13 +156,13 @@ int16x4_t test_vqrdmlsh_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
// CHECK-ARM-LABEL: @test_vqrdmlsh_s32(
// CHECK-ARM-NEXT: entry:
-// CHECK-ARM-NEXT: [[VQRDMLSH_V3_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmlsh.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[C:%.*]])
-// CHECK-ARM-NEXT: ret <2 x i32> [[VQRDMLSH_V3_I]]
+// CHECK-ARM-NEXT: [[VQRDMLSH_S323_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmlsh.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[C:%.*]])
+// CHECK-ARM-NEXT: ret <2 x i32> [[VQRDMLSH_S323_I]]
//
// CHECK-AARCH64-LABEL: @test_vqrdmlsh_s32(
// CHECK-AARCH64-NEXT: entry:
-// CHECK-AARCH64-NEXT: [[VQRDMLSH_V3_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmlsh.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[C:%.*]])
-// CHECK-AARCH64-NEXT: ret <2 x i32> [[VQRDMLSH_V3_I]]
+// CHECK-AARCH64-NEXT: [[VQRDMLSH_S323_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmlsh.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[C:%.*]])
+// CHECK-AARCH64-NEXT: ret <2 x i32> [[VQRDMLSH_S323_I]]
//
int32x2_t test_vqrdmlsh_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
@@ -187,13 +171,13 @@ int32x2_t test_vqrdmlsh_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
// CHECK-ARM-LABEL: @test_vqrdmlshq_s16(
// CHECK-ARM-NEXT: entry:
-// CHECK-ARM-NEXT: [[VQRDMLSHQ_V3_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmlsh.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]])
-// CHECK-ARM-NEXT: ret <8 x i16> [[VQRDMLSHQ_V3_I]]
+// CHECK-ARM-NEXT: [[VQRDMLSHQ_S163_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmlsh.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]])
+// CHECK-ARM-NEXT: ret <8 x i16> [[VQRDMLSHQ_S163_I]]
//
// CHECK-AARCH64-LABEL: @test_vqrdmlshq_s16(
// CHECK-AARCH64-NEXT: entry:
-// CHECK-AARCH64-NEXT: [[VQRDMLSHQ_V3_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmlsh.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]])
-// CHECK-AARCH64-NEXT: ret <8 x i16> [[VQRDMLSHQ_V3_I]]
+// CHECK-AARCH64-NEXT: [[VQRDMLSHQ_S163_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmlsh.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]])
+// CHECK-AARCH64-NEXT: ret <8 x i16> [[VQRDMLSHQ_S163_I]]
//
int16x8_t test_vqrdmlshq_s16(int16x8_t a, int16x8_t b, int16x8_t c) {
@@ -202,13 +186,13 @@ int16x8_t test_vqrdmlshq_s16(int16x8_t a, int16x8_t b, int16x8_t c) {
// CHECK-ARM-LABEL: @test_vqrdmlshq_s32(
// CHECK-ARM-NEXT: entry:
-// CHECK-ARM-NEXT: [[VQRDMLSHQ_V3_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmlsh.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]])
-// CHECK-ARM-NEXT: ret <4 x i32> [[VQRDMLSHQ_V3_I]]
+// CHECK-ARM-NEXT: [[VQRDMLSHQ_S323_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmlsh.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]])
+// CHECK-ARM-NEXT: ret <4 x i32> [[VQRDMLSHQ_S323_I]]
//
// CHECK-AARCH64-LABEL: @test_vqrdmlshq_s32(
// CHECK-AARCH64-NEXT: entry:
-// CHECK-AARCH64-NEXT: [[VQRDMLSHQ_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmlsh.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]])
-// CHECK-AARCH64-NEXT: ret <4 x i32> [[VQRDMLSHQ_V3_I]]
+// CHECK-AARCH64-NEXT: [[VQRDMLSHQ_S323_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmlsh.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]])
+// CHECK-AARCH64-NEXT: ret <4 x i32> [[VQRDMLSHQ_S323_I]]
//
int32x4_t test_vqrdmlshq_s32(int32x4_t a, int32x4_t b, int32x4_t c) {
@@ -217,19 +201,15 @@ int32x4_t test_vqrdmlshq_s32(int32x4_t a, int32x4_t b, int32x4_t c) {
// CHECK-ARM-LABEL: @test_vqrdmlsh_lane_s16(
// CHECK-ARM-NEXT: entry:
-// CHECK-ARM-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
-// CHECK-ARM-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-ARM-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-ARM-NEXT: [[VQRDMLSH_V3_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmlsh.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[LANE]])
-// CHECK-ARM-NEXT: ret <4 x i16> [[VQRDMLSH_V3_I]]
+// CHECK-ARM-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[C:%.*]], <4 x i16> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-ARM-NEXT: [[VQRDMLSH_S163_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmlsh.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[LANE]])
+// CHECK-ARM-NEXT: ret <4 x i16> [[VQRDMLSH_S163_I]]
//
// CHECK-AARCH64-LABEL: @test_vqrdmlsh_lane_s16(
// CHECK-AARCH64-NEXT: entry:
-// CHECK-AARCH64-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
-// CHECK-AARCH64-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-AARCH64-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-AARCH64-NEXT: [[VQRDMLSH_V3_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlsh.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[LANE]])
-// CHECK-AARCH64-NEXT: ret <4 x i16> [[VQRDMLSH_V3_I]]
+// CHECK-AARCH64-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[C:%.*]], <4 x i16> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-AARCH64-NEXT: [[VQRDMLSH_S163_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlsh.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[LANE]])
+// CHECK-AARCH64-NEXT: ret <4 x i16> [[VQRDMLSH_S163_I]]
//
int16x4_t test_vqrdmlsh_lane_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
@@ -238,19 +218,15 @@ int16x4_t test_vqrdmlsh_lane_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
// CHECK-ARM-LABEL: @test_vqrdmlsh_lane_s32(
// CHECK-ARM-NEXT: entry:
-// CHECK-ARM-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
-// CHECK-ARM-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-ARM-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK-ARM-NEXT: [[VQRDMLSH_V3_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmlsh.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[LANE]])
-// CHECK-ARM-NEXT: ret <2 x i32> [[VQRDMLSH_V3_I]]
+// CHECK-ARM-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[C:%.*]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
+// CHECK-ARM-NEXT: [[VQRDMLSH_S323_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmlsh.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[LANE]])
+// CHECK-ARM-NEXT: ret <2 x i32> [[VQRDMLSH_S323_I]]
//
// CHECK-AARCH64-LABEL: @test_vqrdmlsh_lane_s32(
// CHECK-AARCH64-NEXT: entry:
-// CHECK-AARCH64-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
-// CHECK-AARCH64-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-AARCH64-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK-AARCH64-NEXT: [[VQRDMLSH_V3_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmlsh.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[LANE]])
-// CHECK-AARCH64-NEXT: ret <2 x i32> [[VQRDMLSH_V3_I]]
+// CHECK-AARCH64-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[C:%.*]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
+// CHECK-AARCH64-NEXT: [[VQRDMLSH_S323_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmlsh.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[LANE]])
+// CHECK-AARCH64-NEXT: ret <2 x i32> [[VQRDMLSH_S323_I]]
//
int32x2_t test_vqrdmlsh_lane_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
@@ -259,19 +235,15 @@ int32x2_t test_vqrdmlsh_lane_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
// CHECK-ARM-LABEL: @test_vqrdmlshq_lane_s16(
// CHECK-ARM-NEXT: entry:
-// CHECK-ARM-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
-// CHECK-ARM-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-ARM-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-// CHECK-ARM-NEXT: [[VQRDMLSHQ_V3_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmlsh.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[LANE]])
-// CHECK-ARM-NEXT: ret <8 x i16> [[VQRDMLSHQ_V3_I]]
+// CHECK-ARM-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[C:%.*]], <4 x i16> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK-ARM-NEXT: [[VQRDMLSHQ_S163_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmlsh.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[LANE]])
+// CHECK-ARM-NEXT: ret <8 x i16> [[VQRDMLSHQ_S163_I]]
//
// CHECK-AARCH64-LABEL: @test_vqrdmlshq_lane_s16(
// CHECK-AARCH64-NEXT: entry:
-// CHECK-AARCH64-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
-// CHECK-AARCH64-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-AARCH64-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-// CHECK-AARCH64-NEXT: [[VQRDMLSHQ_V3_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmlsh.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[LANE]])
-// CHECK-AARCH64-NEXT: ret <8 x i16> [[VQRDMLSHQ_V3_I]]
+// CHECK-AARCH64-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[C:%.*]], <4 x i16> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK-AARCH64-NEXT: [[VQRDMLSHQ_S163_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmlsh.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[LANE]])
+// CHECK-AARCH64-NEXT: ret <8 x i16> [[VQRDMLSHQ_S163_I]]
//
int16x8_t test_vqrdmlshq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t c) {
@@ -280,19 +252,15 @@ int16x8_t test_vqrdmlshq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t c) {
// CHECK-ARM-LABEL: @test_vqrdmlshq_lane_s32(
// CHECK-ARM-NEXT: entry:
-// CHECK-ARM-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
-// CHECK-ARM-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-ARM-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-// CHECK-ARM-NEXT: [[VQRDMLSHQ_V3_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmlsh.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[LANE]])
-// CHECK-ARM-NEXT: ret <4 x i32> [[VQRDMLSHQ_V3_I]]
+// CHECK-ARM-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[C:%.*]], <2 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK-ARM-NEXT: [[VQRDMLSHQ_S323_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmlsh.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[LANE]])
+// CHECK-ARM-NEXT: ret <4 x i32> [[VQRDMLSHQ_S323_I]]
//
// CHECK-AARCH64-LABEL: @test_vqrdmlshq_lane_s32(
// CHECK-AARCH64-NEXT: entry:
-// CHECK-AARCH64-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
-// CHECK-AARCH64-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-AARCH64-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-// CHECK-AARCH64-NEXT: [[VQRDMLSHQ_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmlsh.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[LANE]])
-// CHECK-AARCH64-NEXT: ret <4 x i32> [[VQRDMLSHQ_V3_I]]
+// CHECK-AARCH64-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[C:%.*]], <2 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK-AARCH64-NEXT: [[VQRDMLSHQ_S323_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmlsh.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[LANE]])
+// CHECK-AARCH64-NEXT: ret <4 x i32> [[VQRDMLSHQ_S323_I]]
//
int32x4_t test_vqrdmlshq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t c) {
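For reference, a minimal scalar sketch of the per-lane operation the SQRDMLAH/SQRDMLSH checks above are matching, assuming the usual round-double-accumulate-saturate definition; the helper name is hypothetical and for illustration only:

#include <stdint.h>

/* One s16 lane of vqrdmlah_s16: acc = (a << 16) + 2*b*c + rounding
   constant, then take the saturated high half.  vqrdmlsh_s16 is the
   same with 2*b*c subtracted instead of added. */
static int16_t sqrdmlah_lane_s16(int16_t a, int16_t b, int16_t c) {
  int64_t acc = ((int64_t)a << 16) + 2 * (int64_t)b * (int64_t)c + (1 << 15);
  acc >>= 16;
  if (acc > INT16_MAX) acc = INT16_MAX;
  if (acc < INT16_MIN) acc = INT16_MIN;
  return (int16_t)acc;
}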
diff --git a/clang/test/CodeGen/arm-v8.2a-neon-intrinsics-generic.c b/clang/test/CodeGen/arm-v8.2a-neon-intrinsics-generic.c
index 5f1cb34e6603d8..d8ed462d972524 100644
--- a/clang/test/CodeGen/arm-v8.2a-neon-intrinsics-generic.c
+++ b/clang/test/CodeGen/arm-v8.2a-neon-intrinsics-generic.c
@@ -1,11 +1,11 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
// RUN: %clang_cc1 -triple armv8.2a-linux-gnu -target-abi apcs-gnu -target-feature +neon -target-feature -fullfp16 \
// RUN: -disable-O0-optnone -emit-llvm -o - %s \
-// RUN: | opt -S -passes=sroa \
+// RUN: | opt -S -passes=mem2reg,sroa,instcombine \
// RUN: | FileCheck %s --check-prefixes=CHECK-NOFP16
// RUN: %clang_cc1 -triple armv8a-linux-gnu -target-abi apcs-gnu -target-feature +neon -target-feature +fullfp16 \
// RUN: -disable-O0-optnone -emit-llvm -o - %s \
-// RUN: | opt -S -passes=sroa \
+// RUN: | opt -S -passes=mem2reg,sroa,instcombine \
// RUN: | FileCheck %s --check-prefixes=CHECK-FP16
// REQUIRES: arm-registered-target
@@ -15,21 +15,12 @@
// CHECK-NOFP16-LABEL: define dso_local <2 x i32> @test_vbsl_f16(
// CHECK-NOFP16-SAME: <4 x i16> noundef [[A:%.*]], <2 x i32> noundef [[B_COERCE:%.*]], <2 x i32> noundef [[C_COERCE:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK-NOFP16-NEXT: entry:
-// CHECK-NOFP16-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B_COERCE]] to <4 x half>
-// CHECK-NOFP16-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[C_COERCE]] to <4 x half>
-// CHECK-NOFP16-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[TMP0]] to <2 x i32>
-// CHECK-NOFP16-NEXT: [[TMP3:%.*]] = bitcast <4 x half> [[TMP1]] to <2 x i32>
-// CHECK-NOFP16-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP2]] to <4 x half>
-// CHECK-NOFP16-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP3]] to <4 x half>
-// CHECK-NOFP16-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NOFP16-NEXT: [[TMP7:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
-// CHECK-NOFP16-NEXT: [[TMP8:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
-// CHECK-NOFP16-NEXT: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP6]], <8 x i8> [[TMP7]], <8 x i8> [[TMP8]])
-// CHECK-NOFP16-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <4 x half>
-// CHECK-NOFP16-NEXT: [[TMP10:%.*]] = bitcast <4 x half> [[TMP9]] to <2 x i32>
-// CHECK-NOFP16-NEXT: [[TMP11:%.*]] = bitcast <2 x i32> [[TMP10]] to <4 x half>
-// CHECK-NOFP16-NEXT: [[TMP12:%.*]] = bitcast <4 x half> [[TMP11]] to <2 x i32>
-// CHECK-NOFP16-NEXT: ret <2 x i32> [[TMP12]]
+// CHECK-NOFP16-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NOFP16-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B_COERCE]] to <8 x i8>
+// CHECK-NOFP16-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[C_COERCE]] to <8 x i8>
+// CHECK-NOFP16-NEXT: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]])
+// CHECK-NOFP16-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <2 x i32>
+// CHECK-NOFP16-NEXT: ret <2 x i32> [[TMP3]]
//
// CHECK-FP16-LABEL: define dso_local <4 x half> @test_vbsl_f16(
// CHECK-FP16-SAME: <4 x i16> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0:[0-9]+]] {
@@ -48,21 +39,12 @@ float16x4_t test_vbsl_f16(uint16x4_t a, float16x4_t b, float16x4_t c) {
// CHECK-NOFP16-LABEL: define dso_local <4 x i32> @test_vbslq_f16(
// CHECK-NOFP16-SAME: <8 x i16> noundef [[A:%.*]], <4 x i32> noundef [[B_COERCE:%.*]], <4 x i32> noundef [[C_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NOFP16-NEXT: entry:
-// CHECK-NOFP16-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[B_COERCE]] to <8 x half>
-// CHECK-NOFP16-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[C_COERCE]] to <8 x half>
-// CHECK-NOFP16-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[TMP0]] to <4 x i32>
-// CHECK-NOFP16-NEXT: [[TMP3:%.*]] = bitcast <8 x half> [[TMP1]] to <4 x i32>
-// CHECK-NOFP16-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to <8 x half>
-// CHECK-NOFP16-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x half>
-// CHECK-NOFP16-NEXT: [[TMP6:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NOFP16-NEXT: [[TMP7:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
-// CHECK-NOFP16-NEXT: [[TMP8:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
-// CHECK-NOFP16-NEXT: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP6]], <16 x i8> [[TMP7]], <16 x i8> [[TMP8]])
-// CHECK-NOFP16-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <8 x half>
-// CHECK-NOFP16-NEXT: [[TMP10:%.*]] = bitcast <8 x half> [[TMP9]] to <4 x i32>
-// CHECK-NOFP16-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <8 x half>
-// CHECK-NOFP16-NEXT: [[TMP12:%.*]] = bitcast <8 x half> [[TMP11]] to <4 x i32>
-// CHECK-NOFP16-NEXT: ret <4 x i32> [[TMP12]]
+// CHECK-NOFP16-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NOFP16-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B_COERCE]] to <16 x i8>
+// CHECK-NOFP16-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[C_COERCE]] to <16 x i8>
+// CHECK-NOFP16-NEXT: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
+// CHECK-NOFP16-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <4 x i32>
+// CHECK-NOFP16-NEXT: ret <4 x i32> [[TMP3]]
//
// CHECK-FP16-LABEL: define dso_local <8 x half> @test_vbslq_f16(
// CHECK-FP16-SAME: <8 x i16> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] {
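The vbsl checks above operate purely on bit patterns, which is why the IR is emitted on <8 x i8>/<16 x i8> regardless of the element type; a minimal sketch of one 16-bit lane, with a hypothetical helper name:

#include <stdint.h>

/* Bitwise select: take each bit from b where the mask bit is 1,
   otherwise from c; fp16 payloads are treated as raw bits. */
static uint16_t bsl_lane_u16(uint16_t mask, uint16_t b, uint16_t c) {
  return (uint16_t)((mask & b) | (~mask & c));
}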
@@ -81,35 +63,23 @@ float16x8_t test_vbslq_f16(uint16x8_t a, float16x8_t b, float16x8_t c) {
// CHECK-NOFP16-LABEL: define dso_local void @test_vzip_f16(
// CHECK-NOFP16-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT16X4X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <2 x i32> noundef [[A_COERCE:%.*]], <2 x i32> noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NOFP16-NEXT: entry:
-// CHECK-NOFP16-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A_COERCE]] to <4 x half>
-// CHECK-NOFP16-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B_COERCE]] to <4 x half>
-// CHECK-NOFP16-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[TMP0]] to <2 x i32>
-// CHECK-NOFP16-NEXT: [[TMP3:%.*]] = bitcast <4 x half> [[TMP1]] to <2 x i32>
-// CHECK-NOFP16-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META3:![0-9]+]])
-// CHECK-NOFP16-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP2]] to <4 x half>
-// CHECK-NOFP16-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP3]] to <4 x half>
-// CHECK-NOFP16-NEXT: [[TMP6:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
-// CHECK-NOFP16-NEXT: [[TMP7:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
-// CHECK-NOFP16-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
-// CHECK-NOFP16-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
-// CHECK-NOFP16-NEXT: [[VZIP_I:%.*]] = shufflevector <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
-// CHECK-NOFP16-NEXT: store <4 x i16> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META3]]
-// CHECK-NOFP16-NEXT: [[TMP10:%.*]] = getelementptr inbounds <4 x i16>, ptr [[AGG_RESULT]], i32 1
-// CHECK-NOFP16-NEXT: [[VZIP3_I:%.*]] = shufflevector <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], <4 x i32> <i32 2, i32 6, i32 3, i32 7>
-// CHECK-NOFP16-NEXT: store <4 x i16> [[VZIP3_I]], ptr [[TMP10]], align 4, !alias.scope [[META3]]
+// CHECK-NOFP16-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A_COERCE]] to <4 x i16>
+// CHECK-NOFP16-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B_COERCE]] to <4 x i16>
+// CHECK-NOFP16-NEXT: [[VZIP_I:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK-NOFP16-NEXT: store <4 x i16> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META3:![0-9]+]]
+// CHECK-NOFP16-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NOFP16-NEXT: [[VZIP3_I:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK-NOFP16-NEXT: store <4 x i16> [[VZIP3_I]], ptr [[TMP2]], align 4, !alias.scope [[META3]]
// CHECK-NOFP16-NEXT: ret void
//
// CHECK-FP16-LABEL: define dso_local void @test_vzip_f16(
// CHECK-FP16-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT16X4X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-FP16-NEXT: entry:
-// CHECK-FP16-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META3:![0-9]+]])
-// CHECK-FP16-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-FP16-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
// CHECK-FP16-NEXT: [[VZIP_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[B]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
-// CHECK-FP16-NEXT: store <4 x half> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META3]]
-// CHECK-FP16-NEXT: [[TMP2:%.*]] = getelementptr inbounds <4 x half>, ptr [[AGG_RESULT]], i32 1
+// CHECK-FP16-NEXT: store <4 x half> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META3:![0-9]+]]
+// CHECK-FP16-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
// CHECK-FP16-NEXT: [[VZIP1_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[B]], <4 x i32> <i32 2, i32 6, i32 3, i32 7>
-// CHECK-FP16-NEXT: store <4 x half> [[VZIP1_I]], ptr [[TMP2]], align 4, !alias.scope [[META3]]
+// CHECK-FP16-NEXT: store <4 x half> [[VZIP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META3]]
// CHECK-FP16-NEXT: ret void
//
float16x4x2_t test_vzip_f16(float16x4_t a, float16x4_t b) {
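The <i32 0, i32 4, i32 1, i32 5> and <i32 2, i32 6, i32 3, i32 7> shuffle masks above are the two interleave halves of vzip; a minimal 4-lane sketch (hypothetical helper, for illustration):

/* vzip: result 0 interleaves the low halves of a and b, result 1
   interleaves the high halves. */
static void zip4(const short a[4], const short b[4],
                 short lo[4], short hi[4]) {
  for (int i = 0; i < 2; ++i) {
    lo[2 * i] = a[i];     lo[2 * i + 1] = b[i];
    hi[2 * i] = a[i + 2]; hi[2 * i + 1] = b[i + 2];
  }
}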
@@ -119,35 +89,23 @@ float16x4x2_t test_vzip_f16(float16x4_t a, float16x4_t b) {
// CHECK-NOFP16-LABEL: define dso_local void @test_vzipq_f16(
// CHECK-NOFP16-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT16X8X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <4 x i32> noundef [[A_COERCE:%.*]], <4 x i32> noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NOFP16-NEXT: entry:
-// CHECK-NOFP16-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A_COERCE]] to <8 x half>
-// CHECK-NOFP16-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B_COERCE]] to <8 x half>
-// CHECK-NOFP16-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[TMP0]] to <4 x i32>
-// CHECK-NOFP16-NEXT: [[TMP3:%.*]] = bitcast <8 x half> [[TMP1]] to <4 x i32>
-// CHECK-NOFP16-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META6:![0-9]+]])
-// CHECK-NOFP16-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to <8 x half>
-// CHECK-NOFP16-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x half>
-// CHECK-NOFP16-NEXT: [[TMP6:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
-// CHECK-NOFP16-NEXT: [[TMP7:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
-// CHECK-NOFP16-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK-NOFP16-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
-// CHECK-NOFP16-NEXT: [[VZIP_I:%.*]] = shufflevector <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
-// CHECK-NOFP16-NEXT: store <8 x i16> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META6]]
-// CHECK-NOFP16-NEXT: [[TMP10:%.*]] = getelementptr inbounds <8 x i16>, ptr [[AGG_RESULT]], i32 1
-// CHECK-NOFP16-NEXT: [[VZIP3_I:%.*]] = shufflevector <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-// CHECK-NOFP16-NEXT: store <8 x i16> [[VZIP3_I]], ptr [[TMP10]], align 4, !alias.scope [[META6]]
+// CHECK-NOFP16-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A_COERCE]] to <8 x i16>
+// CHECK-NOFP16-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B_COERCE]] to <8 x i16>
+// CHECK-NOFP16-NEXT: [[VZIP_I:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+// CHECK-NOFP16-NEXT: store <8 x i16> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META6:![0-9]+]]
+// CHECK-NOFP16-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NOFP16-NEXT: [[VZIP3_I:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK-NOFP16-NEXT: store <8 x i16> [[VZIP3_I]], ptr [[TMP2]], align 4, !alias.scope [[META6]]
// CHECK-NOFP16-NEXT: ret void
//
// CHECK-FP16-LABEL: define dso_local void @test_vzipq_f16(
// CHECK-FP16-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT16X8X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-FP16-NEXT: entry:
-// CHECK-FP16-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META6:![0-9]+]])
-// CHECK-FP16-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-FP16-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
// CHECK-FP16-NEXT: [[VZIP_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[B]], <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
-// CHECK-FP16-NEXT: store <8 x half> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META6]]
-// CHECK-FP16-NEXT: [[TMP2:%.*]] = getelementptr inbounds <8 x half>, ptr [[AGG_RESULT]], i32 1
+// CHECK-FP16-NEXT: store <8 x half> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META6:![0-9]+]]
+// CHECK-FP16-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
// CHECK-FP16-NEXT: [[VZIP1_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[B]], <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-// CHECK-FP16-NEXT: store <8 x half> [[VZIP1_I]], ptr [[TMP2]], align 4, !alias.scope [[META6]]
+// CHECK-FP16-NEXT: store <8 x half> [[VZIP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META6]]
// CHECK-FP16-NEXT: ret void
//
float16x8x2_t test_vzipq_f16(float16x8_t a, float16x8_t b) {
@@ -157,35 +115,23 @@ float16x8x2_t test_vzipq_f16(float16x8_t a, float16x8_t b) {
// CHECK-NOFP16-LABEL: define dso_local void @test_vuzp_f16(
// CHECK-NOFP16-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT16X4X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <2 x i32> noundef [[A_COERCE:%.*]], <2 x i32> noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NOFP16-NEXT: entry:
-// CHECK-NOFP16-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A_COERCE]] to <4 x half>
-// CHECK-NOFP16-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B_COERCE]] to <4 x half>
-// CHECK-NOFP16-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[TMP0]] to <2 x i32>
-// CHECK-NOFP16-NEXT: [[TMP3:%.*]] = bitcast <4 x half> [[TMP1]] to <2 x i32>
-// CHECK-NOFP16-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META9:![0-9]+]])
-// CHECK-NOFP16-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP2]] to <4 x half>
-// CHECK-NOFP16-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP3]] to <4 x half>
-// CHECK-NOFP16-NEXT: [[TMP6:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
-// CHECK-NOFP16-NEXT: [[TMP7:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
-// CHECK-NOFP16-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
-// CHECK-NOFP16-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
-// CHECK-NOFP16-NEXT: [[VUZP_I:%.*]] = shufflevector <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-// CHECK-NOFP16-NEXT: store <4 x i16> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META9]]
-// CHECK-NOFP16-NEXT: [[TMP10:%.*]] = getelementptr inbounds <4 x i16>, ptr [[AGG_RESULT]], i32 1
-// CHECK-NOFP16-NEXT: [[VUZP3_I:%.*]] = shufflevector <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-// CHECK-NOFP16-NEXT: store <4 x i16> [[VUZP3_I]], ptr [[TMP10]], align 4, !alias.scope [[META9]]
+// CHECK-NOFP16-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A_COERCE]] to <4 x i16>
+// CHECK-NOFP16-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B_COERCE]] to <4 x i16>
+// CHECK-NOFP16-NEXT: [[VUZP_I:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK-NOFP16-NEXT: store <4 x i16> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META9:![0-9]+]]
+// CHECK-NOFP16-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NOFP16-NEXT: [[VUZP3_I:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK-NOFP16-NEXT: store <4 x i16> [[VUZP3_I]], ptr [[TMP2]], align 4, !alias.scope [[META9]]
// CHECK-NOFP16-NEXT: ret void
//
// CHECK-FP16-LABEL: define dso_local void @test_vuzp_f16(
// CHECK-FP16-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT16X4X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-FP16-NEXT: entry:
-// CHECK-FP16-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META9:![0-9]+]])
-// CHECK-FP16-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-FP16-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
// CHECK-FP16-NEXT: [[VUZP_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[B]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-// CHECK-FP16-NEXT: store <4 x half> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META9]]
-// CHECK-FP16-NEXT: [[TMP2:%.*]] = getelementptr inbounds <4 x half>, ptr [[AGG_RESULT]], i32 1
+// CHECK-FP16-NEXT: store <4 x half> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META9:![0-9]+]]
+// CHECK-FP16-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
// CHECK-FP16-NEXT: [[VUZP1_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-// CHECK-FP16-NEXT: store <4 x half> [[VUZP1_I]], ptr [[TMP2]], align 4, !alias.scope [[META9]]
+// CHECK-FP16-NEXT: store <4 x half> [[VUZP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META9]]
// CHECK-FP16-NEXT: ret void
//
float16x4x2_t test_vuzp_f16(float16x4_t a, float16x4_t b) {
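Likewise, the <i32 0, i32 2, i32 4, i32 6> and <i32 1, i32 3, i32 5, i32 7> masks above de-interleave; a minimal 4-lane sketch of vuzp (hypothetical helper, for illustration):

/* vuzp: result 0 gathers the even lanes of the concatenation of a
   and b, result 1 the odd lanes. */
static void uzp4(const short a[4], const short b[4],
                 short even[4], short odd[4]) {
  for (int i = 0; i < 2; ++i) {
    even[i] = a[2 * i];      even[i + 2] = b[2 * i];
    odd[i]  = a[2 * i + 1];  odd[i + 2]  = b[2 * i + 1];
  }
}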
@@ -195,35 +141,23 @@ float16x4x2_t test_vuzp_f16(float16x4_t a, float16x4_t b) {
// CHECK-NOFP16-LABEL: define dso_local void @test_vuzpq_f16(
// CHECK-NOFP16-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT16X8X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <4 x i32> noundef [[A_COERCE:%.*]], <4 x i32> noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NOFP16-NEXT: entry:
-// CHECK-NOFP16-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A_COERCE]] to <8 x half>
-// CHECK-NOFP16-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B_COERCE]] to <8 x half>
-// CHECK-NOFP16-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[TMP0]] to <4 x i32>
-// CHECK-NOFP16-NEXT: [[TMP3:%.*]] = bitcast <8 x half> [[TMP1]] to <4 x i32>
-// CHECK-NOFP16-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META12:![0-9]+]])
-// CHECK-NOFP16-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to <8 x half>
-// CHECK-NOFP16-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x half>
-// CHECK-NOFP16-NEXT: [[TMP6:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
-// CHECK-NOFP16-NEXT: [[TMP7:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
-// CHECK-NOFP16-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK-NOFP16-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
-// CHECK-NOFP16-NEXT: [[VUZP_I:%.*]] = shufflevector <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-// CHECK-NOFP16-NEXT: store <8 x i16> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META12]]
-// CHECK-NOFP16-NEXT: [[TMP10:%.*]] = getelementptr inbounds <8 x i16>, ptr [[AGG_RESULT]], i32 1
-// CHECK-NOFP16-NEXT: [[VUZP3_I:%.*]] = shufflevector <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-// CHECK-NOFP16-NEXT: store <8 x i16> [[VUZP3_I]], ptr [[TMP10]], align 4, !alias.scope [[META12]]
+// CHECK-NOFP16-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A_COERCE]] to <8 x i16>
+// CHECK-NOFP16-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B_COERCE]] to <8 x i16>
+// CHECK-NOFP16-NEXT: [[VUZP_I:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK-NOFP16-NEXT: store <8 x i16> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META12:![0-9]+]]
+// CHECK-NOFP16-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NOFP16-NEXT: [[VUZP3_I:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK-NOFP16-NEXT: store <8 x i16> [[VUZP3_I]], ptr [[TMP2]], align 4, !alias.scope [[META12]]
// CHECK-NOFP16-NEXT: ret void
//
// CHECK-FP16-LABEL: define dso_local void @test_vuzpq_f16(
// CHECK-FP16-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT16X8X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-FP16-NEXT: entry:
-// CHECK-FP16-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META12:![0-9]+]])
-// CHECK-FP16-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-FP16-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
// CHECK-FP16-NEXT: [[VUZP_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[B]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-// CHECK-FP16-NEXT: store <8 x half> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META12]]
-// CHECK-FP16-NEXT: [[TMP2:%.*]] = getelementptr inbounds <8 x half>, ptr [[AGG_RESULT]], i32 1
+// CHECK-FP16-NEXT: store <8 x half> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META12:![0-9]+]]
+// CHECK-FP16-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
// CHECK-FP16-NEXT: [[VUZP1_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-// CHECK-FP16-NEXT: store <8 x half> [[VUZP1_I]], ptr [[TMP2]], align 4, !alias.scope [[META12]]
+// CHECK-FP16-NEXT: store <8 x half> [[VUZP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META12]]
// CHECK-FP16-NEXT: ret void
//
float16x8x2_t test_vuzpq_f16(float16x8_t a, float16x8_t b) {
@@ -233,35 +167,23 @@ float16x8x2_t test_vuzpq_f16(float16x8_t a, float16x8_t b) {
// CHECK-NOFP16-LABEL: define dso_local void @test_vtrn_f16(
// CHECK-NOFP16-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT16X4X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <2 x i32> noundef [[A_COERCE:%.*]], <2 x i32> noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NOFP16-NEXT: entry:
-// CHECK-NOFP16-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A_COERCE]] to <4 x half>
-// CHECK-NOFP16-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B_COERCE]] to <4 x half>
-// CHECK-NOFP16-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[TMP0]] to <2 x i32>
-// CHECK-NOFP16-NEXT: [[TMP3:%.*]] = bitcast <4 x half> [[TMP1]] to <2 x i32>
-// CHECK-NOFP16-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META15:![0-9]+]])
-// CHECK-NOFP16-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP2]] to <4 x half>
-// CHECK-NOFP16-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP3]] to <4 x half>
-// CHECK-NOFP16-NEXT: [[TMP6:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
-// CHECK-NOFP16-NEXT: [[TMP7:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
-// CHECK-NOFP16-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
-// CHECK-NOFP16-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
-// CHECK-NOFP16-NEXT: [[VTRN_I:%.*]] = shufflevector <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-// CHECK-NOFP16-NEXT: store <4 x i16> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META15]]
-// CHECK-NOFP16-NEXT: [[TMP10:%.*]] = getelementptr inbounds <4 x i16>, ptr [[AGG_RESULT]], i32 1
-// CHECK-NOFP16-NEXT: [[VTRN3_I:%.*]] = shufflevector <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-// CHECK-NOFP16-NEXT: store <4 x i16> [[VTRN3_I]], ptr [[TMP10]], align 4, !alias.scope [[META15]]
+// CHECK-NOFP16-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A_COERCE]] to <4 x i16>
+// CHECK-NOFP16-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B_COERCE]] to <4 x i16>
+// CHECK-NOFP16-NEXT: [[VTRN_I:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK-NOFP16-NEXT: store <4 x i16> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META15:![0-9]+]]
+// CHECK-NOFP16-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NOFP16-NEXT: [[VTRN3_I:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK-NOFP16-NEXT: store <4 x i16> [[VTRN3_I]], ptr [[TMP2]], align 4, !alias.scope [[META15]]
// CHECK-NOFP16-NEXT: ret void
//
// CHECK-FP16-LABEL: define dso_local void @test_vtrn_f16(
// CHECK-FP16-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT16X4X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-FP16-NEXT: entry:
-// CHECK-FP16-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META15:![0-9]+]])
-// CHECK-FP16-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-FP16-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
// CHECK-FP16-NEXT: [[VTRN_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[B]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-// CHECK-FP16-NEXT: store <4 x half> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META15]]
-// CHECK-FP16-NEXT: [[TMP2:%.*]] = getelementptr inbounds <4 x half>, ptr [[AGG_RESULT]], i32 1
+// CHECK-FP16-NEXT: store <4 x half> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META15:![0-9]+]]
+// CHECK-FP16-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
// CHECK-FP16-NEXT: [[VTRN1_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-// CHECK-FP16-NEXT: store <4 x half> [[VTRN1_I]], ptr [[TMP2]], align 4, !alias.scope [[META15]]
+// CHECK-FP16-NEXT: store <4 x half> [[VTRN1_I]], ptr [[TMP0]], align 4, !alias.scope [[META15]]
// CHECK-FP16-NEXT: ret void
//
float16x4x2_t test_vtrn_f16(float16x4_t a, float16x4_t b) {
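And the <i32 0, i32 4, i32 2, i32 6> / <i32 1, i32 5, i32 3, i32 7> masks above amount to a 2x2 block transpose; a minimal 4-lane sketch of vtrn (hypothetical helper, for illustration):

/* vtrn: transpose adjacent pairs drawn from a and b. */
static void trn4(const short a[4], const short b[4],
                 short r0[4], short r1[4]) {
  for (int i = 0; i < 2; ++i) {
    r0[2 * i]     = a[2 * i];     r0[2 * i + 1] = b[2 * i];
    r1[2 * i]     = a[2 * i + 1]; r1[2 * i + 1] = b[2 * i + 1];
  }
}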
@@ -271,35 +193,23 @@ float16x4x2_t test_vtrn_f16(float16x4_t a, float16x4_t b) {
// CHECK-NOFP16-LABEL: define dso_local void @test_vtrnq_f16(
// CHECK-NOFP16-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT16X8X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <4 x i32> noundef [[A_COERCE:%.*]], <4 x i32> noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NOFP16-NEXT: entry:
-// CHECK-NOFP16-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A_COERCE]] to <8 x half>
-// CHECK-NOFP16-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B_COERCE]] to <8 x half>
-// CHECK-NOFP16-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[TMP0]] to <4 x i32>
-// CHECK-NOFP16-NEXT: [[TMP3:%.*]] = bitcast <8 x half> [[TMP1]] to <4 x i32>
-// CHECK-NOFP16-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META18:![0-9]+]])
-// CHECK-NOFP16-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to <8 x half>
-// CHECK-NOFP16-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x half>
-// CHECK-NOFP16-NEXT: [[TMP6:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
-// CHECK-NOFP16-NEXT: [[TMP7:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
-// CHECK-NOFP16-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK-NOFP16-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
-// CHECK-NOFP16-NEXT: [[VTRN_I:%.*]] = shufflevector <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-// CHECK-NOFP16-NEXT: store <8 x i16> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META18]]
-// CHECK-NOFP16-NEXT: [[TMP10:%.*]] = getelementptr inbounds <8 x i16>, ptr [[AGG_RESULT]], i32 1
-// CHECK-NOFP16-NEXT: [[VTRN3_I:%.*]] = shufflevector <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-// CHECK-NOFP16-NEXT: store <8 x i16> [[VTRN3_I]], ptr [[TMP10]], align 4, !alias.scope [[META18]]
+// CHECK-NOFP16-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A_COERCE]] to <8 x i16>
+// CHECK-NOFP16-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B_COERCE]] to <8 x i16>
+// CHECK-NOFP16-NEXT: [[VTRN_I:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK-NOFP16-NEXT: store <8 x i16> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META18:![0-9]+]]
+// CHECK-NOFP16-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NOFP16-NEXT: [[VTRN3_I:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+// CHECK-NOFP16-NEXT: store <8 x i16> [[VTRN3_I]], ptr [[TMP2]], align 4, !alias.scope [[META18]]
// CHECK-NOFP16-NEXT: ret void
//
// CHECK-FP16-LABEL: define dso_local void @test_vtrnq_f16(
// CHECK-FP16-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT16X8X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-FP16-NEXT: entry:
-// CHECK-FP16-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META18:![0-9]+]])
-// CHECK-FP16-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-FP16-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
// CHECK-FP16-NEXT: [[VTRN_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[B]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-// CHECK-FP16-NEXT: store <8 x half> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META18]]
-// CHECK-FP16-NEXT: [[TMP2:%.*]] = getelementptr inbounds <8 x half>, ptr [[AGG_RESULT]], i32 1
+// CHECK-FP16-NEXT: store <8 x half> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META18:![0-9]+]]
+// CHECK-FP16-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
// CHECK-FP16-NEXT: [[VTRN1_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[B]], <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-// CHECK-FP16-NEXT: store <8 x half> [[VTRN1_I]], ptr [[TMP2]], align 4, !alias.scope [[META18]]
+// CHECK-FP16-NEXT: store <8 x half> [[VTRN1_I]], ptr [[TMP0]], align 4, !alias.scope [[META18]]
// CHECK-FP16-NEXT: ret void
//
float16x8x2_t test_vtrnq_f16(float16x8_t a, float16x8_t b) {
@@ -309,20 +219,16 @@ float16x8x2_t test_vtrnq_f16(float16x8_t a, float16x8_t b) {
// CHECK-NOFP16-LABEL: define dso_local <2 x i32> @test_vmov_n_f16(
// CHECK-NOFP16-SAME: half noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NOFP16-NEXT: entry:
-// CHECK-NOFP16-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[A]], i32 0
-// CHECK-NOFP16-NEXT: [[VECINIT1:%.*]] = insertelement <4 x half> [[VECINIT]], half [[A]], i32 1
-// CHECK-NOFP16-NEXT: [[VECINIT2:%.*]] = insertelement <4 x half> [[VECINIT1]], half [[A]], i32 2
-// CHECK-NOFP16-NEXT: [[VECINIT3:%.*]] = insertelement <4 x half> [[VECINIT2]], half [[A]], i32 3
+// CHECK-NOFP16-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[A]], i64 0
+// CHECK-NOFP16-NEXT: [[VECINIT3:%.*]] = shufflevector <4 x half> [[VECINIT]], <4 x half> poison, <4 x i32> zeroinitializer
// CHECK-NOFP16-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[VECINIT3]] to <2 x i32>
// CHECK-NOFP16-NEXT: ret <2 x i32> [[TMP0]]
//
// CHECK-FP16-LABEL: define dso_local <4 x half> @test_vmov_n_f16(
// CHECK-FP16-SAME: half noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-FP16-NEXT: entry:
-// CHECK-FP16-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[A]], i32 0
-// CHECK-FP16-NEXT: [[VECINIT1:%.*]] = insertelement <4 x half> [[VECINIT]], half [[A]], i32 1
-// CHECK-FP16-NEXT: [[VECINIT2:%.*]] = insertelement <4 x half> [[VECINIT1]], half [[A]], i32 2
-// CHECK-FP16-NEXT: [[VECINIT3:%.*]] = insertelement <4 x half> [[VECINIT2]], half [[A]], i32 3
+// CHECK-FP16-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[A]], i64 0
+// CHECK-FP16-NEXT: [[VECINIT3:%.*]] = shufflevector <4 x half> [[VECINIT]], <4 x half> poison, <4 x i32> zeroinitializer
// CHECK-FP16-NEXT: ret <4 x half> [[VECINIT3]]
//
float16x4_t test_vmov_n_f16(float16_t a) {
@@ -332,28 +238,16 @@ float16x4_t test_vmov_n_f16(float16_t a) {
// CHECK-NOFP16-LABEL: define dso_local <4 x i32> @test_vmovq_n_f16(
// CHECK-NOFP16-SAME: half noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NOFP16-NEXT: entry:
-// CHECK-NOFP16-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[A]], i32 0
-// CHECK-NOFP16-NEXT: [[VECINIT1:%.*]] = insertelement <8 x half> [[VECINIT]], half [[A]], i32 1
-// CHECK-NOFP16-NEXT: [[VECINIT2:%.*]] = insertelement <8 x half> [[VECINIT1]], half [[A]], i32 2
-// CHECK-NOFP16-NEXT: [[VECINIT3:%.*]] = insertelement <8 x half> [[VECINIT2]], half [[A]], i32 3
-// CHECK-NOFP16-NEXT: [[VECINIT4:%.*]] = insertelement <8 x half> [[VECINIT3]], half [[A]], i32 4
-// CHECK-NOFP16-NEXT: [[VECINIT5:%.*]] = insertelement <8 x half> [[VECINIT4]], half [[A]], i32 5
-// CHECK-NOFP16-NEXT: [[VECINIT6:%.*]] = insertelement <8 x half> [[VECINIT5]], half [[A]], i32 6
-// CHECK-NOFP16-NEXT: [[VECINIT7:%.*]] = insertelement <8 x half> [[VECINIT6]], half [[A]], i32 7
+// CHECK-NOFP16-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[A]], i64 0
+// CHECK-NOFP16-NEXT: [[VECINIT7:%.*]] = shufflevector <8 x half> [[VECINIT]], <8 x half> poison, <8 x i32> zeroinitializer
// CHECK-NOFP16-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[VECINIT7]] to <4 x i32>
// CHECK-NOFP16-NEXT: ret <4 x i32> [[TMP0]]
//
// CHECK-FP16-LABEL: define dso_local <8 x half> @test_vmovq_n_f16(
// CHECK-FP16-SAME: half noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-FP16-NEXT: entry:
-// CHECK-FP16-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[A]], i32 0
-// CHECK-FP16-NEXT: [[VECINIT1:%.*]] = insertelement <8 x half> [[VECINIT]], half [[A]], i32 1
-// CHECK-FP16-NEXT: [[VECINIT2:%.*]] = insertelement <8 x half> [[VECINIT1]], half [[A]], i32 2
-// CHECK-FP16-NEXT: [[VECINIT3:%.*]] = insertelement <8 x half> [[VECINIT2]], half [[A]], i32 3
-// CHECK-FP16-NEXT: [[VECINIT4:%.*]] = insertelement <8 x half> [[VECINIT3]], half [[A]], i32 4
-// CHECK-FP16-NEXT: [[VECINIT5:%.*]] = insertelement <8 x half> [[VECINIT4]], half [[A]], i32 5
-// CHECK-FP16-NEXT: [[VECINIT6:%.*]] = insertelement <8 x half> [[VECINIT5]], half [[A]], i32 6
-// CHECK-FP16-NEXT: [[VECINIT7:%.*]] = insertelement <8 x half> [[VECINIT6]], half [[A]], i32 7
+// CHECK-FP16-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[A]], i64 0
+// CHECK-FP16-NEXT: [[VECINIT7:%.*]] = shufflevector <8 x half> [[VECINIT]], <8 x half> poison, <8 x i32> zeroinitializer
// CHECK-FP16-NEXT: ret <8 x half> [[VECINIT7]]
//
float16x8_t test_vmovq_n_f16(float16_t a) {
@@ -363,20 +257,16 @@ float16x8_t test_vmovq_n_f16(float16_t a) {
// CHECK-NOFP16-LABEL: define dso_local <2 x i32> @test_vdup_n_f16(
// CHECK-NOFP16-SAME: half noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NOFP16-NEXT: entry:
-// CHECK-NOFP16-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[A]], i32 0
-// CHECK-NOFP16-NEXT: [[VECINIT1:%.*]] = insertelement <4 x half> [[VECINIT]], half [[A]], i32 1
-// CHECK-NOFP16-NEXT: [[VECINIT2:%.*]] = insertelement <4 x half> [[VECINIT1]], half [[A]], i32 2
-// CHECK-NOFP16-NEXT: [[VECINIT3:%.*]] = insertelement <4 x half> [[VECINIT2]], half [[A]], i32 3
+// CHECK-NOFP16-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[A]], i64 0
+// CHECK-NOFP16-NEXT: [[VECINIT3:%.*]] = shufflevector <4 x half> [[VECINIT]], <4 x half> poison, <4 x i32> zeroinitializer
// CHECK-NOFP16-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[VECINIT3]] to <2 x i32>
// CHECK-NOFP16-NEXT: ret <2 x i32> [[TMP0]]
//
// CHECK-FP16-LABEL: define dso_local <4 x half> @test_vdup_n_f16(
// CHECK-FP16-SAME: half noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-FP16-NEXT: entry:
-// CHECK-FP16-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[A]], i32 0
-// CHECK-FP16-NEXT: [[VECINIT1:%.*]] = insertelement <4 x half> [[VECINIT]], half [[A]], i32 1
-// CHECK-FP16-NEXT: [[VECINIT2:%.*]] = insertelement <4 x half> [[VECINIT1]], half [[A]], i32 2
-// CHECK-FP16-NEXT: [[VECINIT3:%.*]] = insertelement <4 x half> [[VECINIT2]], half [[A]], i32 3
+// CHECK-FP16-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[A]], i64 0
+// CHECK-FP16-NEXT: [[VECINIT3:%.*]] = shufflevector <4 x half> [[VECINIT]], <4 x half> poison, <4 x i32> zeroinitializer
// CHECK-FP16-NEXT: ret <4 x half> [[VECINIT3]]
//
float16x4_t test_vdup_n_f16(float16_t a) {
@@ -386,28 +276,16 @@ float16x4_t test_vdup_n_f16(float16_t a) {
// CHECK-NOFP16-LABEL: define dso_local <4 x i32> @test_vdupq_n_f16(
// CHECK-NOFP16-SAME: half noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NOFP16-NEXT: entry:
-// CHECK-NOFP16-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[A]], i32 0
-// CHECK-NOFP16-NEXT: [[VECINIT1:%.*]] = insertelement <8 x half> [[VECINIT]], half [[A]], i32 1
-// CHECK-NOFP16-NEXT: [[VECINIT2:%.*]] = insertelement <8 x half> [[VECINIT1]], half [[A]], i32 2
-// CHECK-NOFP16-NEXT: [[VECINIT3:%.*]] = insertelement <8 x half> [[VECINIT2]], half [[A]], i32 3
-// CHECK-NOFP16-NEXT: [[VECINIT4:%.*]] = insertelement <8 x half> [[VECINIT3]], half [[A]], i32 4
-// CHECK-NOFP16-NEXT: [[VECINIT5:%.*]] = insertelement <8 x half> [[VECINIT4]], half [[A]], i32 5
-// CHECK-NOFP16-NEXT: [[VECINIT6:%.*]] = insertelement <8 x half> [[VECINIT5]], half [[A]], i32 6
-// CHECK-NOFP16-NEXT: [[VECINIT7:%.*]] = insertelement <8 x half> [[VECINIT6]], half [[A]], i32 7
+// CHECK-NOFP16-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[A]], i64 0
+// CHECK-NOFP16-NEXT: [[VECINIT7:%.*]] = shufflevector <8 x half> [[VECINIT]], <8 x half> poison, <8 x i32> zeroinitializer
// CHECK-NOFP16-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[VECINIT7]] to <4 x i32>
// CHECK-NOFP16-NEXT: ret <4 x i32> [[TMP0]]
//
// CHECK-FP16-LABEL: define dso_local <8 x half> @test_vdupq_n_f16(
// CHECK-FP16-SAME: half noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-FP16-NEXT: entry:
-// CHECK-FP16-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[A]], i32 0
-// CHECK-FP16-NEXT: [[VECINIT1:%.*]] = insertelement <8 x half> [[VECINIT]], half [[A]], i32 1
-// CHECK-FP16-NEXT: [[VECINIT2:%.*]] = insertelement <8 x half> [[VECINIT1]], half [[A]], i32 2
-// CHECK-FP16-NEXT: [[VECINIT3:%.*]] = insertelement <8 x half> [[VECINIT2]], half [[A]], i32 3
-// CHECK-FP16-NEXT: [[VECINIT4:%.*]] = insertelement <8 x half> [[VECINIT3]], half [[A]], i32 4
-// CHECK-FP16-NEXT: [[VECINIT5:%.*]] = insertelement <8 x half> [[VECINIT4]], half [[A]], i32 5
-// CHECK-FP16-NEXT: [[VECINIT6:%.*]] = insertelement <8 x half> [[VECINIT5]], half [[A]], i32 6
-// CHECK-FP16-NEXT: [[VECINIT7:%.*]] = insertelement <8 x half> [[VECINIT6]], half [[A]], i32 7
+// CHECK-FP16-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[A]], i64 0
+// CHECK-FP16-NEXT: [[VECINIT7:%.*]] = shufflevector <8 x half> [[VECINIT]], <8 x half> poison, <8 x i32> zeroinitializer
// CHECK-FP16-NEXT: ret <8 x half> [[VECINIT7]]
//
float16x8_t test_vdupq_n_f16(float16_t a) {
@@ -418,19 +296,14 @@ float16x8_t test_vdupq_n_f16(float16_t a) {
// CHECK-NOFP16-SAME: <2 x i32> noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NOFP16-NEXT: entry:
// CHECK-NOFP16-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A_COERCE]] to <4 x half>
-// CHECK-NOFP16-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[TMP0]] to <8 x i8>
-// CHECK-NOFP16-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK-NOFP16-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP2]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NOFP16-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <4 x half>
-// CHECK-NOFP16-NEXT: [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <2 x i32>
-// CHECK-NOFP16-NEXT: ret <2 x i32> [[TMP4]]
+// CHECK-NOFP16-NEXT: [[TMP1:%.*]] = shufflevector <4 x half> [[TMP0]], <4 x half> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NOFP16-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[TMP1]] to <2 x i32>
+// CHECK-NOFP16-NEXT: ret <2 x i32> [[TMP2]]
//
// CHECK-FP16-LABEL: define dso_local <4 x half> @test_vdup_lane_f16(
// CHECK-FP16-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-FP16-NEXT: entry:
-// CHECK-FP16-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-FP16-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
-// CHECK-FP16-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-FP16-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[A]], <4 x half> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-FP16-NEXT: ret <4 x half> [[LANE]]
//
float16x4_t test_vdup_lane_f16(float16x4_t a) {
@@ -440,20 +313,15 @@ float16x4_t test_vdup_lane_f16(float16x4_t a) {
// CHECK-NOFP16-LABEL: define dso_local <4 x i32> @test_vdupq_lane_f16(
// CHECK-NOFP16-SAME: <2 x i32> noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NOFP16-NEXT: entry:
-// CHECK-NOFP16-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A_COERCE]] to <4 x half>
-// CHECK-NOFP16-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[TMP0]] to <8 x i8>
-// CHECK-NOFP16-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK-NOFP16-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP2]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-// CHECK-NOFP16-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[LANE]] to <8 x half>
-// CHECK-NOFP16-NEXT: [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <4 x i32>
-// CHECK-NOFP16-NEXT: ret <4 x i32> [[TMP4]]
+// CHECK-NOFP16-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A_COERCE]] to <4 x i16>
+// CHECK-NOFP16-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK-NOFP16-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[LANE]] to <4 x i32>
+// CHECK-NOFP16-NEXT: ret <4 x i32> [[TMP1]]
//
// CHECK-FP16-LABEL: define dso_local <8 x half> @test_vdupq_lane_f16(
// CHECK-FP16-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-FP16-NEXT: entry:
-// CHECK-FP16-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-FP16-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
-// CHECK-FP16-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK-FP16-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[A]], <4 x half> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK-FP16-NEXT: ret <8 x half> [[LANE]]
//
float16x8_t test_vdupq_lane_f16(float16x4_t a) {
@@ -463,25 +331,16 @@ float16x8_t test_vdupq_lane_f16(float16x4_t a) {
// CHECK-NOFP16-LABEL: define dso_local <2 x i32> @test_vext_f16(
// CHECK-NOFP16-SAME: <2 x i32> noundef [[A_COERCE:%.*]], <2 x i32> noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NOFP16-NEXT: entry:
-// CHECK-NOFP16-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A_COERCE]] to <4 x half>
-// CHECK-NOFP16-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B_COERCE]] to <4 x half>
-// CHECK-NOFP16-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[TMP0]] to <8 x i8>
-// CHECK-NOFP16-NEXT: [[TMP3:%.*]] = bitcast <4 x half> [[TMP1]] to <8 x i8>
-// CHECK-NOFP16-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
-// CHECK-NOFP16-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
-// CHECK-NOFP16-NEXT: [[VEXT:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-// CHECK-NOFP16-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[VEXT]] to <4 x half>
-// CHECK-NOFP16-NEXT: [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <2 x i32>
-// CHECK-NOFP16-NEXT: ret <2 x i32> [[TMP7]]
+// CHECK-NOFP16-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A_COERCE]] to <4 x i16>
+// CHECK-NOFP16-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B_COERCE]] to <4 x i16>
+// CHECK-NOFP16-NEXT: [[VEXT:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+// CHECK-NOFP16-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[VEXT]] to <2 x i32>
+// CHECK-NOFP16-NEXT: ret <2 x i32> [[TMP2]]
//
// CHECK-FP16-LABEL: define dso_local <4 x half> @test_vext_f16(
// CHECK-FP16-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-FP16-NEXT: entry:
-// CHECK-FP16-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-FP16-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
-// CHECK-FP16-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
-// CHECK-FP16-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
-// CHECK-FP16-NEXT: [[VEXT:%.*]] = shufflevector <4 x half> [[TMP2]], <4 x half> [[TMP3]], <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+// CHECK-FP16-NEXT: [[VEXT:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[B]], <4 x i32> <i32 2, i32 3, i32 4, i32 5>
// CHECK-FP16-NEXT: ret <4 x half> [[VEXT]]
//
float16x4_t test_vext_f16(float16x4_t a, float16x4_t b) {
@@ -491,25 +350,16 @@ float16x4_t test_vext_f16(float16x4_t a, float16x4_t b) {
// CHECK-NOFP16-LABEL: define dso_local <4 x i32> @test_vextq_f16(
// CHECK-NOFP16-SAME: <4 x i32> noundef [[A_COERCE:%.*]], <4 x i32> noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NOFP16-NEXT: entry:
-// CHECK-NOFP16-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A_COERCE]] to <8 x half>
-// CHECK-NOFP16-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B_COERCE]] to <8 x half>
-// CHECK-NOFP16-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[TMP0]] to <16 x i8>
-// CHECK-NOFP16-NEXT: [[TMP3:%.*]] = bitcast <8 x half> [[TMP1]] to <16 x i8>
-// CHECK-NOFP16-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
-// CHECK-NOFP16-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x i16>
-// CHECK-NOFP16-NEXT: [[VEXT:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12>
-// CHECK-NOFP16-NEXT: [[TMP6:%.*]] = bitcast <8 x i16> [[VEXT]] to <8 x half>
-// CHECK-NOFP16-NEXT: [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <4 x i32>
-// CHECK-NOFP16-NEXT: ret <4 x i32> [[TMP7]]
+// CHECK-NOFP16-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A_COERCE]] to <8 x i16>
+// CHECK-NOFP16-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B_COERCE]] to <8 x i16>
+// CHECK-NOFP16-NEXT: [[VEXT:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12>
+// CHECK-NOFP16-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[VEXT]] to <4 x i32>
+// CHECK-NOFP16-NEXT: ret <4 x i32> [[TMP2]]
//
// CHECK-FP16-LABEL: define dso_local <8 x half> @test_vextq_f16(
// CHECK-FP16-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-FP16-NEXT: entry:
-// CHECK-FP16-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-FP16-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
-// CHECK-FP16-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
-// CHECK-FP16-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
-// CHECK-FP16-NEXT: [[VEXT:%.*]] = shufflevector <8 x half> [[TMP2]], <8 x half> [[TMP3]], <8 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12>
+// CHECK-FP16-NEXT: [[VEXT:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[B]], <8 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12>
// CHECK-FP16-NEXT: ret <8 x half> [[VEXT]]
//
float16x8_t test_vextq_f16(float16x8_t a, float16x8_t b) {
@@ -520,18 +370,14 @@ float16x8_t test_vextq_f16(float16x8_t a, float16x8_t b) {
// CHECK-NOFP16-SAME: <2 x i32> noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NOFP16-NEXT: entry:
// CHECK-NOFP16-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A_COERCE]] to <4 x half>
-// CHECK-NOFP16-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[TMP0]] to <2 x i32>
-// CHECK-NOFP16-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP1]] to <4 x half>
-// CHECK-NOFP16-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x half> [[TMP2]], <4 x half> [[TMP2]], <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-// CHECK-NOFP16-NEXT: [[TMP3:%.*]] = bitcast <4 x half> [[SHUFFLE_I]] to <2 x i32>
-// CHECK-NOFP16-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <4 x half>
-// CHECK-NOFP16-NEXT: [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <2 x i32>
-// CHECK-NOFP16-NEXT: ret <2 x i32> [[TMP5]]
+// CHECK-NOFP16-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x half> [[TMP0]], <4 x half> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+// CHECK-NOFP16-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[SHUFFLE_I]] to <2 x i32>
+// CHECK-NOFP16-NEXT: ret <2 x i32> [[TMP1]]
//
// CHECK-FP16-LABEL: define dso_local <4 x half> @test_vrev64_f16(
// CHECK-FP16-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-FP16-NEXT: entry:
-// CHECK-FP16-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[A]], <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+// CHECK-FP16-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
// CHECK-FP16-NEXT: ret <4 x half> [[SHUFFLE_I]]
//
float16x4_t test_vrev64_f16(float16x4_t a) {
@@ -542,18 +388,14 @@ float16x4_t test_vrev64_f16(float16x4_t a) {
// CHECK-NOFP16-SAME: <4 x i32> noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NOFP16-NEXT: entry:
// CHECK-NOFP16-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A_COERCE]] to <8 x half>
-// CHECK-NOFP16-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[TMP0]] to <4 x i32>
-// CHECK-NOFP16-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to <8 x half>
-// CHECK-NOFP16-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x half> [[TMP2]], <8 x half> [[TMP2]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
-// CHECK-NOFP16-NEXT: [[TMP3:%.*]] = bitcast <8 x half> [[SHUFFLE_I]] to <4 x i32>
-// CHECK-NOFP16-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x half>
-// CHECK-NOFP16-NEXT: [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <4 x i32>
-// CHECK-NOFP16-NEXT: ret <4 x i32> [[TMP5]]
+// CHECK-NOFP16-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x half> [[TMP0]], <8 x half> poison, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+// CHECK-NOFP16-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[SHUFFLE_I]] to <4 x i32>
+// CHECK-NOFP16-NEXT: ret <4 x i32> [[TMP1]]
//
// CHECK-FP16-LABEL: define dso_local <8 x half> @test_vrev64q_f16(
// CHECK-FP16-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-FP16-NEXT: entry:
-// CHECK-FP16-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[A]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+// CHECK-FP16-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
// CHECK-FP16-NEXT: ret <8 x half> [[SHUFFLE_I]]
//
float16x8_t test_vrev64q_f16(float16x8_t a) {
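The regenerated checks above all converge on one expectation: the splat becomes a single insertelement plus a zeroinitializer-mask shufflevector, and vdup_lane/vext/vrev64 become single shufflevectors with a poison second operand, replacing the older element-by-element and bitcast-heavy sequences. A minimal sketch of the source pattern being checked, with a hypothetical helper name (splat_f16_demo is not part of the patch):

#include <arm_neon.h>

// Splat one f16 value across all lanes; the checks above expect this to be
// one insertelement at lane 0 plus a shufflevector with a zeroinitializer mask.
float16x8_t splat_f16_demo(float16_t x) {
  return vdupq_n_f16(x);
}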
diff --git a/clang/test/CodeGen/arm-v8.2a-neon-intrinsics.c b/clang/test/CodeGen/arm-v8.2a-neon-intrinsics.c
index 59f56b988d2ab5..fc8ae4f9a1c045 100644
--- a/clang/test/CodeGen/arm-v8.2a-neon-intrinsics.c
+++ b/clang/test/CodeGen/arm-v8.2a-neon-intrinsics.c
@@ -1,819 +1,1138 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
// RUN: %clang_cc1 -triple armv8.2a-linux-gnu -target-abi apcs-gnu -target-feature +neon -target-feature +fullfp16 \
// RUN: -disable-O0-optnone -emit-llvm -o - %s \
-// RUN: | opt -S -passes=mem2reg \
+// RUN: | opt -S -passes=mem2reg,instcombine \
// RUN: | FileCheck %s
// REQUIRES: arm-registered-target
#include <arm_neon.h>
-// CHECK-LABEL: test_vabs_f16
-// CHECK: [[ABS:%.*]] = call <4 x half> @llvm.fabs.v4f16(<4 x half> %a)
-// CHECK: ret <4 x half> [[ABS]]
+// CHECK-LABEL: define dso_local <4 x half> @test_vabs_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABS1_I:%.*]] = call <4 x half> @llvm.fabs.v4f16(<4 x half> [[A]])
+// CHECK-NEXT: ret <4 x half> [[VABS1_I]]
+//
float16x4_t test_vabs_f16(float16x4_t a) {
return vabs_f16(a);
}
-// CHECK-LABEL: test_vabsq_f16
-// CHECK: [[ABS:%.*]] = call <8 x half> @llvm.fabs.v8f16(<8 x half> %a)
-// CHECK: ret <8 x half> [[ABS]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vabsq_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABS1_I:%.*]] = call <8 x half> @llvm.fabs.v8f16(<8 x half> [[A]])
+// CHECK-NEXT: ret <8 x half> [[VABS1_I]]
+//
float16x8_t test_vabsq_f16(float16x8_t a) {
return vabsq_f16(a);
}
-// CHECK-LABEL: test_vceqz_f16
-// CHECK: [[TMP1:%.*]] = fcmp oeq <4 x half> %a, zeroinitializer
-// CHECK: [[TMP2:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP2]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vceqz_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = fcmp oeq <4 x half> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext <4 x i1> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[VCEQZ_I]]
+//
uint16x4_t test_vceqz_f16(float16x4_t a) {
return vceqz_f16(a);
}
-// CHECK-LABEL: test_vceqzq_f16
-// CHECK: [[TMP1:%.*]] = fcmp oeq <8 x half> %a, zeroinitializer
-// CHECK: [[TMP2:%.*]] = sext <8 x i1> [[TMP1:%.*]] to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP2]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vceqzq_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = fcmp oeq <8 x half> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[VCEQZ_I]]
+//
uint16x8_t test_vceqzq_f16(float16x8_t a) {
return vceqzq_f16(a);
}
-// CHECK-LABEL: test_vcgez_f16
-// CHECK: [[TMP1:%.*]] = fcmp oge <4 x half> %a, zeroinitializer
-// CHECK: [[TMP2:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP2]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vcgez_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = fcmp oge <4 x half> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCGEZ_I:%.*]] = sext <4 x i1> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[VCGEZ_I]]
+//
uint16x4_t test_vcgez_f16(float16x4_t a) {
return vcgez_f16(a);
}
-// CHECK-LABEL: test_vcgezq_f16
-// CHECK: [[TMP1:%.*]] = fcmp oge <8 x half> %a, zeroinitializer
-// CHECK: [[TMP2:%.*]] = sext <8 x i1> [[TMP1:%.*]] to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP2]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vcgezq_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = fcmp oge <8 x half> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCGEZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[VCGEZ_I]]
+//
uint16x8_t test_vcgezq_f16(float16x8_t a) {
return vcgezq_f16(a);
}
-// CHECK-LABEL: test_vcgtz_f16
-// CHECK: [[TMP1:%.*]] = fcmp ogt <4 x half> %a, zeroinitializer
-// CHECK: [[TMP2:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP2]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vcgtz_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = fcmp ogt <4 x half> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCGTZ_I:%.*]] = sext <4 x i1> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[VCGTZ_I]]
+//
uint16x4_t test_vcgtz_f16(float16x4_t a) {
return vcgtz_f16(a);
}
-// CHECK-LABEL: test_vcgtzq_f16
-// CHECK: [[TMP1:%.*]] = fcmp ogt <8 x half> %a, zeroinitializer
-// CHECK: [[TMP2:%.*]] = sext <8 x i1> [[TMP1:%.*]] to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP2]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vcgtzq_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = fcmp ogt <8 x half> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCGTZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[VCGTZ_I]]
+//
uint16x8_t test_vcgtzq_f16(float16x8_t a) {
return vcgtzq_f16(a);
}
-// CHECK-LABEL: test_vclez_f16
-// CHECK: [[TMP1:%.*]] = fcmp ole <4 x half> %a, zeroinitializer
-// CHECK: [[TMP2:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP2]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vclez_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = fcmp ole <4 x half> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCLEZ_I:%.*]] = sext <4 x i1> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[VCLEZ_I]]
+//
uint16x4_t test_vclez_f16(float16x4_t a) {
return vclez_f16(a);
}
-// CHECK-LABEL: test_vclezq_f16
-// CHECK: [[TMP1:%.*]] = fcmp ole <8 x half> %a, zeroinitializer
-// CHECK: [[TMP2:%.*]] = sext <8 x i1> [[TMP1:%.*]] to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP2]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vclezq_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = fcmp ole <8 x half> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCLEZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[VCLEZ_I]]
+//
uint16x8_t test_vclezq_f16(float16x8_t a) {
return vclezq_f16(a);
}
-// CHECK-LABEL: test_vcltz_f16
-// CHECK: [[TMP1:%.*]] = fcmp olt <4 x half> %a, zeroinitializer
-// CHECK: [[TMP2:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP2]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vcltz_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = fcmp olt <4 x half> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCLTZ_I:%.*]] = sext <4 x i1> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[VCLTZ_I]]
+//
uint16x4_t test_vcltz_f16(float16x4_t a) {
return vcltz_f16(a);
}
-// CHECK-LABEL: test_vcltzq_f16
-// CHECK: [[TMP1:%.*]] = fcmp olt <8 x half> %a, zeroinitializer
-// CHECK: [[TMP2:%.*]] = sext <8 x i1> [[TMP1:%.*]] to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP2]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vcltzq_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = fcmp olt <8 x half> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCLTZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[VCLTZ_I]]
+//
uint16x8_t test_vcltzq_f16(float16x8_t a) {
return vcltzq_f16(a);
}
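Every zero-comparison above reduces to the same two-instruction shape: an fcmp of the input against zeroinitializer, then a sext of the <N x i1> result to i16 lanes, giving an all-ones or all-zeros mask per lane. A short sketch of how such a mask is typically consumed, using a hypothetical helper name (select_pos_demo) and the same arm_neon.h/fullfp16 setup as this file:

// Pick b where a > 0 and c elsewhere, driving a bitwise select with the mask.
float16x4_t select_pos_demo(float16x4_t a, float16x4_t b, float16x4_t c) {
  uint16x4_t mask = vcgtz_f16(a);  // fcmp ogt + sext: 0xFFFF or 0x0000 per lane
  return vbsl_f16(mask, b, c);     // bitwise select on the sign-extended mask
}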
-// CHECK-LABEL: test_vcvt_f16_s16
-// CHECK: [[VCVT:%.*]] = sitofp <4 x i16> %a to <4 x half>
-// CHECK: ret <4 x half> [[VCVT]]
+// CHECK-LABEL: define dso_local <4 x half> @test_vcvt_f16_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVT_I:%.*]] = sitofp <4 x i16> [[A]] to <4 x half>
+// CHECK-NEXT: ret <4 x half> [[VCVT_I]]
+//
float16x4_t test_vcvt_f16_s16 (int16x4_t a) {
return vcvt_f16_s16(a);
}
-// CHECK-LABEL: test_vcvtq_f16_s16
-// CHECK: [[VCVT:%.*]] = sitofp <8 x i16> %a to <8 x half>
-// CHECK: ret <8 x half> [[VCVT]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vcvtq_f16_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVT_I:%.*]] = sitofp <8 x i16> [[A]] to <8 x half>
+// CHECK-NEXT: ret <8 x half> [[VCVT_I]]
+//
float16x8_t test_vcvtq_f16_s16 (int16x8_t a) {
return vcvtq_f16_s16(a);
}
-// CHECK-LABEL: test_vcvt_f16_u16
-// CHECK: [[VCVT:%.*]] = uitofp <4 x i16> %a to <4 x half>
-// CHECK: ret <4 x half> [[VCVT]]
+// CHECK-LABEL: define dso_local <4 x half> @test_vcvt_f16_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVT_I:%.*]] = uitofp <4 x i16> [[A]] to <4 x half>
+// CHECK-NEXT: ret <4 x half> [[VCVT_I]]
+//
float16x4_t test_vcvt_f16_u16 (uint16x4_t a) {
return vcvt_f16_u16(a);
}
-// CHECK-LABEL: test_vcvtq_f16_u16
-// CHECK: [[VCVT:%.*]] = uitofp <8 x i16> %a to <8 x half>
-// CHECK: ret <8 x half> [[VCVT]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vcvtq_f16_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVT_I:%.*]] = uitofp <8 x i16> [[A]] to <8 x half>
+// CHECK-NEXT: ret <8 x half> [[VCVT_I]]
+//
float16x8_t test_vcvtq_f16_u16 (uint16x8_t a) {
return vcvtq_f16_u16(a);
}
-// CHECK-LABEL: test_vcvt_s16_f16
-// CHECK: [[VCVT:%.*]] = fptosi <4 x half> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[VCVT]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vcvt_s16_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVT_I:%.*]] = fptosi <4 x half> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[VCVT_I]]
+//
int16x4_t test_vcvt_s16_f16 (float16x4_t a) {
return vcvt_s16_f16(a);
}
-// CHECK-LABEL: test_vcvtq_s16_f16
-// CHECK: [[VCVT:%.*]] = fptosi <8 x half> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[VCVT]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vcvtq_s16_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVT_I:%.*]] = fptosi <8 x half> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[VCVT_I]]
+//
int16x8_t test_vcvtq_s16_f16 (float16x8_t a) {
return vcvtq_s16_f16(a);
}
-// CHECK-LABEL: test_vcvt_u16_f16
-// CHECK: [[VCVT:%.*]] = fptoui <4 x half> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[VCVT]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vcvt_u16_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVT_I:%.*]] = fptoui <4 x half> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[VCVT_I]]
+//
int16x4_t test_vcvt_u16_f16 (float16x4_t a) {
return vcvt_u16_f16(a);
}
-// CHECK-LABEL: test_vcvtq_u16_f16
-// CHECK: [[VCVT:%.*]] = fptoui <8 x half> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[VCVT]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vcvtq_u16_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVT_I:%.*]] = fptoui <8 x half> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[VCVT_I]]
+//
int16x8_t test_vcvtq_u16_f16 (float16x8_t a) {
return vcvtq_u16_f16(a);
}
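The plain conversions lower to ordinary IR casts (sitofp/uitofp into f16, fptosi/fptoui out of it), so the float-to-integer direction truncates toward zero like any C cast. Sketch with a hypothetical name (roundtrip_s16_demo), same setup as above:

// s16 -> f16 -> s16; the checks above expect sitofp followed by fptosi.
int16x4_t roundtrip_s16_demo(int16x4_t a) {
  return vcvt_s16_f16(vcvt_f16_s16(a));
}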
-// CHECK-LABEL: test_vcvta_s16_f16
-// CHECK: [[VCVT:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtas.v4i16.v4f16(<4 x half> %a)
-// CHECK: ret <4 x i16> [[VCVT]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vcvta_s16_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTA_S16_F161_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtas.v4i16.v4f16(<4 x half> [[A]])
+// CHECK-NEXT: ret <4 x i16> [[VCVTA_S16_F161_I]]
+//
int16x4_t test_vcvta_s16_f16 (float16x4_t a) {
return vcvta_s16_f16(a);
}
-// CHECK-LABEL: test_vcvta_u16_f16
-// CHECK: [[VCVT:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtau.v4i16.v4f16(<4 x half> %a)
-// CHECK: ret <4 x i16> [[VCVT]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vcvta_u16_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTA_U16_F161_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtau.v4i16.v4f16(<4 x half> [[A]])
+// CHECK-NEXT: ret <4 x i16> [[VCVTA_U16_F161_I]]
+//
int16x4_t test_vcvta_u16_f16 (float16x4_t a) {
return vcvta_u16_f16(a);
}
-// CHECK-LABEL: test_vcvtaq_s16_f16
-// CHECK: [[VCVT:%.*]] = call <8 x i16> @llvm.arm.neon.vcvtas.v8i16.v8f16(<8 x half> %a)
-// CHECK: ret <8 x i16> [[VCVT]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vcvtaq_s16_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTAQ_S16_F161_I:%.*]] = call <8 x i16> @llvm.arm.neon.vcvtas.v8i16.v8f16(<8 x half> [[A]])
+// CHECK-NEXT: ret <8 x i16> [[VCVTAQ_S16_F161_I]]
+//
int16x8_t test_vcvtaq_s16_f16 (float16x8_t a) {
return vcvtaq_s16_f16(a);
}
-// CHECK-LABEL: test_vcvtm_s16_f16
-// CHECK: [[VCVT:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtms.v4i16.v4f16(<4 x half> %a)
-// CHECK: ret <4 x i16> [[VCVT]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vcvtm_s16_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTM_S16_F161_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtms.v4i16.v4f16(<4 x half> [[A]])
+// CHECK-NEXT: ret <4 x i16> [[VCVTM_S16_F161_I]]
+//
int16x4_t test_vcvtm_s16_f16 (float16x4_t a) {
return vcvtm_s16_f16(a);
}
-// CHECK-LABEL: test_vcvtmq_s16_f16
-// CHECK: [[VCVT:%.*]] = call <8 x i16> @llvm.arm.neon.vcvtms.v8i16.v8f16(<8 x half> %a)
-// CHECK: ret <8 x i16> [[VCVT]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vcvtmq_s16_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTMQ_S16_F161_I:%.*]] = call <8 x i16> @llvm.arm.neon.vcvtms.v8i16.v8f16(<8 x half> [[A]])
+// CHECK-NEXT: ret <8 x i16> [[VCVTMQ_S16_F161_I]]
+//
int16x8_t test_vcvtmq_s16_f16 (float16x8_t a) {
return vcvtmq_s16_f16(a);
}
-// CHECK-LABEL: test_vcvtm_u16_f16
-// CHECK: [[VCVT:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtmu.v4i16.v4f16(<4 x half> %a)
-// CHECK: ret <4 x i16> [[VCVT]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vcvtm_u16_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTM_U16_F161_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtmu.v4i16.v4f16(<4 x half> [[A]])
+// CHECK-NEXT: ret <4 x i16> [[VCVTM_U16_F161_I]]
+//
uint16x4_t test_vcvtm_u16_f16 (float16x4_t a) {
return vcvtm_u16_f16(a);
}
-// CHECK-LABEL: test_vcvtmq_u16_f16
-// CHECK: [[VCVT:%.*]] = call <8 x i16> @llvm.arm.neon.vcvtmu.v8i16.v8f16(<8 x half> %a)
-// CHECK: ret <8 x i16> [[VCVT]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vcvtmq_u16_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTMQ_U16_F161_I:%.*]] = call <8 x i16> @llvm.arm.neon.vcvtmu.v8i16.v8f16(<8 x half> [[A]])
+// CHECK-NEXT: ret <8 x i16> [[VCVTMQ_U16_F161_I]]
+//
uint16x8_t test_vcvtmq_u16_f16 (float16x8_t a) {
return vcvtmq_u16_f16(a);
}
-// CHECK-LABEL: test_vcvtn_s16_f16
-// CHECK: [[VCVT:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtns.v4i16.v4f16(<4 x half> %a)
-// CHECK: ret <4 x i16> [[VCVT]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vcvtn_s16_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTN_S16_F161_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtns.v4i16.v4f16(<4 x half> [[A]])
+// CHECK-NEXT: ret <4 x i16> [[VCVTN_S16_F161_I]]
+//
int16x4_t test_vcvtn_s16_f16 (float16x4_t a) {
return vcvtn_s16_f16(a);
}
-// CHECK-LABEL: test_vcvtnq_s16_f16
-// CHECK: [[VCVT:%.*]] = call <8 x i16> @llvm.arm.neon.vcvtns.v8i16.v8f16(<8 x half> %a)
-// CHECK: ret <8 x i16> [[VCVT]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vcvtnq_s16_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTNQ_S16_F161_I:%.*]] = call <8 x i16> @llvm.arm.neon.vcvtns.v8i16.v8f16(<8 x half> [[A]])
+// CHECK-NEXT: ret <8 x i16> [[VCVTNQ_S16_F161_I]]
+//
int16x8_t test_vcvtnq_s16_f16 (float16x8_t a) {
return vcvtnq_s16_f16(a);
}
-// CHECK-LABEL: test_vcvtn_u16_f16
-// CHECK: [[VCVT:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtnu.v4i16.v4f16(<4 x half> %a)
-// CHECK: ret <4 x i16> [[VCVT]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vcvtn_u16_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTN_U16_F161_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtnu.v4i16.v4f16(<4 x half> [[A]])
+// CHECK-NEXT: ret <4 x i16> [[VCVTN_U16_F161_I]]
+//
uint16x4_t test_vcvtn_u16_f16 (float16x4_t a) {
return vcvtn_u16_f16(a);
}
-// CHECK-LABEL: test_vcvtnq_u16_f16
-// CHECK: [[VCVT:%.*]] = call <8 x i16> @llvm.arm.neon.vcvtnu.v8i16.v8f16(<8 x half> %a)
-// CHECK: ret <8 x i16> [[VCVT]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vcvtnq_u16_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTNQ_U16_F161_I:%.*]] = call <8 x i16> @llvm.arm.neon.vcvtnu.v8i16.v8f16(<8 x half> [[A]])
+// CHECK-NEXT: ret <8 x i16> [[VCVTNQ_U16_F161_I]]
+//
uint16x8_t test_vcvtnq_u16_f16 (float16x8_t a) {
return vcvtnq_u16_f16(a);
}
-// CHECK-LABEL: test_vcvtp_s16_f16
-// CHECK: [[VCVT:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtps.v4i16.v4f16(<4 x half> %a)
-// CHECK: ret <4 x i16> [[VCVT]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vcvtp_s16_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTP_S16_F161_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtps.v4i16.v4f16(<4 x half> [[A]])
+// CHECK-NEXT: ret <4 x i16> [[VCVTP_S16_F161_I]]
+//
int16x4_t test_vcvtp_s16_f16 (float16x4_t a) {
return vcvtp_s16_f16(a);
}
-// CHECK-LABEL: test_vcvtpq_s16_f16
-// CHECK: [[VCVT:%.*]] = call <8 x i16> @llvm.arm.neon.vcvtps.v8i16.v8f16(<8 x half> %a)
-// CHECK: ret <8 x i16> [[VCVT]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vcvtpq_s16_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTPQ_S16_F161_I:%.*]] = call <8 x i16> @llvm.arm.neon.vcvtps.v8i16.v8f16(<8 x half> [[A]])
+// CHECK-NEXT: ret <8 x i16> [[VCVTPQ_S16_F161_I]]
+//
int16x8_t test_vcvtpq_s16_f16 (float16x8_t a) {
return vcvtpq_s16_f16(a);
}
-// CHECK-LABEL: test_vcvtp_u16_f16
-// CHECK: [[VCVT:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtpu.v4i16.v4f16(<4 x half> %a)
-// CHECK: ret <4 x i16> [[VCVT]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vcvtp_u16_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTP_U16_F161_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtpu.v4i16.v4f16(<4 x half> [[A]])
+// CHECK-NEXT: ret <4 x i16> [[VCVTP_U16_F161_I]]
+//
uint16x4_t test_vcvtp_u16_f16 (float16x4_t a) {
return vcvtp_u16_f16(a);
}
-// CHECK-LABEL: test_vcvtpq_u16_f16
-// CHECK: [[VCVT:%.*]] = call <8 x i16> @llvm.arm.neon.vcvtpu.v8i16.v8f16(<8 x half> %a)
-// CHECK: ret <8 x i16> [[VCVT]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vcvtpq_u16_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTPQ_U16_F161_I:%.*]] = call <8 x i16> @llvm.arm.neon.vcvtpu.v8i16.v8f16(<8 x half> [[A]])
+// CHECK-NEXT: ret <8 x i16> [[VCVTPQ_U16_F161_I]]
+//
uint16x8_t test_vcvtpq_u16_f16 (float16x8_t a) {
return vcvtpq_u16_f16(a);
}
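The directed-rounding conversions, by contrast, stay as target intrinsics: vcvta rounds to nearest with ties away from zero, vcvtn to nearest even, vcvtm toward minus infinity, and vcvtp toward plus infinity, each in a signed (llvm.arm.neon.vcvt?s) and unsigned (llvm.arm.neon.vcvt?u) flavour. A hedged sketch contrasting the two directed modes, with a hypothetical name (floor_ceil_demo):

// Floor-style and ceiling-style f16 -> s16 conversions of the same input.
int16x4_t floor_ceil_demo(float16x4_t a) {
  int16x4_t lo = vcvtm_s16_f16(a);  // llvm.arm.neon.vcvtms.*: round toward -inf
  int16x4_t hi = vcvtp_s16_f16(a);  // llvm.arm.neon.vcvtps.*: round toward +inf
  return vadd_s16(lo, hi);          // arbitrary combine so both results are used
}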
// FIXME: Fix the zero constant when fp16 non-storage-only type becomes available.
-// CHECK-LABEL: test_vneg_f16
-// CHECK: [[NEG:%.*]] = fneg <4 x half> %a
-// CHECK: ret <4 x half> [[NEG]]
+// CHECK-LABEL: define dso_local <4 x half> @test_vneg_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[FNEG_I:%.*]] = fneg <4 x half> [[A]]
+// CHECK-NEXT: ret <4 x half> [[FNEG_I]]
+//
float16x4_t test_vneg_f16(float16x4_t a) {
return vneg_f16(a);
}
-// CHECK-LABEL: test_vnegq_f16
-// CHECK: [[NEG:%.*]] = fneg <8 x half> %a
-// CHECK: ret <8 x half> [[NEG]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vnegq_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[FNEG_I:%.*]] = fneg <8 x half> [[A]]
+// CHECK-NEXT: ret <8 x half> [[FNEG_I]]
+//
float16x8_t test_vnegq_f16(float16x8_t a) {
return vnegq_f16(a);
}
-// CHECK-LABEL: test_vrecpe_f16
-// CHECK: [[RCP:%.*]] = call <4 x half> @llvm.arm.neon.vrecpe.v4f16(<4 x half> %a)
-// CHECK: ret <4 x half> [[RCP]]
+// CHECK-LABEL: define dso_local <4 x half> @test_vrecpe_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRECPE_V1_I:%.*]] = call <4 x half> @llvm.arm.neon.vrecpe.v4f16(<4 x half> [[A]])
+// CHECK-NEXT: ret <4 x half> [[VRECPE_V1_I]]
+//
float16x4_t test_vrecpe_f16(float16x4_t a) {
return vrecpe_f16(a);
}
-// CHECK-LABEL: test_vrecpeq_f16
-// CHECK: [[RCP:%.*]] = call <8 x half> @llvm.arm.neon.vrecpe.v8f16(<8 x half> %a)
-// CHECK: ret <8 x half> [[RCP]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vrecpeq_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRECPEQ_V1_I:%.*]] = call <8 x half> @llvm.arm.neon.vrecpe.v8f16(<8 x half> [[A]])
+// CHECK-NEXT: ret <8 x half> [[VRECPEQ_V1_I]]
+//
float16x8_t test_vrecpeq_f16(float16x8_t a) {
return vrecpeq_f16(a);
}
-// CHECK-LABEL: test_vrnd_f16
-// CHECK: [[RND:%.*]] = call <4 x half> @llvm.arm.neon.vrintz.v4f16(<4 x half> %a)
-// CHECK: ret <4 x half> [[RND]]
+// CHECK-LABEL: define dso_local <4 x half> @test_vrnd_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRND_V1_I:%.*]] = call <4 x half> @llvm.arm.neon.vrintz.v4f16(<4 x half> [[A]])
+// CHECK-NEXT: ret <4 x half> [[VRND_V1_I]]
+//
float16x4_t test_vrnd_f16(float16x4_t a) {
return vrnd_f16(a);
}
-// CHECK-LABEL: test_vrndq_f16
-// CHECK: [[RND:%.*]] = call <8 x half> @llvm.arm.neon.vrintz.v8f16(<8 x half> %a)
-// CHECK: ret <8 x half> [[RND]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vrndq_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRNDQ_V1_I:%.*]] = call <8 x half> @llvm.arm.neon.vrintz.v8f16(<8 x half> [[A]])
+// CHECK-NEXT: ret <8 x half> [[VRNDQ_V1_I]]
+//
float16x8_t test_vrndq_f16(float16x8_t a) {
return vrndq_f16(a);
}
-// CHECK-LABEL: test_vrnda_f16
-// CHECK: [[RND:%.*]] = call <4 x half> @llvm.arm.neon.vrinta.v4f16(<4 x half> %a)
-// CHECK: ret <4 x half> [[RND]]
+// CHECK-LABEL: define dso_local <4 x half> @test_vrnda_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRNDA_V1_I:%.*]] = call <4 x half> @llvm.arm.neon.vrinta.v4f16(<4 x half> [[A]])
+// CHECK-NEXT: ret <4 x half> [[VRNDA_V1_I]]
+//
float16x4_t test_vrnda_f16(float16x4_t a) {
return vrnda_f16(a);
}
-// CHECK-LABEL: test_vrndaq_f16
-// CHECK: [[RND:%.*]] = call <8 x half> @llvm.arm.neon.vrinta.v8f16(<8 x half> %a)
-// CHECK: ret <8 x half> [[RND]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vrndaq_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRNDAQ_V1_I:%.*]] = call <8 x half> @llvm.arm.neon.vrinta.v8f16(<8 x half> [[A]])
+// CHECK-NEXT: ret <8 x half> [[VRNDAQ_V1_I]]
+//
float16x8_t test_vrndaq_f16(float16x8_t a) {
return vrndaq_f16(a);
}
-// CHECK-LABEL: test_vrndm_f16
-// CHECK: [[RND:%.*]] = call <4 x half> @llvm.arm.neon.vrintm.v4f16(<4 x half> %a)
-// CHECK: ret <4 x half> [[RND]]
+// CHECK-LABEL: define dso_local <4 x half> @test_vrndm_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRNDM_V1_I:%.*]] = call <4 x half> @llvm.arm.neon.vrintm.v4f16(<4 x half> [[A]])
+// CHECK-NEXT: ret <4 x half> [[VRNDM_V1_I]]
+//
float16x4_t test_vrndm_f16(float16x4_t a) {
return vrndm_f16(a);
}
-// CHECK-LABEL: test_vrndmq_f16
-// CHECK: [[RND:%.*]] = call <8 x half> @llvm.arm.neon.vrintm.v8f16(<8 x half> %a)
-// CHECK: ret <8 x half> [[RND]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vrndmq_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRNDMQ_V1_I:%.*]] = call <8 x half> @llvm.arm.neon.vrintm.v8f16(<8 x half> [[A]])
+// CHECK-NEXT: ret <8 x half> [[VRNDMQ_V1_I]]
+//
float16x8_t test_vrndmq_f16(float16x8_t a) {
return vrndmq_f16(a);
}
-// CHECK-LABEL: test_vrndn_f16
-// CHECK: [[RND:%.*]] = call <4 x half> @llvm.arm.neon.vrintn.v4f16(<4 x half> %a)
-// CHECK: ret <4 x half> [[RND]]
+// CHECK-LABEL: define dso_local <4 x half> @test_vrndn_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRNDN_V1_I:%.*]] = call <4 x half> @llvm.arm.neon.vrintn.v4f16(<4 x half> [[A]])
+// CHECK-NEXT: ret <4 x half> [[VRNDN_V1_I]]
+//
float16x4_t test_vrndn_f16(float16x4_t a) {
return vrndn_f16(a);
}
-// CHECK-LABEL: test_vrndnq_f16
-// CHECK: [[RND:%.*]] = call <8 x half> @llvm.arm.neon.vrintn.v8f16(<8 x half> %a)
-// CHECK: ret <8 x half> [[RND]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vrndnq_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRNDNQ_V1_I:%.*]] = call <8 x half> @llvm.arm.neon.vrintn.v8f16(<8 x half> [[A]])
+// CHECK-NEXT: ret <8 x half> [[VRNDNQ_V1_I]]
+//
float16x8_t test_vrndnq_f16(float16x8_t a) {
return vrndnq_f16(a);
}
-// CHECK-LABEL: test_vrndp_f16
-// CHECK: [[RND:%.*]] = call <4 x half> @llvm.arm.neon.vrintp.v4f16(<4 x half> %a)
-// CHECK: ret <4 x half> [[RND]]
+// CHECK-LABEL: define dso_local <4 x half> @test_vrndp_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRNDP_V1_I:%.*]] = call <4 x half> @llvm.arm.neon.vrintp.v4f16(<4 x half> [[A]])
+// CHECK-NEXT: ret <4 x half> [[VRNDP_V1_I]]
+//
float16x4_t test_vrndp_f16(float16x4_t a) {
return vrndp_f16(a);
}
-// CHECK-LABEL: test_vrndpq_f16
-// CHECK: [[RND:%.*]] = call <8 x half> @llvm.arm.neon.vrintp.v8f16(<8 x half> %a)
-// CHECK: ret <8 x half> [[RND]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vrndpq_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRNDPQ_V1_I:%.*]] = call <8 x half> @llvm.arm.neon.vrintp.v8f16(<8 x half> [[A]])
+// CHECK-NEXT: ret <8 x half> [[VRNDPQ_V1_I]]
+//
float16x8_t test_vrndpq_f16(float16x8_t a) {
return vrndpq_f16(a);
}
-// CHECK-LABEL: test_vrndx_f16
-// CHECK: [[RND:%.*]] = call <4 x half> @llvm.arm.neon.vrintx.v4f16(<4 x half> %a)
-// CHECK: ret <4 x half> [[RND]]
+// CHECK-LABEL: define dso_local <4 x half> @test_vrndx_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRNDX_V1_I:%.*]] = call <4 x half> @llvm.arm.neon.vrintx.v4f16(<4 x half> [[A]])
+// CHECK-NEXT: ret <4 x half> [[VRNDX_V1_I]]
+//
float16x4_t test_vrndx_f16(float16x4_t a) {
return vrndx_f16(a);
}
-// CHECK-LABEL: test_vrndxq_f16
-// CHECK: [[RND:%.*]] = call <8 x half> @llvm.arm.neon.vrintx.v8f16(<8 x half> %a)
-// CHECK: ret <8 x half> [[RND]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vrndxq_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRNDXQ_V1_I:%.*]] = call <8 x half> @llvm.arm.neon.vrintx.v8f16(<8 x half> [[A]])
+// CHECK-NEXT: ret <8 x half> [[VRNDXQ_V1_I]]
+//
float16x8_t test_vrndxq_f16(float16x8_t a) {
return vrndxq_f16(a);
}
-// CHECK-LABEL: test_vrsqrte_f16
-// CHECK: [[RND:%.*]] = call <4 x half> @llvm.arm.neon.vrsqrte.v4f16(<4 x half> %a)
-// CHECK: ret <4 x half> [[RND]]
+// CHECK-LABEL: define dso_local <4 x half> @test_vrsqrte_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSQRTE_V1_I:%.*]] = call <4 x half> @llvm.arm.neon.vrsqrte.v4f16(<4 x half> [[A]])
+// CHECK-NEXT: ret <4 x half> [[VRSQRTE_V1_I]]
+//
float16x4_t test_vrsqrte_f16(float16x4_t a) {
return vrsqrte_f16(a);
}
-// CHECK-LABEL: test_vrsqrteq_f16
-// CHECK: [[RND:%.*]] = call <8 x half> @llvm.arm.neon.vrsqrte.v8f16(<8 x half> %a)
-// CHECK: ret <8 x half> [[RND]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vrsqrteq_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSQRTEQ_V1_I:%.*]] = call <8 x half> @llvm.arm.neon.vrsqrte.v8f16(<8 x half> [[A]])
+// CHECK-NEXT: ret <8 x half> [[VRSQRTEQ_V1_I]]
+//
float16x8_t test_vrsqrteq_f16(float16x8_t a) {
return vrsqrteq_f16(a);
}
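vrecpe and vrsqrte likewise remain llvm.arm.neon intrinsics because they are hardware estimate instructions rather than exact IEEE operations; the low-precision seed is normally sharpened with vrecps/vrsqrts Newton steps, which this file does not exercise. A sketch of one refinement step, with a hypothetical name (recip_demo) and under the assumption that vrecps_f16 is available alongside the fullfp16 intrinsics used here:

// One Newton-Raphson step: x1 = x0 * (2 - a*x0); VRECPS computes (2 - a*x0).
float16x4_t recip_demo(float16x4_t a) {
  float16x4_t x0 = vrecpe_f16(a);          // low-precision 1/a estimate
  return vmul_f16(x0, vrecps_f16(a, x0));  // refined estimate
}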
-// CHECK-LABEL: test_vadd_f16
-// CHECK: [[ADD:%.*]] = fadd <4 x half> %a, %b
-// CHECK: ret <4 x half> [[ADD]]
+// CHECK-LABEL: define dso_local <4 x half> @test_vadd_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = fadd <4 x half> [[A]], [[B]]
+// CHECK-NEXT: ret <4 x half> [[ADD_I]]
+//
float16x4_t test_vadd_f16(float16x4_t a, float16x4_t b) {
return vadd_f16(a, b);
}
-// CHECK-LABEL: test_vaddq_f16
-// CHECK: [[ADD:%.*]] = fadd <8 x half> %a, %b
-// CHECK: ret <8 x half> [[ADD]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vaddq_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = fadd <8 x half> [[A]], [[B]]
+// CHECK-NEXT: ret <8 x half> [[ADD_I]]
+//
float16x8_t test_vaddq_f16(float16x8_t a, float16x8_t b) {
return vaddq_f16(a, b);
}
-// CHECK-LABEL: test_vabd_f16
-// CHECK: [[ABD:%.*]] = call <4 x half> @llvm.arm.neon.vabds.v4f16(<4 x half> %a, <4 x half> %b)
-// CHECK: ret <4 x half> [[ABD]]
+// CHECK-LABEL: define dso_local <4 x half> @test_vabd_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD_V2_I:%.*]] = call <4 x half> @llvm.arm.neon.vabds.v4f16(<4 x half> [[A]], <4 x half> [[B]])
+// CHECK-NEXT: ret <4 x half> [[VABD_V2_I]]
+//
float16x4_t test_vabd_f16(float16x4_t a, float16x4_t b) {
return vabd_f16(a, b);
}
-// CHECK-LABEL: test_vabdq_f16
-// CHECK: [[ABD:%.*]] = call <8 x half> @llvm.arm.neon.vabds.v8f16(<8 x half> %a, <8 x half> %b)
-// CHECK: ret <8 x half> [[ABD]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vabdq_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABDQ_V2_I:%.*]] = call <8 x half> @llvm.arm.neon.vabds.v8f16(<8 x half> [[A]], <8 x half> [[B]])
+// CHECK-NEXT: ret <8 x half> [[VABDQ_V2_I]]
+//
float16x8_t test_vabdq_f16(float16x8_t a, float16x8_t b) {
return vabdq_f16(a, b);
}
-// CHECK-LABEL: test_vcage_f16
-// CHECK: [[ABS:%.*]] = call <4 x i16> @llvm.arm.neon.vacge.v4i16.v4f16(<4 x half> %a, <4 x half> %b)
-// CHECK: ret <4 x i16> [[ABS]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vcage_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCAGE_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vacge.v4i16.v4f16(<4 x half> [[A]], <4 x half> [[B]])
+// CHECK-NEXT: ret <4 x i16> [[VCAGE_V2_I]]
+//
uint16x4_t test_vcage_f16(float16x4_t a, float16x4_t b) {
return vcage_f16(a, b);
}
-// CHECK-LABEL: test_vcageq_f16
-// CHECK: [[ABS:%.*]] = call <8 x i16> @llvm.arm.neon.vacge.v8i16.v8f16(<8 x half> %a, <8 x half> %b)
-// CHECK: ret <8 x i16> [[ABS]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vcageq_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCAGEQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vacge.v8i16.v8f16(<8 x half> [[A]], <8 x half> [[B]])
+// CHECK-NEXT: ret <8 x i16> [[VCAGEQ_V2_I]]
+//
uint16x8_t test_vcageq_f16(float16x8_t a, float16x8_t b) {
return vcageq_f16(a, b);
}
-// CHECK-LABEL: test_vcagt_f16
-// CHECK: [[ABS:%.*]] = call <4 x i16> @llvm.arm.neon.vacgt.v4i16.v4f16(<4 x half> %a, <4 x half> %b)
-// CHECK: ret <4 x i16> [[ABS]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vcagt_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCAGT_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vacgt.v4i16.v4f16(<4 x half> [[A]], <4 x half> [[B]])
+// CHECK-NEXT: ret <4 x i16> [[VCAGT_V2_I]]
+//
uint16x4_t test_vcagt_f16(float16x4_t a, float16x4_t b) {
return vcagt_f16(a, b);
}
-// CHECK-LABEL: test_vcagtq_f16
-// CHECK: [[ABS:%.*]] = call <8 x i16> @llvm.arm.neon.vacgt.v8i16.v8f16(<8 x half> %a, <8 x half> %b)
-// CHECK: ret <8 x i16> [[ABS]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vcagtq_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCAGTQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vacgt.v8i16.v8f16(<8 x half> [[A]], <8 x half> [[B]])
+// CHECK-NEXT: ret <8 x i16> [[VCAGTQ_V2_I]]
+//
uint16x8_t test_vcagtq_f16(float16x8_t a, float16x8_t b) {
return vcagtq_f16(a, b);
}
-// CHECK-LABEL: test_vcale_f16
-// CHECK: [[ABS:%.*]] = call <4 x i16> @llvm.arm.neon.vacge.v4i16.v4f16(<4 x half> %b, <4 x half> %a)
-// CHECK: ret <4 x i16> [[ABS]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vcale_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCALE_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vacge.v4i16.v4f16(<4 x half> [[B]], <4 x half> [[A]])
+// CHECK-NEXT: ret <4 x i16> [[VCALE_V2_I]]
+//
uint16x4_t test_vcale_f16(float16x4_t a, float16x4_t b) {
return vcale_f16(a, b);
}
-// CHECK-LABEL: test_vcaleq_f16
-// CHECK: [[ABS:%.*]] = call <8 x i16> @llvm.arm.neon.vacge.v8i16.v8f16(<8 x half> %b, <8 x half> %a)
-// CHECK: ret <8 x i16> [[ABS]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vcaleq_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCALEQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vacge.v8i16.v8f16(<8 x half> [[B]], <8 x half> [[A]])
+// CHECK-NEXT: ret <8 x i16> [[VCALEQ_V2_I]]
+//
uint16x8_t test_vcaleq_f16(float16x8_t a, float16x8_t b) {
return vcaleq_f16(a, b);
}
-// CHECK-LABEL: test_vcalt_f16
-// CHECK: [[ABS:%.*]] = call <4 x i16> @llvm.arm.neon.vacgt.v4i16.v4f16(<4 x half> %b, <4 x half> %a)
-// CHECK: ret <4 x i16> [[ABS]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vcalt_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCALT_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vacgt.v4i16.v4f16(<4 x half> [[B]], <4 x half> [[A]])
+// CHECK-NEXT: ret <4 x i16> [[VCALT_V2_I]]
+//
uint16x4_t test_vcalt_f16(float16x4_t a, float16x4_t b) {
return vcalt_f16(a, b);
}
-// CHECK-LABEL: test_vcaltq_f16
-// CHECK: [[ABS:%.*]] = call <8 x i16> @llvm.arm.neon.vacgt.v8i16.v8f16(<8 x half> %b, <8 x half> %a)
-// CHECK: ret <8 x i16> [[ABS]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vcaltq_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCALTQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vacgt.v8i16.v8f16(<8 x half> [[B]], <8 x half> [[A]])
+// CHECK-NEXT: ret <8 x i16> [[VCALTQ_V2_I]]
+//
uint16x8_t test_vcaltq_f16(float16x8_t a, float16x8_t b) {
return vcaltq_f16(a, b);
}
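The vcale/vcalt checks make the operand trick visible: there is no dedicated "absolute less-or-equal" operation, so vcale_f16(a, b) is emitted as llvm.arm.neon.vacge(b, a) and vcalt_f16(a, b) as vacgt(b, a), computing |a| <= |b| as |b| >= |a|. Sketch of the equivalence, with a hypothetical name (cale_equiv_demo):

// Both masks are identical lane for lane; the builtin merely swaps operands.
uint16x4_t cale_equiv_demo(float16x4_t a, float16x4_t b) {
  uint16x4_t m1 = vcale_f16(a, b);  // lowered as vacge(b, a)
  uint16x4_t m2 = vcage_f16(b, a);  // the swap written out explicitly
  return vand_u16(m1, m2);          // equal masks, so this just returns m1
}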
-// CHECK-LABEL: test_vceq_f16
-// CHECK: [[TMP1:%.*]] = fcmp oeq <4 x half> %a, %b
-// CHECK: [[TMP2:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP2]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vceq_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = fcmp oeq <4 x half> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[SEXT_I]]
+//
uint16x4_t test_vceq_f16(float16x4_t a, float16x4_t b) {
return vceq_f16(a, b);
}
-// CHECK-LABEL: test_vceqq_f16
-// CHECK: [[TMP1:%.*]] = fcmp oeq <8 x half> %a, %b
-// CHECK: [[TMP2:%.*]] = sext <8 x i1> [[TMP1:%.*]] to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP2]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vceqq_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = fcmp oeq <8 x half> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[SEXT_I]]
+//
uint16x8_t test_vceqq_f16(float16x8_t a, float16x8_t b) {
return vceqq_f16(a, b);
}
-// CHECK-LABEL: test_vcge_f16
-// CHECK: [[TMP1:%.*]] = fcmp oge <4 x half> %a, %b
-// CHECK: [[TMP2:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP2]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vcge_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = fcmp oge <4 x half> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[SEXT_I]]
+//
uint16x4_t test_vcge_f16(float16x4_t a, float16x4_t b) {
return vcge_f16(a, b);
}
-// CHECK-LABEL: test_vcgeq_f16
-// CHECK: [[TMP1:%.*]] = fcmp oge <8 x half> %a, %b
-// CHECK: [[TMP2:%.*]] = sext <8 x i1> [[TMP1:%.*]] to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP2]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vcgeq_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = fcmp oge <8 x half> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[SEXT_I]]
+//
uint16x8_t test_vcgeq_f16(float16x8_t a, float16x8_t b) {
return vcgeq_f16(a, b);
}
-// CHECK-LABEL: test_vcgt_f16
-// CHECK: [[TMP1:%.*]] = fcmp ogt <4 x half> %a, %b
-// CHECK: [[TMP2:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP2]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vcgt_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = fcmp ogt <4 x half> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[SEXT_I]]
+//
uint16x4_t test_vcgt_f16(float16x4_t a, float16x4_t b) {
return vcgt_f16(a, b);
}
-// CHECK-LABEL: test_vcgtq_f16
-// CHECK: [[TMP1:%.*]] = fcmp ogt <8 x half> %a, %b
-// CHECK: [[TMP2:%.*]] = sext <8 x i1> [[TMP1:%.*]] to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP2]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vcgtq_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = fcmp ogt <8 x half> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[SEXT_I]]
+//
uint16x8_t test_vcgtq_f16(float16x8_t a, float16x8_t b) {
return vcgtq_f16(a, b);
}
-// CHECK-LABEL: test_vcle_f16
-// CHECK: [[TMP1:%.*]] = fcmp ole <4 x half> %a, %b
-// CHECK: [[TMP2:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP2]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vcle_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = fcmp ole <4 x half> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[SEXT_I]]
+//
uint16x4_t test_vcle_f16(float16x4_t a, float16x4_t b) {
return vcle_f16(a, b);
}
-// CHECK-LABEL: test_vcleq_f16
-// CHECK: [[TMP1:%.*]] = fcmp ole <8 x half> %a, %b
-// CHECK: [[TMP2:%.*]] = sext <8 x i1> [[TMP1:%.*]] to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP2]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vcleq_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = fcmp ole <8 x half> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[SEXT_I]]
+//
uint16x8_t test_vcleq_f16(float16x8_t a, float16x8_t b) {
return vcleq_f16(a, b);
}
-// CHECK-LABEL: test_vclt_f16
-// CHECK: [[TMP1:%.*]] = fcmp olt <4 x half> %a, %b
-// CHECK: [[TMP2:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP2]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vclt_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = fcmp olt <4 x half> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[SEXT_I]]
+//
uint16x4_t test_vclt_f16(float16x4_t a, float16x4_t b) {
return vclt_f16(a, b);
}
-// CHECK-LABEL: test_vcltq_f16
-// CHECK: [[TMP1:%.*]] = fcmp olt <8 x half> %a, %b
-// CHECK: [[TMP2:%.*]] = sext <8 x i1> [[TMP1:%.*]] to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP2]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vcltq_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = fcmp olt <8 x half> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[SEXT_I]]
+//
uint16x8_t test_vcltq_f16(float16x8_t a, float16x8_t b) {
return vcltq_f16(a, b);
}
-// CHECK-LABEL: test_vcvt_n_f16_s16
-// CHECK: [[CVT:%.*]] = call <4 x half> @llvm.arm.neon.vcvtfxs2fp.v4f16.v4i16(<4 x i16> %vcvt_n, i32 2)
-// CHECK: ret <4 x half> [[CVT]]
+// CHECK-LABEL: define dso_local <4 x half> @test_vcvt_n_f16_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVT_N1:%.*]] = call <4 x half> @llvm.arm.neon.vcvtfxs2fp.v4f16.v4i16(<4 x i16> [[A]], i32 2)
+// CHECK-NEXT: ret <4 x half> [[VCVT_N1]]
+//
float16x4_t test_vcvt_n_f16_s16(int16x4_t a) {
return vcvt_n_f16_s16(a, 2);
}
-// CHECK-LABEL: test_vcvtq_n_f16_s16
-// CHECK: [[CVT:%.*]] = call <8 x half> @llvm.arm.neon.vcvtfxs2fp.v8f16.v8i16(<8 x i16> %vcvt_n, i32 2)
-// CHECK: ret <8 x half> [[CVT]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vcvtq_n_f16_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVT_N1:%.*]] = call <8 x half> @llvm.arm.neon.vcvtfxs2fp.v8f16.v8i16(<8 x i16> [[A]], i32 2)
+// CHECK-NEXT: ret <8 x half> [[VCVT_N1]]
+//
float16x8_t test_vcvtq_n_f16_s16(int16x8_t a) {
return vcvtq_n_f16_s16(a, 2);
}
-// CHECK-LABEL: test_vcvt_n_f16_u16
-// CHECK: [[CVT:%.*]] = call <4 x half> @llvm.arm.neon.vcvtfxu2fp.v4f16.v4i16(<4 x i16> %vcvt_n, i32 2)
-// CHECK: ret <4 x half> [[CVT]]
+// CHECK-LABEL: define dso_local <4 x half> @test_vcvt_n_f16_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVT_N1:%.*]] = call <4 x half> @llvm.arm.neon.vcvtfxu2fp.v4f16.v4i16(<4 x i16> [[A]], i32 2)
+// CHECK-NEXT: ret <4 x half> [[VCVT_N1]]
+//
float16x4_t test_vcvt_n_f16_u16(uint16x4_t a) {
return vcvt_n_f16_u16(a, 2);
}
-// CHECK-LABEL: test_vcvtq_n_f16_u16
-// CHECK: [[CVT:%.*]] = call <8 x half> @llvm.arm.neon.vcvtfxu2fp.v8f16.v8i16(<8 x i16> %vcvt_n, i32 2)
-// CHECK: ret <8 x half> [[CVT]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vcvtq_n_f16_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVT_N1:%.*]] = call <8 x half> @llvm.arm.neon.vcvtfxu2fp.v8f16.v8i16(<8 x i16> [[A]], i32 2)
+// CHECK-NEXT: ret <8 x half> [[VCVT_N1]]
+//
float16x8_t test_vcvtq_n_f16_u16(uint16x8_t a) {
return vcvtq_n_f16_u16(a, 2);
}
-// CHECK-LABEL: test_vcvt_n_s16_f16
-// CHECK: [[CVT:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2fxs.v4i16.v4f16(<4 x half> %vcvt_n, i32 2)
-// CHECK: ret <4 x i16> [[CVT]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vcvt_n_s16_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVT_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2fxs.v4i16.v4f16(<4 x half> [[A]], i32 2)
+// CHECK-NEXT: ret <4 x i16> [[VCVT_N1]]
+//
int16x4_t test_vcvt_n_s16_f16(float16x4_t a) {
return vcvt_n_s16_f16(a, 2);
}
-// CHECK-LABEL: test_vcvtq_n_s16_f16
-// CHECK: [[CVT:%.*]] = call <8 x i16> @llvm.arm.neon.vcvtfp2fxs.v8i16.v8f16(<8 x half> %vcvt_n, i32 2)
-// CHECK: ret <8 x i16> [[CVT]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vcvtq_n_s16_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVT_N1:%.*]] = call <8 x i16> @llvm.arm.neon.vcvtfp2fxs.v8i16.v8f16(<8 x half> [[A]], i32 2)
+// CHECK-NEXT: ret <8 x i16> [[VCVT_N1]]
+//
int16x8_t test_vcvtq_n_s16_f16(float16x8_t a) {
return vcvtq_n_s16_f16(a, 2);
}
-// CHECK-LABEL: test_vcvt_n_u16_f16
-// CHECK: [[CVT:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2fxu.v4i16.v4f16(<4 x half> %vcvt_n, i32 2)
-// CHECK: ret <4 x i16> [[CVT]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vcvt_n_u16_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVT_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2fxu.v4i16.v4f16(<4 x half> [[A]], i32 2)
+// CHECK-NEXT: ret <4 x i16> [[VCVT_N1]]
+//
uint16x4_t test_vcvt_n_u16_f16(float16x4_t a) {
return vcvt_n_u16_f16(a, 2);
}
-// CHECK-LABEL: test_vcvtq_n_u16_f16
-// CHECK: [[CVT:%.*]] = call <8 x i16> @llvm.arm.neon.vcvtfp2fxu.v8i16.v8f16(<8 x half> %vcvt_n, i32 2)
-// CHECK: ret <8 x i16> [[CVT]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vcvtq_n_u16_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVT_N1:%.*]] = call <8 x i16> @llvm.arm.neon.vcvtfp2fxu.v8i16.v8f16(<8 x half> [[A]], i32 2)
+// CHECK-NEXT: ret <8 x i16> [[VCVT_N1]]
+//
uint16x8_t test_vcvtq_n_u16_f16(float16x8_t a) {
return vcvtq_n_u16_f16(a, 2);
}
-// CHECK-LABEL: test_vmax_f16
-// CHECK: [[MAX:%.*]] = call <4 x half> @llvm.arm.neon.vmaxs.v4f16(<4 x half> %a, <4 x half> %b)
-// CHECK: ret <4 x half> [[MAX]]
+// CHECK-LABEL: define dso_local <4 x half> @test_vmax_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMAX_V2_I:%.*]] = call <4 x half> @llvm.arm.neon.vmaxs.v4f16(<4 x half> [[A]], <4 x half> [[B]])
+// CHECK-NEXT: ret <4 x half> [[VMAX_V2_I]]
+//
float16x4_t test_vmax_f16(float16x4_t a, float16x4_t b) {
return vmax_f16(a, b);
}
-// CHECK-LABEL: test_vmaxq_f16
-// CHECK: [[MAX:%.*]] = call <8 x half> @llvm.arm.neon.vmaxs.v8f16(<8 x half> %a, <8 x half> %b)
-// CHECK: ret <8 x half> [[MAX]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vmaxq_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMAXQ_V2_I:%.*]] = call <8 x half> @llvm.arm.neon.vmaxs.v8f16(<8 x half> [[A]], <8 x half> [[B]])
+// CHECK-NEXT: ret <8 x half> [[VMAXQ_V2_I]]
+//
float16x8_t test_vmaxq_f16(float16x8_t a, float16x8_t b) {
return vmaxq_f16(a, b);
}
-// CHECK-LABEL: test_vmaxnm_f16
-// CHECK: [[MAX:%.*]] = call <4 x half> @llvm.arm.neon.vmaxnm.v4f16(<4 x half> %a, <4 x half> %b)
-// CHECK: ret <4 x half> [[MAX]]
+// CHECK-LABEL: define dso_local <4 x half> @test_vmaxnm_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMAXNM_V2_I:%.*]] = call <4 x half> @llvm.arm.neon.vmaxnm.v4f16(<4 x half> [[A]], <4 x half> [[B]])
+// CHECK-NEXT: ret <4 x half> [[VMAXNM_V2_I]]
+//
float16x4_t test_vmaxnm_f16(float16x4_t a, float16x4_t b) {
return vmaxnm_f16(a, b);
}
-// CHECK-LABEL: test_vmaxnmq_f16
-// CHECK: [[MAX:%.*]] = call <8 x half> @llvm.arm.neon.vmaxnm.v8f16(<8 x half> %a, <8 x half> %b)
-// CHECK: ret <8 x half> [[MAX]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vmaxnmq_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMAXNMQ_V2_I:%.*]] = call <8 x half> @llvm.arm.neon.vmaxnm.v8f16(<8 x half> [[A]], <8 x half> [[B]])
+// CHECK-NEXT: ret <8 x half> [[VMAXNMQ_V2_I]]
+//
float16x8_t test_vmaxnmq_f16(float16x8_t a, float16x8_t b) {
return vmaxnmq_f16(a, b);
}
-// CHECK-LABEL: test_vmin_f16
-// CHECK: [[MIN:%.*]] = call <4 x half> @llvm.arm.neon.vmins.v4f16(<4 x half> %a, <4 x half> %b)
-// CHECK: ret <4 x half> [[MIN]]
+// CHECK-LABEL: define dso_local <4 x half> @test_vmin_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMIN_V2_I:%.*]] = call <4 x half> @llvm.arm.neon.vmins.v4f16(<4 x half> [[A]], <4 x half> [[B]])
+// CHECK-NEXT: ret <4 x half> [[VMIN_V2_I]]
+//
float16x4_t test_vmin_f16(float16x4_t a, float16x4_t b) {
return vmin_f16(a, b);
}
-// CHECK-LABEL: test_vminq_f16
-// CHECK: [[MIN:%.*]] = call <8 x half> @llvm.arm.neon.vmins.v8f16(<8 x half> %a, <8 x half> %b)
-// CHECK: ret <8 x half> [[MIN]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vminq_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMINQ_V2_I:%.*]] = call <8 x half> @llvm.arm.neon.vmins.v8f16(<8 x half> [[A]], <8 x half> [[B]])
+// CHECK-NEXT: ret <8 x half> [[VMINQ_V2_I]]
+//
float16x8_t test_vminq_f16(float16x8_t a, float16x8_t b) {
return vminq_f16(a, b);
}
-// CHECK-LABEL: test_vminnm_f16
-// CHECK: [[MIN:%.*]] = call <4 x half> @llvm.arm.neon.vminnm.v4f16(<4 x half> %a, <4 x half> %b)
-// CHECK: ret <4 x half> [[MIN]]
+// CHECK-LABEL: define dso_local <4 x half> @test_vminnm_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMINNM_V2_I:%.*]] = call <4 x half> @llvm.arm.neon.vminnm.v4f16(<4 x half> [[A]], <4 x half> [[B]])
+// CHECK-NEXT: ret <4 x half> [[VMINNM_V2_I]]
+//
float16x4_t test_vminnm_f16(float16x4_t a, float16x4_t b) {
return vminnm_f16(a, b);
}
-// CHECK-LABEL: test_vminnmq_f16
-// CHECK: [[MIN:%.*]] = call <8 x half> @llvm.arm.neon.vminnm.v8f16(<8 x half> %a, <8 x half> %b)
-// CHECK: ret <8 x half> [[MIN]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vminnmq_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMINNMQ_V2_I:%.*]] = call <8 x half> @llvm.arm.neon.vminnm.v8f16(<8 x half> [[A]], <8 x half> [[B]])
+// CHECK-NEXT: ret <8 x half> [[VMINNMQ_V2_I]]
+//
float16x8_t test_vminnmq_f16(float16x8_t a, float16x8_t b) {
return vminnmq_f16(a, b);
}
-// CHECK-LABEL: test_vmul_f16
-// CHECK: [[MUL:%.*]] = fmul <4 x half> %a, %b
-// CHECK: ret <4 x half> [[MUL]]
+// CHECK-LABEL: define dso_local <4 x half> @test_vmul_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = fmul <4 x half> [[A]], [[B]]
+// CHECK-NEXT: ret <4 x half> [[MUL_I]]
+//
float16x4_t test_vmul_f16(float16x4_t a, float16x4_t b) {
return vmul_f16(a, b);
}
-// CHECK-LABEL: test_vmulq_f16
-// CHECK: [[MUL:%.*]] = fmul <8 x half> %a, %b
-// CHECK: ret <8 x half> [[MUL]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vmulq_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = fmul <8 x half> [[A]], [[B]]
+// CHECK-NEXT: ret <8 x half> [[MUL_I]]
+//
float16x8_t test_vmulq_f16(float16x8_t a, float16x8_t b) {
return vmulq_f16(a, b);
}
-// CHECK-LABEL: test_vpadd_f16
-// CHECK: [[ADD:%.*]] = call <4 x half> @llvm.arm.neon.vpadd.v4f16(<4 x half> %a, <4 x half> %b)
-// CHECK: ret <4 x half> [[ADD]]
+// CHECK-LABEL: define dso_local <4 x half> @test_vpadd_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPADD_V2_I:%.*]] = call <4 x half> @llvm.arm.neon.vpadd.v4f16(<4 x half> [[A]], <4 x half> [[B]])
+// CHECK-NEXT: ret <4 x half> [[VPADD_V2_I]]
+//
float16x4_t test_vpadd_f16(float16x4_t a, float16x4_t b) {
return vpadd_f16(a, b);
}
-// CHECK-LABEL: test_vpmax_f16
-// CHECK: [[MAX:%.*]] = call <4 x half> @llvm.arm.neon.vpmaxs.v4f16(<4 x half> %a, <4 x half> %b)
-// CHECK: ret <4 x half> [[MAX]]
+// CHECK-LABEL: define dso_local <4 x half> @test_vpmax_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPMAX_V2_I:%.*]] = call <4 x half> @llvm.arm.neon.vpmaxs.v4f16(<4 x half> [[A]], <4 x half> [[B]])
+// CHECK-NEXT: ret <4 x half> [[VPMAX_V2_I]]
+//
float16x4_t test_vpmax_f16(float16x4_t a, float16x4_t b) {
return vpmax_f16(a, b);
}
-// CHECK-LABEL: test_vpmin_f16
-// CHECK: [[MIN:%.*]] = call <4 x half> @llvm.arm.neon.vpmins.v4f16(<4 x half> %a, <4 x half> %b)
-// CHECK: ret <4 x half> [[MIN]]
+// CHECK-LABEL: define dso_local <4 x half> @test_vpmin_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPMIN_V2_I:%.*]] = call <4 x half> @llvm.arm.neon.vpmins.v4f16(<4 x half> [[A]], <4 x half> [[B]])
+// CHECK-NEXT: ret <4 x half> [[VPMIN_V2_I]]
+//
float16x4_t test_vpmin_f16(float16x4_t a, float16x4_t b) {
return vpmin_f16(a, b);
}
-// CHECK-LABEL: test_vrecps_f16
-// CHECK: [[MIN:%.*]] = call <4 x half> @llvm.arm.neon.vrecps.v4f16(<4 x half> %a, <4 x half> %b)
-// CHECK: ret <4 x half> [[MIN]]
+// CHECK-LABEL: define dso_local <4 x half> @test_vrecps_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRECPS_V2_I:%.*]] = call <4 x half> @llvm.arm.neon.vrecps.v4f16(<4 x half> [[A]], <4 x half> [[B]])
+// CHECK-NEXT: ret <4 x half> [[VRECPS_V2_I]]
+//
float16x4_t test_vrecps_f16(float16x4_t a, float16x4_t b) {
return vrecps_f16(a, b);
}
-// CHECK-LABEL: test_vrecpsq_f16
-// CHECK: [[MIN:%.*]] = call <8 x half> @llvm.arm.neon.vrecps.v8f16(<8 x half> %a, <8 x half> %b)
-// CHECK: ret <8 x half> [[MIN]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vrecpsq_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRECPSQ_V2_I:%.*]] = call <8 x half> @llvm.arm.neon.vrecps.v8f16(<8 x half> [[A]], <8 x half> [[B]])
+// CHECK-NEXT: ret <8 x half> [[VRECPSQ_V2_I]]
+//
float16x8_t test_vrecpsq_f16(float16x8_t a, float16x8_t b) {
return vrecpsq_f16(a, b);
}
-// CHECK-LABEL: test_vrsqrts_f16
-// CHECK: [[MIN:%.*]] = call <4 x half> @llvm.arm.neon.vrsqrts.v4f16(<4 x half> %a, <4 x half> %b)
-// CHECK: ret <4 x half> [[MIN]]
+// CHECK-LABEL: define dso_local <4 x half> @test_vrsqrts_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSQRTS_V2_I:%.*]] = call <4 x half> @llvm.arm.neon.vrsqrts.v4f16(<4 x half> [[A]], <4 x half> [[B]])
+// CHECK-NEXT: ret <4 x half> [[VRSQRTS_V2_I]]
+//
float16x4_t test_vrsqrts_f16(float16x4_t a, float16x4_t b) {
return vrsqrts_f16(a, b);
}
-// CHECK-LABEL: test_vrsqrtsq_f16
-// CHECK: [[MIN:%.*]] = call <8 x half> @llvm.arm.neon.vrsqrts.v8f16(<8 x half> %a, <8 x half> %b)
-// CHECK: ret <8 x half> [[MIN]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vrsqrtsq_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSQRTSQ_V2_I:%.*]] = call <8 x half> @llvm.arm.neon.vrsqrts.v8f16(<8 x half> [[A]], <8 x half> [[B]])
+// CHECK-NEXT: ret <8 x half> [[VRSQRTSQ_V2_I]]
+//
float16x8_t test_vrsqrtsq_f16(float16x8_t a, float16x8_t b) {
return vrsqrtsq_f16(a, b);
}
-// CHECK-LABEL: test_vsub_f16
-// CHECK: [[ADD:%.*]] = fsub <4 x half> %a, %b
-// CHECK: ret <4 x half> [[ADD]]
+// CHECK-LABEL: define dso_local <4 x half> @test_vsub_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = fsub <4 x half> [[A]], [[B]]
+// CHECK-NEXT: ret <4 x half> [[SUB_I]]
+//
float16x4_t test_vsub_f16(float16x4_t a, float16x4_t b) {
return vsub_f16(a, b);
}
-// CHECK-LABEL: test_vsubq_f16
-// CHECK: [[ADD:%.*]] = fsub <8 x half> %a, %b
-// CHECK: ret <8 x half> [[ADD]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vsubq_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = fsub <8 x half> [[A]], [[B]]
+// CHECK-NEXT: ret <8 x half> [[SUB_I]]
+//
float16x8_t test_vsubq_f16(float16x8_t a, float16x8_t b) {
return vsubq_f16(a, b);
}
-// CHECK-LABEL: test_vfma_f16
-// CHECK: [[ADD:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> %b, <4 x half> %c, <4 x half> %a)
-// CHECK: ret <4 x half> [[ADD]]
+// CHECK-LABEL: define dso_local <4 x half> @test_vfma_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[B]], <4 x half> [[C]], <4 x half> [[A]])
+// CHECK-NEXT: ret <4 x half> [[TMP0]]
+//
float16x4_t test_vfma_f16(float16x4_t a, float16x4_t b, float16x4_t c) {
return vfma_f16(a, b, c);
}
-// CHECK-LABEL: test_vfmaq_f16
-// CHECK: [[ADD:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> %b, <8 x half> %c, <8 x half> %a)
-// CHECK: ret <8 x half> [[ADD]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vfmaq_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[B]], <8 x half> [[C]], <8 x half> [[A]])
+// CHECK-NEXT: ret <8 x half> [[TMP0]]
+//
float16x8_t test_vfmaq_f16(float16x8_t a, float16x8_t b, float16x8_t c) {
return vfmaq_f16(a, b, c);
}
-// CHECK-LABEL: test_vfms_f16
-// CHECK: [[SUB:%.*]] = fneg <4 x half> %b
-// CHECK: [[ADD:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[SUB]], <4 x half> %c, <4 x half> %a)
-// CHECK: ret <4 x half> [[ADD]]
+// CHECK-LABEL: define dso_local <4 x half> @test_vfms_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[FNEG_I:%.*]] = fneg <4 x half> [[B]]
+// CHECK-NEXT: [[TMP0:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[FNEG_I]], <4 x half> [[C]], <4 x half> [[A]])
+// CHECK-NEXT: ret <4 x half> [[TMP0]]
+//
float16x4_t test_vfms_f16(float16x4_t a, float16x4_t b, float16x4_t c) {
return vfms_f16(a, b, c);
}
-// CHECK-LABEL: test_vfmsq_f16
-// CHECK: [[SUB:%.*]] = fneg <8 x half> %b
-// CHECK: [[ADD:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[SUB]], <8 x half> %c, <8 x half> %a)
-// CHECK: ret <8 x half> [[ADD]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vfmsq_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[FNEG_I:%.*]] = fneg <8 x half> [[B]]
+// CHECK-NEXT: [[TMP0:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[FNEG_I]], <8 x half> [[C]], <8 x half> [[A]])
+// CHECK-NEXT: ret <8 x half> [[TMP0]]
+//
float16x8_t test_vfmsq_f16(float16x8_t a, float16x8_t b, float16x8_t c) {
return vfmsq_f16(a, b, c);
}
-// CHECK-LABEL: test_vmul_lane_f16
-// CHECK: [[TMP0:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
-// CHECK: [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK: [[MUL:%.*]] = fmul <4 x half> [[A:%.*]], [[LANE]]
-// CHECK: ret <4 x half> [[MUL]]
+// CHECK-LABEL: define dso_local <4 x half> @test_vmul_lane_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[B]], <4 x half> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[MUL:%.*]] = fmul <4 x half> [[A]], [[LANE]]
+// CHECK-NEXT: ret <4 x half> [[MUL]]
+//
float16x4_t test_vmul_lane_f16(float16x4_t a, float16x4_t b) {
return vmul_lane_f16(a, b, 3);
}
-// CHECK-LABEL: test_vmulq_lane_f16
-// CHECK: [[TMP0:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
-// CHECK: [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-// CHECK: [[MUL:%.*]] = fmul <8 x half> [[A:%.*]], [[LANE]]
-// CHECK: ret <8 x half> [[MUL]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vmulq_lane_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[B]], <4 x half> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[MUL:%.*]] = fmul <8 x half> [[A]], [[LANE]]
+// CHECK-NEXT: ret <8 x half> [[MUL]]
+//
float16x8_t test_vmulq_lane_f16(float16x8_t a, float16x4_t b) {
return vmulq_lane_f16(a, b, 3);
}
-// CHECK-LABEL: test_vmul_n_f16
-// CHECK: [[TMP0:%.*]] = insertelement <4 x half> poison, half [[b:%.*]], i32 0
-// CHECK: [[TMP1:%.*]] = insertelement <4 x half> [[TMP0]], half [[b]], i32 1
-// CHECK: [[TMP2:%.*]] = insertelement <4 x half> [[TMP1]], half [[b]], i32 2
-// CHECK: [[TMP3:%.*]] = insertelement <4 x half> [[TMP2]], half [[b]], i32 3
-// CHECK: [[MUL:%.*]] = fmul <4 x half> %a, [[TMP3]]
-// CHECK: ret <4 x half> [[MUL]]
+// CHECK-LABEL: define dso_local <4 x half> @test_vmul_n_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]], half noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[B]], i64 0
+// CHECK-NEXT: [[VECINIT3:%.*]] = shufflevector <4 x half> [[VECINIT]], <4 x half> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[MUL:%.*]] = fmul <4 x half> [[A]], [[VECINIT3]]
+// CHECK-NEXT: ret <4 x half> [[MUL]]
+//
float16x4_t test_vmul_n_f16(float16x4_t a, float16_t b) {
return vmul_n_f16(a, b);
}
-// CHECK-LABEL: test_vmulq_n_f16
-// CHECK: [[TMP0:%.*]] = insertelement <8 x half> poison, half [[b:%.*]], i32 0
-// CHECK: [[TMP1:%.*]] = insertelement <8 x half> [[TMP0]], half [[b]], i32 1
-// CHECK: [[TMP2:%.*]] = insertelement <8 x half> [[TMP1]], half [[b]], i32 2
-// CHECK: [[TMP3:%.*]] = insertelement <8 x half> [[TMP2]], half [[b]], i32 3
-// CHECK: [[TMP4:%.*]] = insertelement <8 x half> [[TMP3]], half [[b]], i32 4
-// CHECK: [[TMP5:%.*]] = insertelement <8 x half> [[TMP4]], half [[b]], i32 5
-// CHECK: [[TMP6:%.*]] = insertelement <8 x half> [[TMP5]], half [[b]], i32 6
-// CHECK: [[TMP7:%.*]] = insertelement <8 x half> [[TMP6]], half [[b]], i32 7
-// CHECK: [[MUL:%.*]] = fmul <8 x half> %a, [[TMP7]]
-// CHECK: ret <8 x half> [[MUL]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vmulq_n_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]], half noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[B]], i64 0
+// CHECK-NEXT: [[VECINIT7:%.*]] = shufflevector <8 x half> [[VECINIT]], <8 x half> poison, <8 x i32> zeroinitializer
+// CHECK-NEXT: [[MUL:%.*]] = fmul <8 x half> [[A]], [[VECINIT7]]
+// CHECK-NEXT: ret <8 x half> [[MUL]]
+//
float16x8_t test_vmulq_n_f16(float16x8_t a, float16_t b) {
return vmulq_n_f16(a, b);
}
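
The splat canonicalization that makes the per-lane `insertelement` chains above collapse into a single `insertelement` plus `shufflevector` comes from adding `instcombine` to the opt pipeline. A minimal standalone sketch of the fold (illustrative IR, not taken from the test suite):

define <4 x half> @splat_sketch(half %b) {
  %v0 = insertelement <4 x half> poison, half %b, i64 0
  %v1 = insertelement <4 x half> %v0, half %b, i64 1
  %v2 = insertelement <4 x half> %v1, half %b, i64 2
  %v3 = insertelement <4 x half> %v2, half %b, i64 3
  ret <4 x half> %v3
}
; After `opt -passes=instcombine` the chain becomes the pattern the new
; checks expect:
;   %v0    = insertelement <4 x half> poison, half %b, i64 0
;   %splat = shufflevector <4 x half> %v0, <4 x half> poison, <4 x i32> zeroinitializer
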
diff --git a/clang/test/CodeGen/arm-v8.6a-neon-intrinsics.c b/clang/test/CodeGen/arm-v8.6a-neon-intrinsics.c
index 947f42cdd0de96..baea770e2c3ca3 100644
--- a/clang/test/CodeGen/arm-v8.6a-neon-intrinsics.c
+++ b/clang/test/CodeGen/arm-v8.6a-neon-intrinsics.c
@@ -1,87 +1,101 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
// RUN: %clang_cc1 -triple armv8.6a-arm-none-eabi -target-feature +neon -target-feature +fullfp16 -target-feature +i8mm \
// RUN: -disable-O0-optnone -emit-llvm -o - %s \
-// RUN: | opt -S -passes=mem2reg,sroa \
+// RUN: | opt -S -passes=mem2reg,sroa,instcombine \
// RUN: | FileCheck %s
// REQUIRES: arm-registered-target
#include <arm_neon.h>
-// CHECK-LABEL: test_vmmlaq_s32
-// CHECK: [[VAL:%.*]] = call <4 x i32> @llvm.arm.neon.smmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <4 x i32> [[VAL]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vmmlaq_s32(
+// CHECK-SAME: <4 x i32> noundef [[R:%.*]], <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMMLA1_I:%.*]] = call <4 x i32> @llvm.arm.neon.smmla.v4i32.v16i8(<4 x i32> [[R]], <16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <4 x i32> [[VMMLA1_I]]
+//
int32x4_t test_vmmlaq_s32(int32x4_t r, int8x16_t a, int8x16_t b) {
return vmmlaq_s32(r, a, b);
}
-// CHECK-LABEL: test_vmmlaq_u32
-// CHECK: [[VAL:%.*]] = call <4 x i32> @llvm.arm.neon.ummla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <4 x i32> [[VAL]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vmmlaq_u32(
+// CHECK-SAME: <4 x i32> noundef [[R:%.*]], <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMMLA1_I:%.*]] = call <4 x i32> @llvm.arm.neon.ummla.v4i32.v16i8(<4 x i32> [[R]], <16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <4 x i32> [[VMMLA1_I]]
+//
uint32x4_t test_vmmlaq_u32(uint32x4_t r, uint8x16_t a, uint8x16_t b) {
return vmmlaq_u32(r, a, b);
}
-// CHECK-LABEL: test_vusmmlaq_s32
-// CHECK: [[VAL:%.*]] = call <4 x i32> @llvm.arm.neon.usmmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <4 x i32> [[VAL]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vusmmlaq_s32(
+// CHECK-SAME: <4 x i32> noundef [[R:%.*]], <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VUSMMLA1_I:%.*]] = call <4 x i32> @llvm.arm.neon.usmmla.v4i32.v16i8(<4 x i32> [[R]], <16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <4 x i32> [[VUSMMLA1_I]]
+//
int32x4_t test_vusmmlaq_s32(int32x4_t r, uint8x16_t a, int8x16_t b) {
return vusmmlaq_s32(r, a, b);
}
-// CHECK-LABEL: test_vusdot_s32
-// CHECK: [[VAL:%.*]] = call <2 x i32> @llvm.arm.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <2 x i32> [[VAL]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vusdot_s32(
+// CHECK-SAME: <2 x i32> noundef [[R:%.*]], <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VUSDOT1_I:%.*]] = call <2 x i32> @llvm.arm.neon.usdot.v2i32.v8i8(<2 x i32> [[R]], <8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <2 x i32> [[VUSDOT1_I]]
+//
int32x2_t test_vusdot_s32(int32x2_t r, uint8x8_t a, int8x8_t b) {
return vusdot_s32(r, a, b);
}
-// CHECK-LABEL: test_vusdot_lane_s32
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %b to <2 x i32>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <2 x i32> zeroinitializer
-// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> %r to <8 x i8>
-// CHECK: [[OP:%.*]] = call <2 x i32> @llvm.arm.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> [[TMP3]])
-// CHECK: ret <2 x i32> [[OP]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vusdot_lane_s32(
+// CHECK-SAME: <2 x i32> noundef [[R:%.*]], <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[B]] to <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VUSDOT1_I:%.*]] = call <2 x i32> @llvm.arm.neon.usdot.v2i32.v8i8(<2 x i32> [[R]], <8 x i8> [[A]], <8 x i8> [[TMP1]])
+// CHECK-NEXT: ret <2 x i32> [[VUSDOT1_I]]
+//
int32x2_t test_vusdot_lane_s32(int32x2_t r, uint8x8_t a, int8x8_t b) {
return vusdot_lane_s32(r, a, b, 0);
}
-// CHECK-LABEL: test_vsudot_lane_s32
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %b to <2 x i32>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <2 x i32> zeroinitializer
-// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> %r to <8 x i8>
-// CHECK: [[OP:%.*]] = call <2 x i32> @llvm.arm.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> [[TMP3]], <8 x i8> %a)
-// CHECK: ret <2 x i32> [[OP]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vsudot_lane_s32(
+// CHECK-SAME: <2 x i32> noundef [[R:%.*]], <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[B]] to <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VUSDOT1_I:%.*]] = call <2 x i32> @llvm.arm.neon.usdot.v2i32.v8i8(<2 x i32> [[R]], <8 x i8> [[TMP1]], <8 x i8> [[A]])
+// CHECK-NEXT: ret <2 x i32> [[VUSDOT1_I]]
+//
int32x2_t test_vsudot_lane_s32(int32x2_t r, int8x8_t a, uint8x8_t b) {
return vsudot_lane_s32(r, a, b, 0);
}
-// CHECK-LABEL: test_vusdotq_lane_s32
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %b to <2 x i32>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <4 x i32> zeroinitializer
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8>
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> %r to <16 x i8>
-// CHECK: [[OP:%.*]] = call <4 x i32> @llvm.arm.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> [[TMP4]])
-// CHECK: ret <4 x i32> [[OP]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vusdotq_lane_s32(
+// CHECK-SAME: <4 x i32> noundef [[R:%.*]], <16 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[B]] to <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8>
+// CHECK-NEXT: [[VUSDOT1_I:%.*]] = call <4 x i32> @llvm.arm.neon.usdot.v4i32.v16i8(<4 x i32> [[R]], <16 x i8> [[A]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: ret <4 x i32> [[VUSDOT1_I]]
+//
int32x4_t test_vusdotq_lane_s32(int32x4_t r, uint8x16_t a, int8x8_t b) {
return vusdotq_lane_s32(r, a, b, 0);
}
-// CHECK-LABEL: test_vsudotq_lane_s32
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %b to <2 x i32>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <4 x i32> zeroinitializer
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> %r to <16 x i8>
-// CHECK: [[OP:%.*]] = call <4 x i32> @llvm.arm.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %3, <16 x i8> %a)
-// CHECK: ret <4 x i32> [[OP]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vsudotq_lane_s32(
+// CHECK-SAME: <4 x i32> noundef [[R:%.*]], <16 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[B]] to <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8>
+// CHECK-NEXT: [[VUSDOT1_I:%.*]] = call <4 x i32> @llvm.arm.neon.usdot.v4i32.v16i8(<4 x i32> [[R]], <16 x i8> [[TMP1]], <16 x i8> [[A]])
+// CHECK-NEXT: ret <4 x i32> [[VUSDOT1_I]]
+//
int32x4_t test_vsudotq_lane_s32(int32x4_t r, int8x16_t a, uint8x8_t b) {
return vsudotq_lane_s32(r, a, b, 0);
}
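
The lane tests above lose their round-trip bitcasts for the same reason: a bitcast to <8 x i8> and back is a no-op, and instcombine folds the chain so a single bitcast feeds the lane shuffle. A minimal sketch, with illustrative names:

define <2 x i32> @lane_sketch(<8 x i8> %b) {
  %t0 = bitcast <8 x i8> %b to <2 x i32>
  %t1 = bitcast <2 x i32> %t0 to <8 x i8>
  %t2 = bitcast <8 x i8> %t1 to <2 x i32>
  ; instcombine folds %t2 back to %t0, deleting the middle casts
  %lane = shufflevector <2 x i32> %t2, <2 x i32> poison, <2 x i32> zeroinitializer
  ret <2 x i32> %lane
}
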
diff --git a/clang/test/CodeGen/arm64_vdupq_n_f64.c b/clang/test/CodeGen/arm64_vdupq_n_f64.c
index 2da2d3bc8d0754..202350a805ece2 100644
--- a/clang/test/CodeGen/arm64_vdupq_n_f64.c
+++ b/clang/test/CodeGen/arm64_vdupq_n_f64.c
@@ -1,57 +1,66 @@
-// RUN: %clang_cc1 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -o - -disable-O0-optnone -emit-llvm %s | opt -S -passes=mem2reg | FileCheck %s
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// RUN: %clang_cc1 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -o - -disable-O0-optnone -emit-llvm %s | opt -S -passes=mem2reg,instcombine | FileCheck %s
// REQUIRES: aarch64-registered-target || arm-registered-target
#include <arm_neon.h>
// vdupq_n_f64 -> dup.2d v0, v0[0]
+// CHECK-LABEL: define <2 x double> @test_vdupq_n_f64(
+// CHECK-SAME: double noundef [[W:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x double> poison, double [[W]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x double> [[VECINIT_I]], <2 x double> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: ret <2 x double> [[VECINIT1_I]]
//
-// CHECK-LABEL: define{{.*}} <2 x double> @test_vdupq_n_f64(double noundef %w) #0 {
-// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x double> poison, double %w, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double %w, i32 1
-// CHECK: ret <2 x double> [[VECINIT1_I]]
float64x2_t test_vdupq_n_f64(float64_t w) {
return vdupq_n_f64(w);
}
// might as well test this while we're here
// vdupq_n_f32 -> dup.4s v0, v0[0]
-// CHECK-LABEL: define{{.*}} <4 x float> @test_vdupq_n_f32(float noundef %w) #0 {
-// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float %w, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %w, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %w, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %w, i32 3
-// CHECK: ret <4 x float> [[VECINIT3_I]]
+// CHECK-LABEL: define <4 x float> @test_vdupq_n_f32(
+// CHECK-SAME: float noundef [[W:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float [[W]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x float> [[VECINIT_I]], <4 x float> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: ret <4 x float> [[VECINIT3_I]]
+//
float32x4_t test_vdupq_n_f32(float32_t w) {
return vdupq_n_f32(w);
}
// vdupq_lane_f64 -> dup.2d v0, v0[0]
-// CHECK-LABEL: define{{.*}} <2 x double> @test_vdupq_lane_f64(<1 x double> noundef %V) #0 {
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %V to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
-// CHECK: [[SHUFFLE:%.*]] = shufflevector <1 x double> [[TMP1]], <1 x double> [[TMP1]], <2 x i32> zeroinitializer
-// CHECK: ret <2 x double> [[SHUFFLE]]
+// CHECK-LABEL: define <2 x double> @test_vdupq_lane_f64(
+// CHECK-SAME: <1 x double> noundef [[V:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x double> [[V]], <1 x double> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: ret <2 x double> [[LANE]]
+//
float64x2_t test_vdupq_lane_f64(float64x1_t V) {
return vdupq_lane_f64(V, 0);
}
// vmovq_n_f64 -> dup Vd.2d,X0
-// CHECK-LABEL: define{{.*}} <2 x double> @test_vmovq_n_f64(double noundef %w) #0 {
-// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x double> poison, double %w, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double %w, i32 1
-// CHECK: ret <2 x double> [[VECINIT1_I]]
+// CHECK-LABEL: define <2 x double> @test_vmovq_n_f64(
+// CHECK-SAME: double noundef [[W:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x double> poison, double [[W]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x double> [[VECINIT_I]], <2 x double> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: ret <2 x double> [[VECINIT1_I]]
+//
float64x2_t test_vmovq_n_f64(float64_t w) {
return vmovq_n_f64(w);
}
-// CHECK-LABEL: define{{.*}} <4 x half> @test_vmov_n_f16(ptr noundef %a1) #0 {
-// CHECK: [[TMP0:%.*]] = load half, ptr %a1, align 2
-// CHECK: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[TMP0]], i32 0
-// CHECK: [[VECINIT1:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP0]], i32 1
-// CHECK: [[VECINIT2:%.*]] = insertelement <4 x half> [[VECINIT1]], half [[TMP0]], i32 2
-// CHECK: [[VECINIT3:%.*]] = insertelement <4 x half> [[VECINIT2]], half [[TMP0]], i32 3
-// CHECK: ret <4 x half> [[VECINIT3]]
+// CHECK-LABEL: define <4 x half> @test_vmov_n_f16(
+// CHECK-SAME: ptr noundef [[A1:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load half, ptr [[A1]], align 2
+// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[TMP0]], i64 0
+// CHECK-NEXT: [[VECINIT3:%.*]] = shufflevector <4 x half> [[VECINIT]], <4 x half> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: ret <4 x half> [[VECINIT3]]
+//
float16x4_t test_vmov_n_f16(float16_t *a1) {
return vmov_n_f16(*a1);
}
@@ -62,17 +71,14 @@ float64x1_t test_vmov_n_f64(float64_t a1) {
}
*/
-// CHECK-LABEL: define{{.*}} <8 x half> @test_vmovq_n_f16(ptr noundef %a1) #0 {
-// CHECK: [[TMP0:%.*]] = load half, ptr %a1, align 2
-// CHECK: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[TMP0]], i32 0
-// CHECK: [[VECINIT1:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP0]], i32 1
-// CHECK: [[VECINIT2:%.*]] = insertelement <8 x half> [[VECINIT1]], half [[TMP0]], i32 2
-// CHECK: [[VECINIT3:%.*]] = insertelement <8 x half> [[VECINIT2]], half [[TMP0]], i32 3
-// CHECK: [[VECINIT4:%.*]] = insertelement <8 x half> [[VECINIT3]], half [[TMP0]], i32 4
-// CHECK: [[VECINIT5:%.*]] = insertelement <8 x half> [[VECINIT4]], half [[TMP0]], i32 5
-// CHECK: [[VECINIT6:%.*]] = insertelement <8 x half> [[VECINIT5]], half [[TMP0]], i32 6
-// CHECK: [[VECINIT7:%.*]] = insertelement <8 x half> [[VECINIT6]], half [[TMP0]], i32 7
-// CHECK: ret <8 x half> [[VECINIT7]]
+// CHECK-LABEL: define <8 x half> @test_vmovq_n_f16(
+// CHECK-SAME: ptr noundef [[A1:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load half, ptr [[A1]], align 2
+// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[TMP0]], i64 0
+// CHECK-NEXT: [[VECINIT7:%.*]] = shufflevector <8 x half> [[VECINIT]], <8 x half> poison, <8 x i32> zeroinitializer
+// CHECK-NEXT: ret <8 x half> [[VECINIT7]]
+//
float16x8_t test_vmovq_n_f16(float16_t *a1) {
return vmovq_n_f16(*a1);
}
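
The dead `VABD_V3_I`-style bitcasts removed throughout the next file fall out of the same pipeline change: the bitcast result has no users, so instcombine erases it as dead code. A minimal sketch, assuming a standalone module:

declare <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16>, <4 x i16>)

define <4 x i16> @vabd_sketch(<4 x i16> %a, <4 x i16> %b) {
  %abd  = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %a, <4 x i16> %b)
  %dead = bitcast <4 x i16> %abd to <8 x i8>   ; unused: instcombine deletes it
  ret <4 x i16> %abd
}
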
diff --git a/clang/test/CodeGen/arm_neon_intrinsics.c b/clang/test/CodeGen/arm_neon_intrinsics.c
index 1397e731d3da16..6177e01887c469 100644
--- a/clang/test/CodeGen/arm_neon_intrinsics.c
+++ b/clang/test/CodeGen/arm_neon_intrinsics.c
@@ -4,7 +4,7 @@
// RUN: -target-feature +fullfp16 -ffreestanding \
// RUN: -flax-vector-conversions=none \
// RUN: -disable-O0-optnone -emit-llvm -o - %s \
-// RUN: | opt -S -passes=mem2reg | FileCheck %s
+// RUN: | opt -S -passes=mem2reg,sroa,instcombine | FileCheck %s
// REQUIRES: aarch64-registered-target || arm-registered-target
@@ -24,10 +24,7 @@ int8x8_t test_vaba_s8(int8x8_t a, int8x8_t b, int8x8_t c) {
// CHECK-LABEL: define <4 x i16> @test_vaba_s16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8>
// CHECK-NEXT: [[VABD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> [[B]], <4 x i16> [[C]])
-// CHECK-NEXT: [[VABD_V3_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I]] to <8 x i8>
// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i16> [[A]], [[VABD_V2_I]]
// CHECK-NEXT: ret <4 x i16> [[ADD_I]]
//
@@ -38,10 +35,7 @@ int16x4_t test_vaba_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
// CHECK-LABEL: define <2 x i32> @test_vaba_s32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8>
// CHECK-NEXT: [[VABD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> [[B]], <2 x i32> [[C]])
-// CHECK-NEXT: [[VABD_V3_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I]] to <8 x i8>
// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i32> [[A]], [[VABD_V2_I]]
// CHECK-NEXT: ret <2 x i32> [[ADD_I]]
//
@@ -63,10 +57,7 @@ uint8x8_t test_vaba_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) {
// CHECK-LABEL: define <4 x i16> @test_vaba_u16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8>
// CHECK-NEXT: [[VABD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> [[B]], <4 x i16> [[C]])
-// CHECK-NEXT: [[VABD_V3_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I]] to <8 x i8>
// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i16> [[A]], [[VABD_V2_I]]
// CHECK-NEXT: ret <4 x i16> [[ADD_I]]
//
@@ -77,10 +68,7 @@ uint16x4_t test_vaba_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) {
// CHECK-LABEL: define <2 x i32> @test_vaba_u32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8>
// CHECK-NEXT: [[VABD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> [[B]], <2 x i32> [[C]])
-// CHECK-NEXT: [[VABD_V3_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I]] to <8 x i8>
// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i32> [[A]], [[VABD_V2_I]]
// CHECK-NEXT: ret <2 x i32> [[ADD_I]]
//
@@ -102,10 +90,7 @@ int8x16_t test_vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c) {
// CHECK-LABEL: define <8 x i16> @test_vabaq_s16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <8 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[C]] to <16 x i8>
// CHECK-NEXT: [[VABDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vabds.v8i16(<8 x i16> [[B]], <8 x i16> [[C]])
-// CHECK-NEXT: [[VABDQ_V3_I:%.*]] = bitcast <8 x i16> [[VABDQ_V2_I]] to <16 x i8>
// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[A]], [[VABDQ_V2_I]]
// CHECK-NEXT: ret <8 x i16> [[ADD_I]]
//
@@ -116,10 +101,7 @@ int16x8_t test_vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c) {
// CHECK-LABEL: define <4 x i32> @test_vabaq_s32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[C]] to <16 x i8>
// CHECK-NEXT: [[VABDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vabds.v4i32(<4 x i32> [[B]], <4 x i32> [[C]])
-// CHECK-NEXT: [[VABDQ_V3_I:%.*]] = bitcast <4 x i32> [[VABDQ_V2_I]] to <16 x i8>
// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A]], [[VABDQ_V2_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
//
@@ -141,10 +123,7 @@ uint8x16_t test_vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) {
// CHECK-LABEL: define <8 x i16> @test_vabaq_u16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <8 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[C]] to <16 x i8>
// CHECK-NEXT: [[VABDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vabdu.v8i16(<8 x i16> [[B]], <8 x i16> [[C]])
-// CHECK-NEXT: [[VABDQ_V3_I:%.*]] = bitcast <8 x i16> [[VABDQ_V2_I]] to <16 x i8>
// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[A]], [[VABDQ_V2_I]]
// CHECK-NEXT: ret <8 x i16> [[ADD_I]]
//
@@ -155,10 +134,7 @@ uint16x8_t test_vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) {
// CHECK-LABEL: define <4 x i32> @test_vabaq_u32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[C]] to <16 x i8>
// CHECK-NEXT: [[VABDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vabdu.v4i32(<4 x i32> [[B]], <4 x i32> [[C]])
-// CHECK-NEXT: [[VABDQ_V3_I:%.*]] = bitcast <4 x i32> [[VABDQ_V2_I]] to <16 x i8>
// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A]], [[VABDQ_V2_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
//
@@ -181,11 +157,7 @@ int16x8_t test_vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c) {
// CHECK-LABEL: define <4 x i32> @test_vabal_s16(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8>
// CHECK-NEXT: [[VABD_V2_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> [[B]], <4 x i16> [[C]])
-// CHECK-NEXT: [[VABD_V3_I_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I]] to <8 x i8>
// CHECK-NEXT: [[VMOVL_I:%.*]] = zext <4 x i16> [[VABD_V2_I_I]] to <4 x i32>
// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A]], [[VMOVL_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
@@ -197,11 +169,7 @@ int32x4_t test_vabal_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
// CHECK-LABEL: define <2 x i64> @test_vabal_s32(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8>
// CHECK-NEXT: [[VABD_V2_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> [[B]], <2 x i32> [[C]])
-// CHECK-NEXT: [[VABD_V3_I_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I]] to <8 x i8>
// CHECK-NEXT: [[VMOVL_I:%.*]] = zext <2 x i32> [[VABD_V2_I_I]] to <2 x i64>
// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A]], [[VMOVL_I]]
// CHECK-NEXT: ret <2 x i64> [[ADD_I]]
@@ -225,11 +193,7 @@ uint16x8_t test_vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) {
// CHECK-LABEL: define <4 x i32> @test_vabal_u16(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8>
// CHECK-NEXT: [[VABD_V2_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> [[B]], <4 x i16> [[C]])
-// CHECK-NEXT: [[VABD_V3_I_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I]] to <8 x i8>
// CHECK-NEXT: [[VMOVL_I:%.*]] = zext <4 x i16> [[VABD_V2_I_I]] to <4 x i32>
// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A]], [[VMOVL_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
@@ -241,11 +205,7 @@ uint32x4_t test_vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) {
// CHECK-LABEL: define <2 x i64> @test_vabal_u32(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8>
// CHECK-NEXT: [[VABD_V2_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> [[B]], <2 x i32> [[C]])
-// CHECK-NEXT: [[VABD_V3_I_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I]] to <8 x i8>
// CHECK-NEXT: [[VMOVL_I:%.*]] = zext <2 x i32> [[VABD_V2_I_I]] to <2 x i64>
// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A]], [[VMOVL_I]]
// CHECK-NEXT: ret <2 x i64> [[ADD_I]]
@@ -267,10 +227,7 @@ int8x8_t test_vabd_s8(int8x8_t a, int8x8_t b) {
// CHECK-LABEL: define <4 x i16> @test_vabd_s16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
// CHECK-NEXT: [[VABD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
-// CHECK-NEXT: [[VABD_V3_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <4 x i16> [[VABD_V2_I]]
//
int16x4_t test_vabd_s16(int16x4_t a, int16x4_t b) {
@@ -280,10 +237,7 @@ int16x4_t test_vabd_s16(int16x4_t a, int16x4_t b) {
// CHECK-LABEL: define <2 x i32> @test_vabd_s32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
// CHECK-NEXT: [[VABD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
-// CHECK-NEXT: [[VABD_V3_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <2 x i32> [[VABD_V2_I]]
//
int32x2_t test_vabd_s32(int32x2_t a, int32x2_t b) {
@@ -303,10 +257,7 @@ uint8x8_t test_vabd_u8(uint8x8_t a, uint8x8_t b) {
// CHECK-LABEL: define <4 x i16> @test_vabd_u16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
// CHECK-NEXT: [[VABD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
-// CHECK-NEXT: [[VABD_V3_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <4 x i16> [[VABD_V2_I]]
//
uint16x4_t test_vabd_u16(uint16x4_t a, uint16x4_t b) {
@@ -316,10 +267,7 @@ uint16x4_t test_vabd_u16(uint16x4_t a, uint16x4_t b) {
// CHECK-LABEL: define <2 x i32> @test_vabd_u32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
// CHECK-NEXT: [[VABD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
-// CHECK-NEXT: [[VABD_V3_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <2 x i32> [[VABD_V2_I]]
//
uint32x2_t test_vabd_u32(uint32x2_t a, uint32x2_t b) {
@@ -329,10 +277,7 @@ uint32x2_t test_vabd_u32(uint32x2_t a, uint32x2_t b) {
// CHECK-LABEL: define <2 x float> @test_vabd_f32(
// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <8 x i8>
// CHECK-NEXT: [[VABD_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vabds.v2f32(<2 x float> [[A]], <2 x float> [[B]])
-// CHECK-NEXT: [[VABD_V3_I:%.*]] = bitcast <2 x float> [[VABD_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <2 x float> [[VABD_V2_I]]
//
float32x2_t test_vabd_f32(float32x2_t a, float32x2_t b) {
@@ -352,10 +297,7 @@ int8x16_t test_vabdq_s8(int8x16_t a, int8x16_t b) {
// CHECK-LABEL: define <8 x i16> @test_vabdq_s16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
// CHECK-NEXT: [[VABDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vabds.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
-// CHECK-NEXT: [[VABDQ_V3_I:%.*]] = bitcast <8 x i16> [[VABDQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <8 x i16> [[VABDQ_V2_I]]
//
int16x8_t test_vabdq_s16(int16x8_t a, int16x8_t b) {
@@ -365,10 +307,7 @@ int16x8_t test_vabdq_s16(int16x8_t a, int16x8_t b) {
// CHECK-LABEL: define <4 x i32> @test_vabdq_s32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
// CHECK-NEXT: [[VABDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vabds.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
-// CHECK-NEXT: [[VABDQ_V3_I:%.*]] = bitcast <4 x i32> [[VABDQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VABDQ_V2_I]]
//
int32x4_t test_vabdq_s32(int32x4_t a, int32x4_t b) {
@@ -388,10 +327,7 @@ uint8x16_t test_vabdq_u8(uint8x16_t a, uint8x16_t b) {
// CHECK-LABEL: define <8 x i16> @test_vabdq_u16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
// CHECK-NEXT: [[VABDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vabdu.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
-// CHECK-NEXT: [[VABDQ_V3_I:%.*]] = bitcast <8 x i16> [[VABDQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <8 x i16> [[VABDQ_V2_I]]
//
uint16x8_t test_vabdq_u16(uint16x8_t a, uint16x8_t b) {
@@ -401,10 +337,7 @@ uint16x8_t test_vabdq_u16(uint16x8_t a, uint16x8_t b) {
// CHECK-LABEL: define <4 x i32> @test_vabdq_u32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
// CHECK-NEXT: [[VABDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vabdu.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
-// CHECK-NEXT: [[VABDQ_V3_I:%.*]] = bitcast <4 x i32> [[VABDQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VABDQ_V2_I]]
//
uint32x4_t test_vabdq_u32(uint32x4_t a, uint32x4_t b) {
@@ -414,10 +347,7 @@ uint32x4_t test_vabdq_u32(uint32x4_t a, uint32x4_t b) {
// CHECK-LABEL: define <4 x float> @test_vabdq_f32(
// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B]] to <16 x i8>
// CHECK-NEXT: [[VABDQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vabds.v4f32(<4 x float> [[A]], <4 x float> [[B]])
-// CHECK-NEXT: [[VABDQ_V3_I:%.*]] = bitcast <4 x float> [[VABDQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x float> [[VABDQ_V2_I]]
//
float32x4_t test_vabdq_f32(float32x4_t a, float32x4_t b) {
@@ -438,11 +368,7 @@ int16x8_t test_vabdl_s8(int8x8_t a, int8x8_t b) {
// CHECK-LABEL: define <4 x i32> @test_vabdl_s16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
// CHECK-NEXT: [[VABD_V2_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
-// CHECK-NEXT: [[VABD_V3_I_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I]] to <8 x i8>
// CHECK-NEXT: [[VMOVL_I:%.*]] = zext <4 x i16> [[VABD_V2_I_I]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[VMOVL_I]]
//
@@ -453,11 +379,7 @@ int32x4_t test_vabdl_s16(int16x4_t a, int16x4_t b) {
// CHECK-LABEL: define <2 x i64> @test_vabdl_s32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
// CHECK-NEXT: [[VABD_V2_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
-// CHECK-NEXT: [[VABD_V3_I_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I]] to <8 x i8>
// CHECK-NEXT: [[VMOVL_I:%.*]] = zext <2 x i32> [[VABD_V2_I_I]] to <2 x i64>
// CHECK-NEXT: ret <2 x i64> [[VMOVL_I]]
//
@@ -479,11 +401,7 @@ uint16x8_t test_vabdl_u8(uint8x8_t a, uint8x8_t b) {
// CHECK-LABEL: define <4 x i32> @test_vabdl_u16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
// CHECK-NEXT: [[VABD_V2_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
-// CHECK-NEXT: [[VABD_V3_I_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I]] to <8 x i8>
// CHECK-NEXT: [[VMOVL_I:%.*]] = zext <4 x i16> [[VABD_V2_I_I]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[VMOVL_I]]
//
@@ -494,11 +412,7 @@ uint32x4_t test_vabdl_u16(uint16x4_t a, uint16x4_t b) {
// CHECK-LABEL: define <2 x i64> @test_vabdl_u32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
// CHECK-NEXT: [[VABD_V2_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
-// CHECK-NEXT: [[VABD_V3_I_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I]] to <8 x i8>
// CHECK-NEXT: [[VMOVL_I:%.*]] = zext <2 x i32> [[VABD_V2_I_I]] to <2 x i64>
// CHECK-NEXT: ret <2 x i64> [[VMOVL_I]]
//
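
The `[[TMP*]]` lines dropped throughout these hunks were dead round-trip bitcasts: the front end used to funnel every vector operand through `<8 x i8>`/`<16 x i8>` before calling the Neon intrinsic, and the regenerated checks show the intrinsic taking the typed operands directly. A minimal IR sketch of the simplified shape (function name is illustrative, not from the patch):

    define <2 x i32> @abd_sketch(<2 x i32> %a, <2 x i32> %b) {
      ; previously: %t0/%t1 = bitcast <2 x i32> ... to <8 x i8>  (results unused, dead)
      %d = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %a, <2 x i32> %b)
      ret <2 x i32> %d
    }
    declare <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32>, <2 x i32>)
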
@@ -519,7 +433,6 @@ int8x8_t test_vabs_s8(int8x8_t a) {
// CHECK-LABEL: define <4 x i16> @test_vabs_s16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
// CHECK-NEXT: [[VABS1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabs.v4i16(<4 x i16> [[A]])
// CHECK-NEXT: ret <4 x i16> [[VABS1_I]]
//
@@ -530,7 +443,6 @@ int16x4_t test_vabs_s16(int16x4_t a) {
// CHECK-LABEL: define <2 x i32> @test_vabs_s32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
// CHECK-NEXT: [[VABS1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabs.v2i32(<2 x i32> [[A]])
// CHECK-NEXT: ret <2 x i32> [[VABS1_I]]
//
@@ -541,7 +453,6 @@ int32x2_t test_vabs_s32(int32x2_t a) {
// CHECK-LABEL: define <2 x float> @test_vabs_f32(
// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
// CHECK-NEXT: [[VABS1_I:%.*]] = call <2 x float> @llvm.fabs.v2f32(<2 x float> [[A]])
// CHECK-NEXT: ret <2 x float> [[VABS1_I]]
//
@@ -562,7 +473,6 @@ int8x16_t test_vabsq_s8(int8x16_t a) {
// CHECK-LABEL: define <8 x i16> @test_vabsq_s16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
// CHECK-NEXT: [[VABS1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vabs.v8i16(<8 x i16> [[A]])
// CHECK-NEXT: ret <8 x i16> [[VABS1_I]]
//
@@ -573,7 +483,6 @@ int16x8_t test_vabsq_s16(int16x8_t a) {
// CHECK-LABEL: define <4 x i32> @test_vabsq_s32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
// CHECK-NEXT: [[VABS1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vabs.v4i32(<4 x i32> [[A]])
// CHECK-NEXT: ret <4 x i32> [[VABS1_I]]
//
@@ -584,7 +493,6 @@ int32x4_t test_vabsq_s32(int32x4_t a) {
// CHECK-LABEL: define <4 x float> @test_vabsq_f32(
// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
// CHECK-NEXT: [[VABS1_I:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[A]])
// CHECK-NEXT: ret <4 x float> [[VABS1_I]]
//
@@ -775,11 +683,9 @@ uint64x2_t test_vaddq_u64(uint64x2_t a, uint64x2_t b) {
// CHECK-LABEL: define <8 x i8> @test_vaddhn_s16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
// CHECK-NEXT: [[VADDHN_I:%.*]] = add <8 x i16> [[A]], [[B]]
// CHECK-NEXT: [[VADDHN1_I:%.*]] = lshr <8 x i16> [[VADDHN_I]], splat (i16 8)
-// CHECK-NEXT: [[VADDHN2_I:%.*]] = trunc <8 x i16> [[VADDHN1_I]] to <8 x i8>
+// CHECK-NEXT: [[VADDHN2_I:%.*]] = trunc nuw <8 x i16> [[VADDHN1_I]] to <8 x i8>
// CHECK-NEXT: ret <8 x i8> [[VADDHN2_I]]
//
int8x8_t test_vaddhn_s16(int16x8_t a, int16x8_t b) {
@@ -789,11 +695,9 @@ int8x8_t test_vaddhn_s16(int16x8_t a, int16x8_t b) {
// CHECK-LABEL: define <4 x i16> @test_vaddhn_s32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
// CHECK-NEXT: [[VADDHN_I:%.*]] = add <4 x i32> [[A]], [[B]]
// CHECK-NEXT: [[VADDHN1_I:%.*]] = lshr <4 x i32> [[VADDHN_I]], splat (i32 16)
-// CHECK-NEXT: [[VADDHN2_I:%.*]] = trunc <4 x i32> [[VADDHN1_I]] to <4 x i16>
+// CHECK-NEXT: [[VADDHN2_I:%.*]] = trunc nuw <4 x i32> [[VADDHN1_I]] to <4 x i16>
// CHECK-NEXT: ret <4 x i16> [[VADDHN2_I]]
//
int16x4_t test_vaddhn_s32(int32x4_t a, int32x4_t b) {
@@ -803,11 +707,9 @@ int16x4_t test_vaddhn_s32(int32x4_t a, int32x4_t b) {
// CHECK-LABEL: define <2 x i32> @test_vaddhn_s64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
// CHECK-NEXT: [[VADDHN_I:%.*]] = add <2 x i64> [[A]], [[B]]
// CHECK-NEXT: [[VADDHN1_I:%.*]] = lshr <2 x i64> [[VADDHN_I]], splat (i64 32)
-// CHECK-NEXT: [[VADDHN2_I:%.*]] = trunc <2 x i64> [[VADDHN1_I]] to <2 x i32>
+// CHECK-NEXT: [[VADDHN2_I:%.*]] = trunc nuw <2 x i64> [[VADDHN1_I]] to <2 x i32>
// CHECK-NEXT: ret <2 x i32> [[VADDHN2_I]]
//
int32x2_t test_vaddhn_s64(int64x2_t a, int64x2_t b) {
@@ -817,11 +719,9 @@ int32x2_t test_vaddhn_s64(int64x2_t a, int64x2_t b) {
// CHECK-LABEL: define <8 x i8> @test_vaddhn_u16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
// CHECK-NEXT: [[VADDHN_I:%.*]] = add <8 x i16> [[A]], [[B]]
// CHECK-NEXT: [[VADDHN1_I:%.*]] = lshr <8 x i16> [[VADDHN_I]], splat (i16 8)
-// CHECK-NEXT: [[VADDHN2_I:%.*]] = trunc <8 x i16> [[VADDHN1_I]] to <8 x i8>
+// CHECK-NEXT: [[VADDHN2_I:%.*]] = trunc nuw <8 x i16> [[VADDHN1_I]] to <8 x i8>
// CHECK-NEXT: ret <8 x i8> [[VADDHN2_I]]
//
uint8x8_t test_vaddhn_u16(uint16x8_t a, uint16x8_t b) {
@@ -831,11 +731,9 @@ uint8x8_t test_vaddhn_u16(uint16x8_t a, uint16x8_t b) {
// CHECK-LABEL: define <4 x i16> @test_vaddhn_u32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
// CHECK-NEXT: [[VADDHN_I:%.*]] = add <4 x i32> [[A]], [[B]]
// CHECK-NEXT: [[VADDHN1_I:%.*]] = lshr <4 x i32> [[VADDHN_I]], splat (i32 16)
-// CHECK-NEXT: [[VADDHN2_I:%.*]] = trunc <4 x i32> [[VADDHN1_I]] to <4 x i16>
+// CHECK-NEXT: [[VADDHN2_I:%.*]] = trunc nuw <4 x i32> [[VADDHN1_I]] to <4 x i16>
// CHECK-NEXT: ret <4 x i16> [[VADDHN2_I]]
//
uint16x4_t test_vaddhn_u32(uint32x4_t a, uint32x4_t b) {
@@ -845,11 +743,9 @@ uint16x4_t test_vaddhn_u32(uint32x4_t a, uint32x4_t b) {
// CHECK-LABEL: define <2 x i32> @test_vaddhn_u64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
// CHECK-NEXT: [[VADDHN_I:%.*]] = add <2 x i64> [[A]], [[B]]
// CHECK-NEXT: [[VADDHN1_I:%.*]] = lshr <2 x i64> [[VADDHN_I]], splat (i64 32)
-// CHECK-NEXT: [[VADDHN2_I:%.*]] = trunc <2 x i64> [[VADDHN1_I]] to <2 x i32>
+// CHECK-NEXT: [[VADDHN2_I:%.*]] = trunc nuw <2 x i64> [[VADDHN1_I]] to <2 x i32>
// CHECK-NEXT: ret <2 x i32> [[VADDHN2_I]]
//
uint32x2_t test_vaddhn_u64(uint64x2_t a, uint64x2_t b) {
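
The new `trunc nuw` on the vaddhn checks is justified by the preceding `lshr` by half the element width: after shifting an i16 right by 8 (or i32 by 16, i64 by 32), the bits the truncation discards are already zero, so no set bits are lost. A minimal sketch, with an illustrative function name:

    define <8 x i8> @addhn_sketch(<8 x i16> %a, <8 x i16> %b) {
      %sum = add <8 x i16> %a, %b
      %hi  = lshr <8 x i16> %sum, splat (i16 8)    ; top 8 bits now zero
      %res = trunc nuw <8 x i16> %hi to <8 x i8>   ; nuw: truncation drops no set bits
      ret <8 x i8> %res
    }
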
@@ -861,7 +757,7 @@ uint32x2_t test_vaddhn_u64(uint64x2_t a, uint64x2_t b) {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[VMOVL_I5:%.*]] = sext <8 x i8> [[A]] to <8 x i16>
// CHECK-NEXT: [[VMOVL_I:%.*]] = sext <8 x i8> [[B]] to <8 x i16>
-// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[VMOVL_I5]], [[VMOVL_I]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add nsw <8 x i16> [[VMOVL_I5]], [[VMOVL_I]]
// CHECK-NEXT: ret <8 x i16> [[ADD_I]]
//
int16x8_t test_vaddl_s8(int8x8_t a, int8x8_t b) {
@@ -871,11 +767,9 @@ int16x8_t test_vaddl_s8(int8x8_t a, int8x8_t b) {
// CHECK-LABEL: define <4 x i32> @test_vaddl_s16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
// CHECK-NEXT: [[VMOVL_I5:%.*]] = sext <4 x i16> [[A]] to <4 x i32>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
// CHECK-NEXT: [[VMOVL_I:%.*]] = sext <4 x i16> [[B]] to <4 x i32>
-// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[VMOVL_I5]], [[VMOVL_I]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add nsw <4 x i32> [[VMOVL_I5]], [[VMOVL_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
//
int32x4_t test_vaddl_s16(int16x4_t a, int16x4_t b) {
@@ -885,11 +779,9 @@ int32x4_t test_vaddl_s16(int16x4_t a, int16x4_t b) {
// CHECK-LABEL: define <2 x i64> @test_vaddl_s32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
// CHECK-NEXT: [[VMOVL_I5:%.*]] = sext <2 x i32> [[A]] to <2 x i64>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
// CHECK-NEXT: [[VMOVL_I:%.*]] = sext <2 x i32> [[B]] to <2 x i64>
-// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[VMOVL_I5]], [[VMOVL_I]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add nsw <2 x i64> [[VMOVL_I5]], [[VMOVL_I]]
// CHECK-NEXT: ret <2 x i64> [[ADD_I]]
//
int64x2_t test_vaddl_s32(int32x2_t a, int32x2_t b) {
@@ -901,7 +793,7 @@ int64x2_t test_vaddl_s32(int32x2_t a, int32x2_t b) {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[VMOVL_I5:%.*]] = zext <8 x i8> [[A]] to <8 x i16>
// CHECK-NEXT: [[VMOVL_I:%.*]] = zext <8 x i8> [[B]] to <8 x i16>
-// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[VMOVL_I5]], [[VMOVL_I]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add nuw nsw <8 x i16> [[VMOVL_I5]], [[VMOVL_I]]
// CHECK-NEXT: ret <8 x i16> [[ADD_I]]
//
uint16x8_t test_vaddl_u8(uint8x8_t a, uint8x8_t b) {
@@ -911,11 +803,9 @@ uint16x8_t test_vaddl_u8(uint8x8_t a, uint8x8_t b) {
// CHECK-LABEL: define <4 x i32> @test_vaddl_u16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
// CHECK-NEXT: [[VMOVL_I5:%.*]] = zext <4 x i16> [[A]] to <4 x i32>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
// CHECK-NEXT: [[VMOVL_I:%.*]] = zext <4 x i16> [[B]] to <4 x i32>
-// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[VMOVL_I5]], [[VMOVL_I]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add nuw nsw <4 x i32> [[VMOVL_I5]], [[VMOVL_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
//
uint32x4_t test_vaddl_u16(uint16x4_t a, uint16x4_t b) {
@@ -925,11 +815,9 @@ uint32x4_t test_vaddl_u16(uint16x4_t a, uint16x4_t b) {
// CHECK-LABEL: define <2 x i64> @test_vaddl_u32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
// CHECK-NEXT: [[VMOVL_I5:%.*]] = zext <2 x i32> [[A]] to <2 x i64>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
// CHECK-NEXT: [[VMOVL_I:%.*]] = zext <2 x i32> [[B]] to <2 x i64>
-// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[VMOVL_I5]], [[VMOVL_I]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add nuw nsw <2 x i64> [[VMOVL_I5]], [[VMOVL_I]]
// CHECK-NEXT: ret <2 x i64> [[ADD_I]]
//
uint64x2_t test_vaddl_u32(uint32x2_t a, uint32x2_t b) {
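
The flags on the widening adds follow from the extensions feeding them: a `sext` from i8 keeps each lane in [-128, 127], so the i16 sum stays in [-256, 254] and cannot signed-wrap (`nsw`); a `zext` keeps lanes in [0, 255], so the sum stays in [0, 510] and can wrap neither way (`nuw nsw`). A minimal sketch, with an illustrative name:

    define <8 x i16> @addl_u8_sketch(<8 x i8> %a, <8 x i8> %b) {
      %wa = zext <8 x i8> %a to <8 x i16>          ; each lane in [0, 255]
      %wb = zext <8 x i8> %b to <8 x i16>
      %s  = add nuw nsw <8 x i16> %wa, %wb         ; sum at most 510, no wrap
      ret <8 x i16> %s
    }
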
@@ -950,7 +838,6 @@ int16x8_t test_vaddw_s8(int16x8_t a, int8x8_t b) {
// CHECK-LABEL: define <4 x i32> @test_vaddw_s16(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
// CHECK-NEXT: [[VMOVL_I:%.*]] = sext <4 x i16> [[B]] to <4 x i32>
// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A]], [[VMOVL_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
@@ -962,7 +849,6 @@ int32x4_t test_vaddw_s16(int32x4_t a, int16x4_t b) {
// CHECK-LABEL: define <2 x i64> @test_vaddw_s32(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
// CHECK-NEXT: [[VMOVL_I:%.*]] = sext <2 x i32> [[B]] to <2 x i64>
// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A]], [[VMOVL_I]]
// CHECK-NEXT: ret <2 x i64> [[ADD_I]]
@@ -985,7 +871,6 @@ uint16x8_t test_vaddw_u8(uint16x8_t a, uint8x8_t b) {
// CHECK-LABEL: define <4 x i32> @test_vaddw_u16(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
// CHECK-NEXT: [[VMOVL_I:%.*]] = zext <4 x i16> [[B]] to <4 x i32>
// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A]], [[VMOVL_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
@@ -997,7 +882,6 @@ uint32x4_t test_vaddw_u16(uint32x4_t a, uint16x4_t b) {
// CHECK-LABEL: define <2 x i64> @test_vaddw_u32(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
// CHECK-NEXT: [[VMOVL_I:%.*]] = zext <2 x i32> [[B]] to <2 x i64>
// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A]], [[VMOVL_I]]
// CHECK-NEXT: ret <2 x i64> [[ADD_I]]
@@ -1629,8 +1513,6 @@ poly16x8_t test_vbslq_p16(uint16x8_t a, poly16x8_t b, poly16x8_t c) {
// CHECK-LABEL: define <2 x i32> @test_vcage_f32(
// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <8 x i8>
// CHECK-NEXT: [[VCAGE_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vacge.v2i32.v2f32(<2 x float> [[A]], <2 x float> [[B]])
// CHECK-NEXT: ret <2 x i32> [[VCAGE_V2_I]]
//
@@ -1641,8 +1523,6 @@ uint32x2_t test_vcage_f32(float32x2_t a, float32x2_t b) {
// CHECK-LABEL: define <4 x i32> @test_vcageq_f32(
// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B]] to <16 x i8>
// CHECK-NEXT: [[VCAGEQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vacge.v4i32.v4f32(<4 x float> [[A]], <4 x float> [[B]])
// CHECK-NEXT: ret <4 x i32> [[VCAGEQ_V2_I]]
//
@@ -1653,8 +1533,6 @@ uint32x4_t test_vcageq_f32(float32x4_t a, float32x4_t b) {
// CHECK-LABEL: define <2 x i32> @test_vcagt_f32(
// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <8 x i8>
// CHECK-NEXT: [[VCAGT_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vacgt.v2i32.v2f32(<2 x float> [[A]], <2 x float> [[B]])
// CHECK-NEXT: ret <2 x i32> [[VCAGT_V2_I]]
//
@@ -1665,8 +1543,6 @@ uint32x2_t test_vcagt_f32(float32x2_t a, float32x2_t b) {
// CHECK-LABEL: define <4 x i32> @test_vcagtq_f32(
// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B]] to <16 x i8>
// CHECK-NEXT: [[VCAGTQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vacgt.v4i32.v4f32(<4 x float> [[A]], <4 x float> [[B]])
// CHECK-NEXT: ret <4 x i32> [[VCAGTQ_V2_I]]
//
@@ -1677,8 +1553,6 @@ uint32x4_t test_vcagtq_f32(float32x4_t a, float32x4_t b) {
// CHECK-LABEL: define <2 x i32> @test_vcale_f32(
// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <8 x i8>
// CHECK-NEXT: [[VCALE_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vacge.v2i32.v2f32(<2 x float> [[B]], <2 x float> [[A]])
// CHECK-NEXT: ret <2 x i32> [[VCALE_V2_I]]
//
@@ -1689,8 +1563,6 @@ uint32x2_t test_vcale_f32(float32x2_t a, float32x2_t b) {
// CHECK-LABEL: define <4 x i32> @test_vcaleq_f32(
// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B]] to <16 x i8>
// CHECK-NEXT: [[VCALEQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vacge.v4i32.v4f32(<4 x float> [[B]], <4 x float> [[A]])
// CHECK-NEXT: ret <4 x i32> [[VCALEQ_V2_I]]
//
@@ -1701,8 +1573,6 @@ uint32x4_t test_vcaleq_f32(float32x4_t a, float32x4_t b) {
// CHECK-LABEL: define <2 x i32> @test_vcalt_f32(
// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <8 x i8>
// CHECK-NEXT: [[VCALT_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vacgt.v2i32.v2f32(<2 x float> [[B]], <2 x float> [[A]])
// CHECK-NEXT: ret <2 x i32> [[VCALT_V2_I]]
//
@@ -1713,8 +1583,6 @@ uint32x2_t test_vcalt_f32(float32x2_t a, float32x2_t b) {
// CHECK-LABEL: define <4 x i32> @test_vcaltq_f32(
// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B]] to <16 x i8>
// CHECK-NEXT: [[VCALTQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vacgt.v4i32.v4f32(<4 x float> [[B]], <4 x float> [[A]])
// CHECK-NEXT: ret <4 x i32> [[VCALTQ_V2_I]]
//
@@ -2373,9 +2241,7 @@ int8x8_t test_vcls_s8(int8x8_t a) {
// CHECK-LABEL: define <4 x i16> @test_vcls_s16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
// CHECK-NEXT: [[VCLS_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16> [[A]])
-// CHECK-NEXT: [[VCLS_V2_I:%.*]] = bitcast <4 x i16> [[VCLS_V1_I]] to <8 x i8>
// CHECK-NEXT: ret <4 x i16> [[VCLS_V1_I]]
//
int16x4_t test_vcls_s16(int16x4_t a) {
@@ -2385,9 +2251,7 @@ int16x4_t test_vcls_s16(int16x4_t a) {
// CHECK-LABEL: define <2 x i32> @test_vcls_s32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
// CHECK-NEXT: [[VCLS_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32> [[A]])
-// CHECK-NEXT: [[VCLS_V2_I:%.*]] = bitcast <2 x i32> [[VCLS_V1_I]] to <8 x i8>
// CHECK-NEXT: ret <2 x i32> [[VCLS_V1_I]]
//
int32x2_t test_vcls_s32(int32x2_t a) {
@@ -2407,9 +2271,7 @@ int8x8_t test_vcls_u8(uint8x8_t a) {
// CHECK-LABEL: define <4 x i16> @test_vcls_u16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
// CHECK-NEXT: [[VCLS_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16> [[A]])
-// CHECK-NEXT: [[VCLS_V2_I:%.*]] = bitcast <4 x i16> [[VCLS_V1_I]] to <8 x i8>
// CHECK-NEXT: ret <4 x i16> [[VCLS_V1_I]]
//
int16x4_t test_vcls_u16(uint16x4_t a) {
@@ -2419,9 +2281,7 @@ int16x4_t test_vcls_u16(uint16x4_t a) {
// CHECK-LABEL: define <2 x i32> @test_vcls_u32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
// CHECK-NEXT: [[VCLS_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32> [[A]])
-// CHECK-NEXT: [[VCLS_V2_I:%.*]] = bitcast <2 x i32> [[VCLS_V1_I]] to <8 x i8>
// CHECK-NEXT: ret <2 x i32> [[VCLS_V1_I]]
//
int32x2_t test_vcls_u32(uint32x2_t a) {
@@ -2441,9 +2301,7 @@ int8x16_t test_vclsq_s8(int8x16_t a) {
// CHECK-LABEL: define <8 x i16> @test_vclsq_s16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
// CHECK-NEXT: [[VCLSQ_V1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16> [[A]])
-// CHECK-NEXT: [[VCLSQ_V2_I:%.*]] = bitcast <8 x i16> [[VCLSQ_V1_I]] to <16 x i8>
// CHECK-NEXT: ret <8 x i16> [[VCLSQ_V1_I]]
//
int16x8_t test_vclsq_s16(int16x8_t a) {
@@ -2453,9 +2311,7 @@ int16x8_t test_vclsq_s16(int16x8_t a) {
// CHECK-LABEL: define <4 x i32> @test_vclsq_s32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
// CHECK-NEXT: [[VCLSQ_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32> [[A]])
-// CHECK-NEXT: [[VCLSQ_V2_I:%.*]] = bitcast <4 x i32> [[VCLSQ_V1_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VCLSQ_V1_I]]
//
int32x4_t test_vclsq_s32(int32x4_t a) {
@@ -2475,9 +2331,7 @@ int8x16_t test_vclsq_u8(uint8x16_t a) {
// CHECK-LABEL: define <8 x i16> @test_vclsq_u16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
// CHECK-NEXT: [[VCLSQ_V1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16> [[A]])
-// CHECK-NEXT: [[VCLSQ_V2_I:%.*]] = bitcast <8 x i16> [[VCLSQ_V1_I]] to <16 x i8>
// CHECK-NEXT: ret <8 x i16> [[VCLSQ_V1_I]]
//
int16x8_t test_vclsq_u16(uint16x8_t a) {
@@ -2487,9 +2341,7 @@ int16x8_t test_vclsq_u16(uint16x8_t a) {
// CHECK-LABEL: define <4 x i32> @test_vclsq_u32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
// CHECK-NEXT: [[VCLSQ_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32> [[A]])
-// CHECK-NEXT: [[VCLSQ_V2_I:%.*]] = bitcast <4 x i32> [[VCLSQ_V1_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VCLSQ_V1_I]]
//
int32x4_t test_vclsq_u32(uint32x4_t a) {
@@ -2653,7 +2505,7 @@ uint32x4_t test_vcltq_u32(uint32x4_t a, uint32x4_t b) {
// CHECK-LABEL: define <8 x i8> @test_vclz_s8(
// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VCLZ_V_I:%.*]] = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> [[A]], i1 false)
+// CHECK-NEXT: [[VCLZ_V_I:%.*]] = call range(i8 0, 9) <8 x i8> @llvm.ctlz.v8i8(<8 x i8> [[A]], i1 false)
// CHECK-NEXT: ret <8 x i8> [[VCLZ_V_I]]
//
int8x8_t test_vclz_s8(int8x8_t a) {
@@ -2663,9 +2515,7 @@ int8x8_t test_vclz_s8(int8x8_t a) {
// CHECK-LABEL: define <4 x i16> @test_vclz_s16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> [[A]], i1 false)
-// CHECK-NEXT: [[VCLZ_V2_I:%.*]] = bitcast <4 x i16> [[VCLZ_V1_I]] to <8 x i8>
+// CHECK-NEXT: [[VCLZ_V1_I:%.*]] = call range(i16 0, 17) <4 x i16> @llvm.ctlz.v4i16(<4 x i16> [[A]], i1 false)
// CHECK-NEXT: ret <4 x i16> [[VCLZ_V1_I]]
//
int16x4_t test_vclz_s16(int16x4_t a) {
@@ -2675,9 +2525,7 @@ int16x4_t test_vclz_s16(int16x4_t a) {
// CHECK-LABEL: define <2 x i32> @test_vclz_s32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT: [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[A]], i1 false)
-// CHECK-NEXT: [[VCLZ_V2_I:%.*]] = bitcast <2 x i32> [[VCLZ_V1_I]] to <8 x i8>
+// CHECK-NEXT: [[VCLZ_V1_I:%.*]] = call range(i32 0, 33) <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[A]], i1 false)
// CHECK-NEXT: ret <2 x i32> [[VCLZ_V1_I]]
//
int32x2_t test_vclz_s32(int32x2_t a) {
@@ -2687,7 +2535,7 @@ int32x2_t test_vclz_s32(int32x2_t a) {
// CHECK-LABEL: define <8 x i8> @test_vclz_u8(
// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VCLZ_V_I:%.*]] = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> [[A]], i1 false)
+// CHECK-NEXT: [[VCLZ_V_I:%.*]] = call range(i8 0, 9) <8 x i8> @llvm.ctlz.v8i8(<8 x i8> [[A]], i1 false)
// CHECK-NEXT: ret <8 x i8> [[VCLZ_V_I]]
//
uint8x8_t test_vclz_u8(uint8x8_t a) {
@@ -2697,9 +2545,7 @@ uint8x8_t test_vclz_u8(uint8x8_t a) {
// CHECK-LABEL: define <4 x i16> @test_vclz_u16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> [[A]], i1 false)
-// CHECK-NEXT: [[VCLZ_V2_I:%.*]] = bitcast <4 x i16> [[VCLZ_V1_I]] to <8 x i8>
+// CHECK-NEXT: [[VCLZ_V1_I:%.*]] = call range(i16 0, 17) <4 x i16> @llvm.ctlz.v4i16(<4 x i16> [[A]], i1 false)
// CHECK-NEXT: ret <4 x i16> [[VCLZ_V1_I]]
//
uint16x4_t test_vclz_u16(uint16x4_t a) {
@@ -2709,9 +2555,7 @@ uint16x4_t test_vclz_u16(uint16x4_t a) {
// CHECK-LABEL: define <2 x i32> @test_vclz_u32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT: [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[A]], i1 false)
-// CHECK-NEXT: [[VCLZ_V2_I:%.*]] = bitcast <2 x i32> [[VCLZ_V1_I]] to <8 x i8>
+// CHECK-NEXT: [[VCLZ_V1_I:%.*]] = call range(i32 0, 33) <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[A]], i1 false)
// CHECK-NEXT: ret <2 x i32> [[VCLZ_V1_I]]
//
uint32x2_t test_vclz_u32(uint32x2_t a) {
@@ -2721,7 +2565,7 @@ uint32x2_t test_vclz_u32(uint32x2_t a) {
// CHECK-LABEL: define <16 x i8> @test_vclzq_s8(
// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VCLZQ_V_I:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> [[A]], i1 false)
+// CHECK-NEXT: [[VCLZQ_V_I:%.*]] = call range(i8 0, 9) <16 x i8> @llvm.ctlz.v16i8(<16 x i8> [[A]], i1 false)
// CHECK-NEXT: ret <16 x i8> [[VCLZQ_V_I]]
//
int8x16_t test_vclzq_s8(int8x16_t a) {
@@ -2731,9 +2575,7 @@ int8x16_t test_vclzq_s8(int8x16_t a) {
// CHECK-LABEL: define <8 x i16> @test_vclzq_s16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VCLZQ_V1_I:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[A]], i1 false)
-// CHECK-NEXT: [[VCLZQ_V2_I:%.*]] = bitcast <8 x i16> [[VCLZQ_V1_I]] to <16 x i8>
+// CHECK-NEXT: [[VCLZQ_V1_I:%.*]] = call range(i16 0, 17) <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[A]], i1 false)
// CHECK-NEXT: ret <8 x i16> [[VCLZQ_V1_I]]
//
int16x8_t test_vclzq_s16(int16x8_t a) {
@@ -2743,9 +2585,7 @@ int16x8_t test_vclzq_s16(int16x8_t a) {
// CHECK-LABEL: define <4 x i32> @test_vclzq_s32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VCLZQ_V1_I:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[A]], i1 false)
-// CHECK-NEXT: [[VCLZQ_V2_I:%.*]] = bitcast <4 x i32> [[VCLZQ_V1_I]] to <16 x i8>
+// CHECK-NEXT: [[VCLZQ_V1_I:%.*]] = call range(i32 0, 33) <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[A]], i1 false)
// CHECK-NEXT: ret <4 x i32> [[VCLZQ_V1_I]]
//
int32x4_t test_vclzq_s32(int32x4_t a) {
@@ -2755,7 +2595,7 @@ int32x4_t test_vclzq_s32(int32x4_t a) {
// CHECK-LABEL: define <16 x i8> @test_vclzq_u8(
// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VCLZQ_V_I:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> [[A]], i1 false)
+// CHECK-NEXT: [[VCLZQ_V_I:%.*]] = call range(i8 0, 9) <16 x i8> @llvm.ctlz.v16i8(<16 x i8> [[A]], i1 false)
// CHECK-NEXT: ret <16 x i8> [[VCLZQ_V_I]]
//
uint8x16_t test_vclzq_u8(uint8x16_t a) {
@@ -2765,9 +2605,7 @@ uint8x16_t test_vclzq_u8(uint8x16_t a) {
// CHECK-LABEL: define <8 x i16> @test_vclzq_u16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VCLZQ_V1_I:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[A]], i1 false)
-// CHECK-NEXT: [[VCLZQ_V2_I:%.*]] = bitcast <8 x i16> [[VCLZQ_V1_I]] to <16 x i8>
+// CHECK-NEXT: [[VCLZQ_V1_I:%.*]] = call range(i16 0, 17) <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[A]], i1 false)
// CHECK-NEXT: ret <8 x i16> [[VCLZQ_V1_I]]
//
uint16x8_t test_vclzq_u16(uint16x8_t a) {
@@ -2777,9 +2615,7 @@ uint16x8_t test_vclzq_u16(uint16x8_t a) {
// CHECK-LABEL: define <4 x i32> @test_vclzq_u32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VCLZQ_V1_I:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[A]], i1 false)
-// CHECK-NEXT: [[VCLZQ_V2_I:%.*]] = bitcast <4 x i32> [[VCLZQ_V1_I]] to <16 x i8>
+// CHECK-NEXT: [[VCLZQ_V1_I:%.*]] = call range(i32 0, 33) <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[A]], i1 false)
// CHECK-NEXT: ret <4 x i32> [[VCLZQ_V1_I]]
//
uint32x4_t test_vclzq_u32(uint32x4_t a) {
@@ -2789,7 +2625,7 @@ uint32x4_t test_vclzq_u32(uint32x4_t a) {
// CHECK-LABEL: define <8 x i8> @test_vcnt_u8(
// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> [[A]])
+// CHECK-NEXT: [[VCNT_V_I:%.*]] = call range(i8 0, 9) <8 x i8> @llvm.ctpop.v8i8(<8 x i8> [[A]])
// CHECK-NEXT: ret <8 x i8> [[VCNT_V_I]]
//
uint8x8_t test_vcnt_u8(uint8x8_t a) {
@@ -2799,7 +2635,7 @@ uint8x8_t test_vcnt_u8(uint8x8_t a) {
// CHECK-LABEL: define <8 x i8> @test_vcnt_s8(
// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> [[A]])
+// CHECK-NEXT: [[VCNT_V_I:%.*]] = call range(i8 0, 9) <8 x i8> @llvm.ctpop.v8i8(<8 x i8> [[A]])
// CHECK-NEXT: ret <8 x i8> [[VCNT_V_I]]
//
int8x8_t test_vcnt_s8(int8x8_t a) {
@@ -2809,7 +2645,7 @@ int8x8_t test_vcnt_s8(int8x8_t a) {
// CHECK-LABEL: define <8 x i8> @test_vcnt_p8(
// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> [[A]])
+// CHECK-NEXT: [[VCNT_V_I:%.*]] = call range(i8 0, 9) <8 x i8> @llvm.ctpop.v8i8(<8 x i8> [[A]])
// CHECK-NEXT: ret <8 x i8> [[VCNT_V_I]]
//
poly8x8_t test_vcnt_p8(poly8x8_t a) {
@@ -2819,7 +2655,7 @@ poly8x8_t test_vcnt_p8(poly8x8_t a) {
// CHECK-LABEL: define <16 x i8> @test_vcntq_u8(
// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VCNTQ_V_I:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> [[A]])
+// CHECK-NEXT: [[VCNTQ_V_I:%.*]] = call range(i8 0, 9) <16 x i8> @llvm.ctpop.v16i8(<16 x i8> [[A]])
// CHECK-NEXT: ret <16 x i8> [[VCNTQ_V_I]]
//
uint8x16_t test_vcntq_u8(uint8x16_t a) {
@@ -2829,7 +2665,7 @@ uint8x16_t test_vcntq_u8(uint8x16_t a) {
// CHECK-LABEL: define <16 x i8> @test_vcntq_s8(
// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VCNTQ_V_I:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> [[A]])
+// CHECK-NEXT: [[VCNTQ_V_I:%.*]] = call range(i8 0, 9) <16 x i8> @llvm.ctpop.v16i8(<16 x i8> [[A]])
// CHECK-NEXT: ret <16 x i8> [[VCNTQ_V_I]]
//
int8x16_t test_vcntq_s8(int8x16_t a) {
@@ -2839,7 +2675,7 @@ int8x16_t test_vcntq_s8(int8x16_t a) {
// CHECK-LABEL: define <16 x i8> @test_vcntq_p8(
// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VCNTQ_V_I:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> [[A]])
+// CHECK-NEXT: [[VCNTQ_V_I:%.*]] = call range(i8 0, 9) <16 x i8> @llvm.ctpop.v16i8(<16 x i8> [[A]])
// CHECK-NEXT: ret <16 x i8> [[VCNTQ_V_I]]
//
poly8x16_t test_vcntq_p8(poly8x16_t a) {
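
The `range(...)` return attribute now decorating the ctlz/ctpop calls records the half-open interval of possible lane values: counting leading zeros or set bits of an iN lane yields at most N, hence `range(i8 0, 9)` for i8 lanes, `range(i16 0, 17)` for i16, and `range(i32 0, 33)` for i32. A minimal sketch (name is illustrative):

    define <8 x i8> @popcnt_sketch(<8 x i8> %a) {
      ; each result lane lies in [0, 8], i.e. the half-open range [0, 9)
      %c = call range(i8 0, 9) <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %a)
      ret <8 x i8> %c
    }
    declare <8 x i8> @llvm.ctpop.v8i8(<8 x i8>)
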
@@ -2970,7 +2806,7 @@ poly16x8_t test_vcombine_p16(poly16x4_t a, poly16x4_t b) {
// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[A]] to <8 x i8>
-// CHECK-NEXT: [[VCLZ_V_I:%.*]] = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> [[TMP0]], i1 false)
+// CHECK-NEXT: [[VCLZ_V_I:%.*]] = call range(i8 0, 9) <8 x i8> @llvm.ctlz.v8i8(<8 x i8> [[TMP0]], i1 false)
// CHECK-NEXT: ret <8 x i8> [[VCLZ_V_I]]
//
int8x8_t test_vcreate_s8(uint64_t a) {
@@ -2980,8 +2816,7 @@ int8x8_t test_vcreate_s8(uint64_t a) {
// CHECK-LABEL: define <4 x i16> @test_vcreate_imm(
// CHECK-SAME: ) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 0 to <4 x i16>
-// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+// CHECK-NEXT: ret <4 x i16> zeroinitializer
//
int16x4_t test_vcreate_imm(void) {
return vcreate_s16(0);
@@ -2991,9 +2826,7 @@ int16x4_t test_vcreate_imm(void) {
// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[A]] to <4 x i16>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK-NEXT: [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> [[TMP0]], i1 false)
-// CHECK-NEXT: [[VCLZ_V2_I:%.*]] = bitcast <4 x i16> [[VCLZ_V1_I]] to <8 x i8>
+// CHECK-NEXT: [[VCLZ_V1_I:%.*]] = call range(i16 0, 17) <4 x i16> @llvm.ctlz.v4i16(<4 x i16> [[TMP0]], i1 false)
// CHECK-NEXT: ret <4 x i16> [[VCLZ_V1_I]]
//
int16x4_t test_vcreate_s16(uint64_t a) {
@@ -3004,9 +2837,7 @@ int16x4_t test_vcreate_s16(uint64_t a) {
// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[A]] to <2 x i32>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
-// CHECK-NEXT: [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[TMP0]], i1 false)
-// CHECK-NEXT: [[VCLZ_V2_I:%.*]] = bitcast <2 x i32> [[VCLZ_V1_I]] to <8 x i8>
+// CHECK-NEXT: [[VCLZ_V1_I:%.*]] = call range(i32 0, 33) <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[TMP0]], i1 false)
// CHECK-NEXT: ret <2 x i32> [[VCLZ_V1_I]]
//
int32x2_t test_vcreate_s32(uint64_t a) {
@@ -3037,7 +2868,7 @@ float32x2_t test_vcreate_f32(uint64_t a) {
// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[A]] to <8 x i8>
-// CHECK-NEXT: [[VCLZ_V_I:%.*]] = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> [[TMP0]], i1 false)
+// CHECK-NEXT: [[VCLZ_V_I:%.*]] = call range(i8 0, 9) <8 x i8> @llvm.ctlz.v8i8(<8 x i8> [[TMP0]], i1 false)
// CHECK-NEXT: ret <8 x i8> [[VCLZ_V_I]]
//
int8x8_t test_vcreate_u8(uint64_t a) {
@@ -3048,9 +2879,7 @@ int8x8_t test_vcreate_u8(uint64_t a) {
// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[A]] to <4 x i16>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK-NEXT: [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> [[TMP0]], i1 false)
-// CHECK-NEXT: [[VCLZ_V2_I:%.*]] = bitcast <4 x i16> [[VCLZ_V1_I]] to <8 x i8>
+// CHECK-NEXT: [[VCLZ_V1_I:%.*]] = call range(i16 0, 17) <4 x i16> @llvm.ctlz.v4i16(<4 x i16> [[TMP0]], i1 false)
// CHECK-NEXT: ret <4 x i16> [[VCLZ_V1_I]]
//
int16x4_t test_vcreate_u16(uint64_t a) {
@@ -3061,9 +2890,7 @@ int16x4_t test_vcreate_u16(uint64_t a) {
// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[A]] to <2 x i32>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
-// CHECK-NEXT: [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[TMP0]], i1 false)
-// CHECK-NEXT: [[VCLZ_V2_I:%.*]] = bitcast <2 x i32> [[VCLZ_V1_I]] to <8 x i8>
+// CHECK-NEXT: [[VCLZ_V1_I:%.*]] = call range(i32 0, 33) <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[TMP0]], i1 false)
// CHECK-NEXT: ret <2 x i32> [[VCLZ_V1_I]]
//
int32x2_t test_vcreate_u32(uint64_t a) {
@@ -3073,8 +2900,8 @@ int32x2_t test_vcreate_u32(uint64_t a) {
// CHECK-LABEL: define <1 x i64> @test_vcreate_u64(
// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[A]] to <1 x i64>
-// CHECK-NEXT: [[ADD_I:%.*]] = add <1 x i64> [[TMP0]], [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <1 x i64> poison, i64 [[A]], i64 0
+// CHECK-NEXT: [[ADD_I:%.*]] = shl <1 x i64> [[TMP0]], splat (i64 1)
// CHECK-NEXT: ret <1 x i64> [[ADD_I]]
//
uint64x1_t test_vcreate_u64(uint64_t a) {
@@ -3086,7 +2913,7 @@ uint64x1_t test_vcreate_u64(uint64_t a) {
// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[A]] to <8 x i8>
-// CHECK-NEXT: [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> [[TMP0]])
+// CHECK-NEXT: [[VCNT_V_I:%.*]] = call range(i8 0, 9) <8 x i8> @llvm.ctpop.v8i8(<8 x i8> [[TMP0]])
// CHECK-NEXT: ret <8 x i8> [[VCNT_V_I]]
//
poly8x8_t test_vcreate_p8(uint64_t a) {
@@ -3096,13 +2923,12 @@ poly8x8_t test_vcreate_p8(uint64_t a) {
// CHECK-LABEL: define <4 x i16> @test_vcreate_p16(
// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[A]] to <4 x i16>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK-NEXT: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]])
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <4 x i16>
-// CHECK-NEXT: ret <4 x i16> [[TMP4]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[A]] to <8 x i8>
+// CHECK-NEXT: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP3]]
//
poly16x4_t test_vcreate_p16(uint64_t a) {
poly16x4_t tmp = vcreate_p16(a);
@@ -3112,8 +2938,8 @@ poly16x4_t test_vcreate_p16(uint64_t a) {
// CHECK-LABEL: define <1 x i64> @test_vcreate_s64(
// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[A]] to <1 x i64>
-// CHECK-NEXT: [[ADD_I:%.*]] = add <1 x i64> [[TMP0]], [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <1 x i64> poison, i64 [[A]], i64 0
+// CHECK-NEXT: [[ADD_I:%.*]] = shl <1 x i64> [[TMP0]], splat (i64 1)
// CHECK-NEXT: ret <1 x i64> [[ADD_I]]
//
int64x1_t test_vcreate_s64(uint64_t a) {
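
Two folds show up in the vcreate_s64/vcreate_u64 checks: the scalar-to-`<1 x i64>` bitcast is now emitted as an `insertelement` into `poison` (equivalent for a single-element vector), and the test body's self-add `x + x` is canonicalized to a left shift by one. A minimal sketch, with an illustrative name:

    define <1 x i64> @create_sketch(i64 %a) {
      %v = insertelement <1 x i64> poison, i64 %a, i64 0
      %r = shl <1 x i64> %v, splat (i64 1)         ; x + x  ==>  x << 1
      ret <1 x i64> %r
    }
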
@@ -3124,11 +2950,9 @@ int64x1_t test_vcreate_s64(uint64_t a) {
// CHECK-LABEL: define <4 x half> @test_vcvt_f16_f32(
// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
// CHECK-NEXT: [[VCVT_F16_F321_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float> [[A]])
-// CHECK-NEXT: [[VCVT_F16_F322_I:%.*]] = bitcast <4 x i16> [[VCVT_F16_F321_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[VCVT_F16_F322_I]] to <4 x half>
-// CHECK-NEXT: ret <4 x half> [[TMP1]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[VCVT_F16_F321_I]] to <4 x half>
+// CHECK-NEXT: ret <4 x half> [[TMP0]]
//
float16x4_t test_vcvt_f16_f32(float32x4_t a) {
return vcvt_f16_f32(a);
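
The vcvt_f16_f32 checks collapse a chain of same-width reinterpretations: `<4 x i16>` to `<8 x i8>` to `<4 x half>` is one relabeling of the same 64 bits, so a single bitcast suffices. A minimal sketch (name is illustrative):

    define <4 x half> @cvt_f16_sketch(<4 x float> %a) {
      %h = call <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float> %a)
      %r = bitcast <4 x i16> %h to <4 x half>      ; one cast replaces the chain
      ret <4 x half> %r
    }
    declare <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float>)
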
@@ -3137,7 +2961,6 @@ float16x4_t test_vcvt_f16_f32(float32x4_t a) {
// CHECK-LABEL: define <2 x float> @test_vcvt_f32_s32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
// CHECK-NEXT: [[VCVT_I:%.*]] = sitofp <2 x i32> [[A]] to <2 x float>
// CHECK-NEXT: ret <2 x float> [[VCVT_I]]
//
@@ -3148,7 +2971,6 @@ float32x2_t test_vcvt_f32_s32(int32x2_t a) {
// CHECK-LABEL: define <2 x float> @test_vcvt_f32_u32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
// CHECK-NEXT: [[VCVT_I:%.*]] = uitofp <2 x i32> [[A]] to <2 x float>
// CHECK-NEXT: ret <2 x float> [[VCVT_I]]
//
@@ -3159,7 +2981,6 @@ float32x2_t test_vcvt_f32_u32(uint32x2_t a) {
// CHECK-LABEL: define <4 x float> @test_vcvtq_f32_s32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
// CHECK-NEXT: [[VCVT_I:%.*]] = sitofp <4 x i32> [[A]] to <4 x float>
// CHECK-NEXT: ret <4 x float> [[VCVT_I]]
//
@@ -3170,7 +2991,6 @@ float32x4_t test_vcvtq_f32_s32(int32x4_t a) {
// CHECK-LABEL: define <4 x float> @test_vcvtq_f32_u32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
// CHECK-NEXT: [[VCVT_I:%.*]] = uitofp <4 x i32> [[A]] to <4 x float>
// CHECK-NEXT: ret <4 x float> [[VCVT_I]]
//
@@ -3181,10 +3001,8 @@ float32x4_t test_vcvtq_f32_u32(uint32x4_t a) {
// CHECK-LABEL: define <4 x float> @test_vcvt_f32_f16(
// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[VCVT_F32_F16_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VCVT_F32_F16_I:%.*]] = bitcast <4 x half> [[A]] to <4 x i16>
// CHECK-NEXT: [[VCVT_F32_F161_I:%.*]] = call <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16> [[VCVT_F32_F16_I]])
-// CHECK-NEXT: [[VCVT_F32_F162_I:%.*]] = bitcast <4 x float> [[VCVT_F32_F161_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x float> [[VCVT_F32_F161_I]]
//
float32x4_t test_vcvt_f32_f16(float16x4_t a) {
@@ -3194,9 +3012,7 @@ float32x4_t test_vcvt_f32_f16(float16x4_t a) {
// CHECK-LABEL: define <2 x float> @test_vcvt_n_f32_s32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[VCVT_N1:%.*]] = call <2 x float> @llvm.arm.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32> [[VCVT_N]], i32 1)
+// CHECK-NEXT: [[VCVT_N1:%.*]] = call <2 x float> @llvm.arm.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32> [[A]], i32 1)
// CHECK-NEXT: ret <2 x float> [[VCVT_N1]]
//
float32x2_t test_vcvt_n_f32_s32(int32x2_t a) {
@@ -3206,9 +3022,7 @@ float32x2_t test_vcvt_n_f32_s32(int32x2_t a) {
// CHECK-LABEL: define <2 x float> @test_vcvt_n_f32_u32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[VCVT_N1:%.*]] = call <2 x float> @llvm.arm.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32> [[VCVT_N]], i32 1)
+// CHECK-NEXT: [[VCVT_N1:%.*]] = call <2 x float> @llvm.arm.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32> [[A]], i32 1)
// CHECK-NEXT: ret <2 x float> [[VCVT_N1]]
//
float32x2_t test_vcvt_n_f32_u32(uint32x2_t a) {
@@ -3218,9 +3032,7 @@ float32x2_t test_vcvt_n_f32_u32(uint32x2_t a) {
// CHECK-LABEL: define <4 x float> @test_vcvtq_n_f32_s32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[VCVT_N1:%.*]] = call <4 x float> @llvm.arm.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32> [[VCVT_N]], i32 3)
+// CHECK-NEXT: [[VCVT_N1:%.*]] = call <4 x float> @llvm.arm.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32> [[A]], i32 3)
// CHECK-NEXT: ret <4 x float> [[VCVT_N1]]
//
float32x4_t test_vcvtq_n_f32_s32(int32x4_t a) {
@@ -3230,9 +3042,7 @@ float32x4_t test_vcvtq_n_f32_s32(int32x4_t a) {
// CHECK-LABEL: define <4 x float> @test_vcvtq_n_f32_u32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[VCVT_N1:%.*]] = call <4 x float> @llvm.arm.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32> [[VCVT_N]], i32 3)
+// CHECK-NEXT: [[VCVT_N1:%.*]] = call <4 x float> @llvm.arm.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32> [[A]], i32 3)
// CHECK-NEXT: ret <4 x float> [[VCVT_N1]]
//
float32x4_t test_vcvtq_n_f32_u32(uint32x4_t a) {
@@ -3242,9 +3052,7 @@ float32x4_t test_vcvtq_n_f32_u32(uint32x4_t a) {
// CHECK-LABEL: define <2 x i32> @test_vcvt_n_s32_f32(
// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
-// CHECK-NEXT: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK-NEXT: [[VCVT_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float> [[VCVT_N]], i32 1)
+// CHECK-NEXT: [[VCVT_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float> [[A]], i32 1)
// CHECK-NEXT: ret <2 x i32> [[VCVT_N1]]
//
int32x2_t test_vcvt_n_s32_f32(float32x2_t a) {
@@ -3254,9 +3062,7 @@ int32x2_t test_vcvt_n_s32_f32(float32x2_t a) {
// CHECK-LABEL: define <4 x i32> @test_vcvtq_n_s32_f32(
// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK-NEXT: [[VCVT_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float> [[VCVT_N]], i32 3)
+// CHECK-NEXT: [[VCVT_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float> [[A]], i32 3)
// CHECK-NEXT: ret <4 x i32> [[VCVT_N1]]
//
int32x4_t test_vcvtq_n_s32_f32(float32x4_t a) {
@@ -3266,9 +3072,7 @@ int32x4_t test_vcvtq_n_s32_f32(float32x4_t a) {
// CHECK-LABEL: define <2 x i32> @test_vcvt_n_u32_f32(
// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
-// CHECK-NEXT: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK-NEXT: [[VCVT_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtfp2fxu.v2i32.v2f32(<2 x float> [[VCVT_N]], i32 1)
+// CHECK-NEXT: [[VCVT_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtfp2fxu.v2i32.v2f32(<2 x float> [[A]], i32 1)
// CHECK-NEXT: ret <2 x i32> [[VCVT_N1]]
//
uint32x2_t test_vcvt_n_u32_f32(float32x2_t a) {
@@ -3278,9 +3082,7 @@ uint32x2_t test_vcvt_n_u32_f32(float32x2_t a) {
// CHECK-LABEL: define <4 x i32> @test_vcvtq_n_u32_f32(
// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK-NEXT: [[VCVT_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float> [[VCVT_N]], i32 3)
+// CHECK-NEXT: [[VCVT_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float> [[A]], i32 3)
// CHECK-NEXT: ret <4 x i32> [[VCVT_N1]]
//
uint32x4_t test_vcvtq_n_u32_f32(float32x4_t a) {
@@ -3290,7 +3092,6 @@ uint32x4_t test_vcvtq_n_u32_f32(float32x4_t a) {
// CHECK-LABEL: define <2 x i32> @test_vcvt_s32_f32(
// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
// CHECK-NEXT: [[VCVT_I:%.*]] = fptosi <2 x float> [[A]] to <2 x i32>
// CHECK-NEXT: ret <2 x i32> [[VCVT_I]]
//
@@ -3301,7 +3102,6 @@ int32x2_t test_vcvt_s32_f32(float32x2_t a) {
// CHECK-LABEL: define <4 x i32> @test_vcvtq_s32_f32(
// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
// CHECK-NEXT: [[VCVT_I:%.*]] = fptosi <4 x float> [[A]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[VCVT_I]]
//
@@ -3312,7 +3112,6 @@ int32x4_t test_vcvtq_s32_f32(float32x4_t a) {
// CHECK-LABEL: define <2 x i32> @test_vcvt_u32_f32(
// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
// CHECK-NEXT: [[VCVT_I:%.*]] = fptoui <2 x float> [[A]] to <2 x i32>
// CHECK-NEXT: ret <2 x i32> [[VCVT_I]]
//
@@ -3323,7 +3122,6 @@ uint32x2_t test_vcvt_u32_f32(float32x2_t a) {
// CHECK-LABEL: define <4 x i32> @test_vcvtq_u32_f32(
// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
// CHECK-NEXT: [[VCVT_I:%.*]] = fptoui <4 x float> [[A]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[VCVT_I]]
//
@@ -3334,7 +3132,7 @@ uint32x4_t test_vcvtq_u32_f32(float32x4_t a) {
// CHECK-LABEL: define <8 x i8> @test_vdup_lane_u8(
// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[A]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> poison, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: ret <8 x i8> [[LANE]]
//
uint8x8_t test_vdup_lane_u8(uint8x8_t a) {
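
Throughout the vdup_lane checks the second shufflevector operand becomes `poison`: every index in the constant mask selects from the first vector, so the second operand is never read and the canonical form marks it unused. A minimal sketch (name is illustrative):

    define <8 x i8> @dup_lane_sketch(<8 x i8> %a) {
      ; the mask reads only lanes of %a, so the second operand may be poison
      %d = shufflevector <8 x i8> %a, <8 x i8> poison, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
      ret <8 x i8> %d
    }
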
@@ -3344,9 +3142,7 @@ uint8x8_t test_vdup_lane_u8(uint8x8_t a) {
// CHECK-LABEL: define <4 x i16> @test_vdup_lane_u16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: ret <4 x i16> [[LANE]]
//
uint16x4_t test_vdup_lane_u16(uint16x4_t a) {
@@ -3356,9 +3152,7 @@ uint16x4_t test_vdup_lane_u16(uint16x4_t a) {
// CHECK-LABEL: define <2 x i32> @test_vdup_lane_u32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: ret <2 x i32> [[LANE]]
//
uint32x2_t test_vdup_lane_u32(uint32x2_t a) {
@@ -3368,7 +3162,7 @@ uint32x2_t test_vdup_lane_u32(uint32x2_t a) {
// CHECK-LABEL: define <8 x i8> @test_vdup_lane_s8(
// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[A]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> poison, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: ret <8 x i8> [[LANE]]
//
int8x8_t test_vdup_lane_s8(int8x8_t a) {
@@ -3378,9 +3172,7 @@ int8x8_t test_vdup_lane_s8(int8x8_t a) {
// CHECK-LABEL: define <4 x i16> @test_vdup_lane_s16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: ret <4 x i16> [[LANE]]
//
int16x4_t test_vdup_lane_s16(int16x4_t a) {
@@ -3390,9 +3182,7 @@ int16x4_t test_vdup_lane_s16(int16x4_t a) {
// CHECK-LABEL: define <2 x i32> @test_vdup_lane_s32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: ret <2 x i32> [[LANE]]
//
int32x2_t test_vdup_lane_s32(int32x2_t a) {
@@ -3402,7 +3192,7 @@ int32x2_t test_vdup_lane_s32(int32x2_t a) {
// CHECK-LABEL: define <8 x i8> @test_vdup_lane_p8(
// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[A]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> poison, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: ret <8 x i8> [[LANE]]
//
poly8x8_t test_vdup_lane_p8(poly8x8_t a) {
@@ -3412,9 +3202,7 @@ poly8x8_t test_vdup_lane_p8(poly8x8_t a) {
// CHECK-LABEL: define <4 x i16> @test_vdup_lane_p16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: ret <4 x i16> [[LANE]]
//
poly16x4_t test_vdup_lane_p16(poly16x4_t a) {
@@ -3424,9 +3212,7 @@ poly16x4_t test_vdup_lane_p16(poly16x4_t a) {
// CHECK-LABEL: define <2 x float> @test_vdup_lane_f32(
// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[A]], <2 x float> poison, <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: ret <2 x float> [[LANE]]
//
float32x2_t test_vdup_lane_f32(float32x2_t a) {
@@ -3436,7 +3222,7 @@ float32x2_t test_vdup_lane_f32(float32x2_t a) {
// CHECK-LABEL: define <16 x i8> @test_vdupq_lane_u8(
// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[A]], <16 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> poison, <16 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: ret <16 x i8> [[LANE]]
//
uint8x16_t test_vdupq_lane_u8(uint8x8_t a) {
@@ -3446,9 +3232,7 @@ uint8x16_t test_vdupq_lane_u8(uint8x8_t a) {
// CHECK-LABEL: define <8 x i16> @test_vdupq_lane_u16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: ret <8 x i16> [[LANE]]
//
uint16x8_t test_vdupq_lane_u16(uint16x4_t a) {
@@ -3458,9 +3242,7 @@ uint16x8_t test_vdupq_lane_u16(uint16x4_t a) {
// CHECK-LABEL: define <4 x i32> @test_vdupq_lane_u32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK-NEXT: ret <4 x i32> [[LANE]]
//
uint32x4_t test_vdupq_lane_u32(uint32x2_t a) {
@@ -3470,7 +3252,7 @@ uint32x4_t test_vdupq_lane_u32(uint32x2_t a) {
// CHECK-LABEL: define <16 x i8> @test_vdupq_lane_s8(
// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[A]], <16 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> poison, <16 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: ret <16 x i8> [[LANE]]
//
int8x16_t test_vdupq_lane_s8(int8x8_t a) {
@@ -3480,9 +3262,7 @@ int8x16_t test_vdupq_lane_s8(int8x8_t a) {
// CHECK-LABEL: define <8 x i16> @test_vdupq_lane_s16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: ret <8 x i16> [[LANE]]
//
int16x8_t test_vdupq_lane_s16(int16x4_t a) {
@@ -3492,9 +3272,7 @@ int16x8_t test_vdupq_lane_s16(int16x4_t a) {
// CHECK-LABEL: define <4 x i32> @test_vdupq_lane_s32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK-NEXT: ret <4 x i32> [[LANE]]
//
int32x4_t test_vdupq_lane_s32(int32x2_t a) {
@@ -3504,7 +3282,7 @@ int32x4_t test_vdupq_lane_s32(int32x2_t a) {
// CHECK-LABEL: define <16 x i8> @test_vdupq_lane_p8(
// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[A]], <16 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> poison, <16 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: ret <16 x i8> [[LANE]]
//
poly8x16_t test_vdupq_lane_p8(poly8x8_t a) {
@@ -3514,9 +3292,7 @@ poly8x16_t test_vdupq_lane_p8(poly8x8_t a) {
// CHECK-LABEL: define <8 x i16> @test_vdupq_lane_p16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: ret <8 x i16> [[LANE]]
//
poly16x8_t test_vdupq_lane_p16(poly16x4_t a) {
@@ -3526,9 +3302,7 @@ poly16x8_t test_vdupq_lane_p16(poly16x4_t a) {
// CHECK-LABEL: define <4 x float> @test_vdupq_lane_f32(
// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[A]], <2 x float> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK-NEXT: ret <4 x float> [[LANE]]
//
float32x4_t test_vdupq_lane_f32(float32x2_t a) {
@@ -3538,10 +3312,7 @@ float32x4_t test_vdupq_lane_f32(float32x2_t a) {
// CHECK-LABEL: define <1 x i64> @test_vdup_lane_s64(
// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP1]], <1 x i64> [[TMP1]], <1 x i32> zeroinitializer
-// CHECK-NEXT: ret <1 x i64> [[LANE]]
+// CHECK-NEXT: ret <1 x i64> [[A]]
//
int64x1_t test_vdup_lane_s64(int64x1_t a) {
return vdup_lane_s64(a, 0);
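// Duplicating lane 0 of a 1-element vector reproduces the input exactly,
// so the <1 x i64> cases fold to a plain 'ret' with no shuffle at all.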
@@ -3550,10 +3321,7 @@ int64x1_t test_vdup_lane_s64(int64x1_t a) {
// CHECK-LABEL: define <1 x i64> @test_vdup_lane_u64(
// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP1]], <1 x i64> [[TMP1]], <1 x i32> zeroinitializer
-// CHECK-NEXT: ret <1 x i64> [[LANE]]
+// CHECK-NEXT: ret <1 x i64> [[A]]
//
uint64x1_t test_vdup_lane_u64(uint64x1_t a) {
return vdup_lane_u64(a, 0);
@@ -3562,9 +3330,7 @@ uint64x1_t test_vdup_lane_u64(uint64x1_t a) {
// CHECK-LABEL: define <2 x i64> @test_vdupq_lane_s64(
// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP1]], <1 x i64> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x i64> [[A]], <1 x i64> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: ret <2 x i64> [[LANE]]
//
int64x2_t test_vdupq_lane_s64(int64x1_t a) {
@@ -3574,9 +3340,7 @@ int64x2_t test_vdupq_lane_s64(int64x1_t a) {
// CHECK-LABEL: define <2 x i64> @test_vdupq_lane_u64(
// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP1]], <1 x i64> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x i64> [[A]], <1 x i64> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: ret <2 x i64> [[LANE]]
//
uint64x2_t test_vdupq_lane_u64(uint64x1_t a) {
@@ -3586,14 +3350,8 @@ uint64x2_t test_vdupq_lane_u64(uint64x1_t a) {
// CHECK-LABEL: define <8 x i8> @test_vdup_n_u8(
// CHECK-SAME: i8 noundef zeroext [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i8> poison, i8 [[A]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 [[A]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 [[A]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 [[A]], i32 3
-// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 [[A]], i32 4
-// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 [[A]], i32 5
-// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 [[A]], i32 6
-// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 [[A]], i32 7
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i8> poison, i8 [[A]], i64 0
+// CHECK-NEXT: [[VECINIT7_I:%.*]] = shufflevector <8 x i8> [[VECINIT_I]], <8 x i8> poison, <8 x i32> zeroinitializer
// CHECK-NEXT: ret <8 x i8> [[VECINIT7_I]]
//
uint8x8_t test_vdup_n_u8(uint8_t a) {
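// The vdup_n_* splats now use the canonical two-instruction splat idiom
// (one insertelement into a poison vector at lane 0, then a zero-mask
// shufflevector) instead of a chain of per-lane inserts, and the constant
// lane index is printed as i64 rather than i32, which does not change
// semantics. A minimal sketch of the idiom, not taken from this patch:
//   %v = insertelement <8 x i8> poison, i8 %x, i64 0
//   %s = shufflevector <8 x i8> %v, <8 x i8> poison, <8 x i32> zeroinitializer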
@@ -3603,10 +3361,8 @@ uint8x8_t test_vdup_n_u8(uint8_t a) {
// CHECK-LABEL: define <4 x i16> @test_vdup_n_u16(
// CHECK-SAME: i16 noundef zeroext [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[A]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[A]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[A]], i32 3
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: ret <4 x i16> [[VECINIT3_I]]
//
uint16x4_t test_vdup_n_u16(uint16_t a) {
@@ -3616,8 +3372,8 @@ uint16x4_t test_vdup_n_u16(uint16_t a) {
// CHECK-LABEL: define <2 x i32> @test_vdup_n_u32(
// CHECK-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[A]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[A]], i32 1
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[A]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: ret <2 x i32> [[VECINIT1_I]]
//
uint32x2_t test_vdup_n_u32(uint32_t a) {
@@ -3627,14 +3383,8 @@ uint32x2_t test_vdup_n_u32(uint32_t a) {
// CHECK-LABEL: define <8 x i8> @test_vdup_n_s8(
// CHECK-SAME: i8 noundef signext [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i8> poison, i8 [[A]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 [[A]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 [[A]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 [[A]], i32 3
-// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 [[A]], i32 4
-// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 [[A]], i32 5
-// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 [[A]], i32 6
-// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 [[A]], i32 7
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i8> poison, i8 [[A]], i64 0
+// CHECK-NEXT: [[VECINIT7_I:%.*]] = shufflevector <8 x i8> [[VECINIT_I]], <8 x i8> poison, <8 x i32> zeroinitializer
// CHECK-NEXT: ret <8 x i8> [[VECINIT7_I]]
//
int8x8_t test_vdup_n_s8(int8_t a) {
@@ -3644,10 +3394,8 @@ int8x8_t test_vdup_n_s8(int8_t a) {
// CHECK-LABEL: define <4 x i16> @test_vdup_n_s16(
// CHECK-SAME: i16 noundef signext [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[A]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[A]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[A]], i32 3
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: ret <4 x i16> [[VECINIT3_I]]
//
int16x4_t test_vdup_n_s16(int16_t a) {
@@ -3657,8 +3405,8 @@ int16x4_t test_vdup_n_s16(int16_t a) {
// CHECK-LABEL: define <2 x i32> @test_vdup_n_s32(
// CHECK-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[A]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[A]], i32 1
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[A]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: ret <2 x i32> [[VECINIT1_I]]
//
int32x2_t test_vdup_n_s32(int32_t a) {
@@ -3668,14 +3416,8 @@ int32x2_t test_vdup_n_s32(int32_t a) {
// CHECK-LABEL: define <8 x i8> @test_vdup_n_p8(
// CHECK-SAME: i8 noundef signext [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i8> poison, i8 [[A]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 [[A]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 [[A]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 [[A]], i32 3
-// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 [[A]], i32 4
-// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 [[A]], i32 5
-// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 [[A]], i32 6
-// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 [[A]], i32 7
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i8> poison, i8 [[A]], i64 0
+// CHECK-NEXT: [[VECINIT7_I:%.*]] = shufflevector <8 x i8> [[VECINIT_I]], <8 x i8> poison, <8 x i32> zeroinitializer
// CHECK-NEXT: ret <8 x i8> [[VECINIT7_I]]
//
poly8x8_t test_vdup_n_p8(poly8_t a) {
@@ -3685,10 +3427,8 @@ poly8x8_t test_vdup_n_p8(poly8_t a) {
// CHECK-LABEL: define <4 x i16> @test_vdup_n_p16(
// CHECK-SAME: i16 noundef signext [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[A]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[A]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[A]], i32 3
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: ret <4 x i16> [[VECINIT3_I]]
//
poly16x4_t test_vdup_n_p16(poly16_t a) {
@@ -3699,10 +3439,8 @@ poly16x4_t test_vdup_n_p16(poly16_t a) {
// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[TMP0:%.*]] = load half, ptr [[A]], align 2
-// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[TMP0]], i32 0
-// CHECK-NEXT: [[VECINIT1:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP0]], i32 1
-// CHECK-NEXT: [[VECINIT2:%.*]] = insertelement <4 x half> [[VECINIT1]], half [[TMP0]], i32 2
-// CHECK-NEXT: [[VECINIT3:%.*]] = insertelement <4 x half> [[VECINIT2]], half [[TMP0]], i32 3
+// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[TMP0]], i64 0
+// CHECK-NEXT: [[VECINIT3:%.*]] = shufflevector <4 x half> [[VECINIT]], <4 x half> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: ret <4 x half> [[VECINIT3]]
//
float16x4_t test_vdup_n_f16(float16_t *a) {
@@ -3712,8 +3450,8 @@ float16x4_t test_vdup_n_f16(float16_t *a) {
// CHECK-LABEL: define <2 x float> @test_vdup_n_f32(
// CHECK-SAME: float noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x float> poison, float [[A]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float [[A]], i32 1
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x float> poison, float [[A]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x float> [[VECINIT_I]], <2 x float> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: ret <2 x float> [[VECINIT1_I]]
//
float32x2_t test_vdup_n_f32(float32_t a) {
@@ -3723,22 +3461,8 @@ float32x2_t test_vdup_n_f32(float32_t a) {
// CHECK-LABEL: define <16 x i8> @test_vdupq_n_u8(
// CHECK-SAME: i8 noundef zeroext [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i8> poison, i8 [[A]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 [[A]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 [[A]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 [[A]], i32 3
-// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 [[A]], i32 4
-// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 [[A]], i32 5
-// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 [[A]], i32 6
-// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 [[A]], i32 7
-// CHECK-NEXT: [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 [[A]], i32 8
-// CHECK-NEXT: [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 [[A]], i32 9
-// CHECK-NEXT: [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 [[A]], i32 10
-// CHECK-NEXT: [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 [[A]], i32 11
-// CHECK-NEXT: [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 [[A]], i32 12
-// CHECK-NEXT: [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 [[A]], i32 13
-// CHECK-NEXT: [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 [[A]], i32 14
-// CHECK-NEXT: [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 [[A]], i32 15
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i8> poison, i8 [[A]], i64 0
+// CHECK-NEXT: [[VECINIT15_I:%.*]] = shufflevector <16 x i8> [[VECINIT_I]], <16 x i8> poison, <16 x i32> zeroinitializer
// CHECK-NEXT: ret <16 x i8> [[VECINIT15_I]]
//
uint8x16_t test_vdupq_n_u8(uint8_t a) {
@@ -3748,14 +3472,8 @@ uint8x16_t test_vdupq_n_u8(uint8_t a) {
// CHECK-LABEL: define <8 x i16> @test_vdupq_n_u16(
// CHECK-SAME: i16 noundef zeroext [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[A]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[A]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[A]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[A]], i32 3
-// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[A]], i32 4
-// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[A]], i32 5
-// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[A]], i32 6
-// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[A]], i32 7
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[VECINIT7_I:%.*]] = shufflevector <8 x i16> [[VECINIT_I]], <8 x i16> poison, <8 x i32> zeroinitializer
// CHECK-NEXT: ret <8 x i16> [[VECINIT7_I]]
//
uint16x8_t test_vdupq_n_u16(uint16_t a) {
@@ -3765,10 +3483,8 @@ uint16x8_t test_vdupq_n_u16(uint16_t a) {
// CHECK-LABEL: define <4 x i32> @test_vdupq_n_u32(
// CHECK-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[A]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[A]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[A]], i32 3
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i32> [[VECINIT_I]], <4 x i32> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: ret <4 x i32> [[VECINIT3_I]]
//
uint32x4_t test_vdupq_n_u32(uint32_t a) {
@@ -3778,22 +3494,8 @@ uint32x4_t test_vdupq_n_u32(uint32_t a) {
// CHECK-LABEL: define <16 x i8> @test_vdupq_n_s8(
// CHECK-SAME: i8 noundef signext [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i8> poison, i8 [[A]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 [[A]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 [[A]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 [[A]], i32 3
-// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 [[A]], i32 4
-// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 [[A]], i32 5
-// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 [[A]], i32 6
-// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 [[A]], i32 7
-// CHECK-NEXT: [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 [[A]], i32 8
-// CHECK-NEXT: [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 [[A]], i32 9
-// CHECK-NEXT: [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 [[A]], i32 10
-// CHECK-NEXT: [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 [[A]], i32 11
-// CHECK-NEXT: [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 [[A]], i32 12
-// CHECK-NEXT: [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 [[A]], i32 13
-// CHECK-NEXT: [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 [[A]], i32 14
-// CHECK-NEXT: [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 [[A]], i32 15
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i8> poison, i8 [[A]], i64 0
+// CHECK-NEXT: [[VECINIT15_I:%.*]] = shufflevector <16 x i8> [[VECINIT_I]], <16 x i8> poison, <16 x i32> zeroinitializer
// CHECK-NEXT: ret <16 x i8> [[VECINIT15_I]]
//
int8x16_t test_vdupq_n_s8(int8_t a) {
@@ -3803,14 +3505,8 @@ int8x16_t test_vdupq_n_s8(int8_t a) {
// CHECK-LABEL: define <8 x i16> @test_vdupq_n_s16(
// CHECK-SAME: i16 noundef signext [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[A]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[A]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[A]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[A]], i32 3
-// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[A]], i32 4
-// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[A]], i32 5
-// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[A]], i32 6
-// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[A]], i32 7
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[VECINIT7_I:%.*]] = shufflevector <8 x i16> [[VECINIT_I]], <8 x i16> poison, <8 x i32> zeroinitializer
// CHECK-NEXT: ret <8 x i16> [[VECINIT7_I]]
//
int16x8_t test_vdupq_n_s16(int16_t a) {
@@ -3820,10 +3516,8 @@ int16x8_t test_vdupq_n_s16(int16_t a) {
// CHECK-LABEL: define <4 x i32> @test_vdupq_n_s32(
// CHECK-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[A]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[A]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[A]], i32 3
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i32> [[VECINIT_I]], <4 x i32> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: ret <4 x i32> [[VECINIT3_I]]
//
int32x4_t test_vdupq_n_s32(int32_t a) {
@@ -3833,22 +3527,8 @@ int32x4_t test_vdupq_n_s32(int32_t a) {
// CHECK-LABEL: define <16 x i8> @test_vdupq_n_p8(
// CHECK-SAME: i8 noundef signext [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i8> poison, i8 [[A]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 [[A]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 [[A]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 [[A]], i32 3
-// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 [[A]], i32 4
-// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 [[A]], i32 5
-// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 [[A]], i32 6
-// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 [[A]], i32 7
-// CHECK-NEXT: [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 [[A]], i32 8
-// CHECK-NEXT: [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 [[A]], i32 9
-// CHECK-NEXT: [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 [[A]], i32 10
-// CHECK-NEXT: [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 [[A]], i32 11
-// CHECK-NEXT: [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 [[A]], i32 12
-// CHECK-NEXT: [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 [[A]], i32 13
-// CHECK-NEXT: [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 [[A]], i32 14
-// CHECK-NEXT: [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 [[A]], i32 15
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i8> poison, i8 [[A]], i64 0
+// CHECK-NEXT: [[VECINIT15_I:%.*]] = shufflevector <16 x i8> [[VECINIT_I]], <16 x i8> poison, <16 x i32> zeroinitializer
// CHECK-NEXT: ret <16 x i8> [[VECINIT15_I]]
//
poly8x16_t test_vdupq_n_p8(poly8_t a) {
@@ -3858,14 +3538,8 @@ poly8x16_t test_vdupq_n_p8(poly8_t a) {
// CHECK-LABEL: define <8 x i16> @test_vdupq_n_p16(
// CHECK-SAME: i16 noundef signext [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[A]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[A]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[A]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[A]], i32 3
-// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[A]], i32 4
-// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[A]], i32 5
-// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[A]], i32 6
-// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[A]], i32 7
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[VECINIT7_I:%.*]] = shufflevector <8 x i16> [[VECINIT_I]], <8 x i16> poison, <8 x i32> zeroinitializer
// CHECK-NEXT: ret <8 x i16> [[VECINIT7_I]]
//
poly16x8_t test_vdupq_n_p16(poly16_t a) {
@@ -3876,14 +3550,8 @@ poly16x8_t test_vdupq_n_p16(poly16_t a) {
// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[TMP0:%.*]] = load half, ptr [[A]], align 2
-// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[TMP0]], i32 0
-// CHECK-NEXT: [[VECINIT1:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP0]], i32 1
-// CHECK-NEXT: [[VECINIT2:%.*]] = insertelement <8 x half> [[VECINIT1]], half [[TMP0]], i32 2
-// CHECK-NEXT: [[VECINIT3:%.*]] = insertelement <8 x half> [[VECINIT2]], half [[TMP0]], i32 3
-// CHECK-NEXT: [[VECINIT4:%.*]] = insertelement <8 x half> [[VECINIT3]], half [[TMP0]], i32 4
-// CHECK-NEXT: [[VECINIT5:%.*]] = insertelement <8 x half> [[VECINIT4]], half [[TMP0]], i32 5
-// CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <8 x half> [[VECINIT5]], half [[TMP0]], i32 6
-// CHECK-NEXT: [[VECINIT7:%.*]] = insertelement <8 x half> [[VECINIT6]], half [[TMP0]], i32 7
+// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[TMP0]], i64 0
+// CHECK-NEXT: [[VECINIT7:%.*]] = shufflevector <8 x half> [[VECINIT]], <8 x half> poison, <8 x i32> zeroinitializer
// CHECK-NEXT: ret <8 x half> [[VECINIT7]]
//
float16x8_t test_vdupq_n_f16(float16_t *a) {
@@ -3893,10 +3561,8 @@ float16x8_t test_vdupq_n_f16(float16_t *a) {
// CHECK-LABEL: define <4 x float> @test_vdupq_n_f32(
// CHECK-SAME: float noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float [[A]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float [[A]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float [[A]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float [[A]], i32 3
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float [[A]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x float> [[VECINIT_I]], <4 x float> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: ret <4 x float> [[VECINIT3_I]]
//
float32x4_t test_vdupq_n_f32(float32_t a) {
@@ -3906,8 +3572,8 @@ float32x4_t test_vdupq_n_f32(float32_t a) {
// CHECK-LABEL: define <1 x i64> @test_vdup_n_s64(
// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <1 x i64> poison, i64 [[A]], i32 0
-// CHECK-NEXT: [[ADD_I:%.*]] = add <1 x i64> [[VECINIT_I]], [[VECINIT_I]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <1 x i64> poison, i64 [[A]], i64 0
+// CHECK-NEXT: [[ADD_I:%.*]] = shl <1 x i64> [[VECINIT_I]], splat (i64 1)
// CHECK-NEXT: ret <1 x i64> [[ADD_I]]
//
int64x1_t test_vdup_n_s64(int64_t a) {
@@ -3918,8 +3584,8 @@ int64x1_t test_vdup_n_s64(int64_t a) {
// CHECK-LABEL: define <1 x i64> @test_vdup_n_u64(
// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <1 x i64> poison, i64 [[A]], i32 0
-// CHECK-NEXT: [[ADD_I:%.*]] = add <1 x i64> [[VECINIT_I]], [[VECINIT_I]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <1 x i64> poison, i64 [[A]], i64 0
+// CHECK-NEXT: [[ADD_I:%.*]] = shl <1 x i64> [[VECINIT_I]], splat (i64 1)
// CHECK-NEXT: ret <1 x i64> [[ADD_I]]
//
int64x1_t test_vdup_n_u64(uint64_t a) {
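// These two tests add the splat to itself (the old checks matched
// 'add x, x'), which instcombine canonicalizes to 'shl x, 1'; the shift
// amount is printed with the newer 'splat (i64 1)' vector-constant
// syntax. The u64 variant's int64x1_t return type appears to be a
// pre-existing quirk of the test source, accepted under the lax integer
// vector conversions these tests rely on.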
@@ -3930,9 +3596,9 @@ int64x1_t test_vdup_n_u64(uint64_t a) {
// CHECK-LABEL: define <2 x i64> @test_vdupq_n_s64(
// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i64> poison, i64 [[A]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 [[A]], i32 1
-// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[VECINIT1_I]], [[VECINIT1_I]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i64> poison, i64 [[A]], i64 0
+// CHECK-NEXT: [[TMP0:%.*]] = shl <2 x i64> [[VECINIT_I]], splat (i64 1)
+// CHECK-NEXT: [[ADD_I:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: ret <2 x i64> [[ADD_I]]
//
int64x2_t test_vdupq_n_s64(int64_t a) {
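// In the 128-bit variants the shift is applied while the value still
// occupies a single lane, and the zero-mask shuffle then splats the
// shifted lane: performing the arithmetic before the splat touches one
// lane instead of two and produces the same value in every lane.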
@@ -3943,9 +3609,9 @@ int64x2_t test_vdupq_n_s64(int64_t a) {
// CHECK-LABEL: define <2 x i64> @test_vdupq_n_u64(
// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i64> poison, i64 [[A]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 [[A]], i32 1
-// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[VECINIT1_I]], [[VECINIT1_I]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i64> poison, i64 [[A]], i64 0
+// CHECK-NEXT: [[TMP0:%.*]] = shl <2 x i64> [[VECINIT_I]], splat (i64 1)
+// CHECK-NEXT: [[ADD_I:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: ret <2 x i64> [[ADD_I]]
//
uint64x2_t test_vdupq_n_u64(uint64_t a) {
@@ -4146,11 +3812,7 @@ poly8x8_t test_vext_p8(poly8x8_t a, poly8x8_t b) {
// CHECK-LABEL: define <4 x i16> @test_vext_s16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
// CHECK-NEXT: ret <4 x i16> [[VEXT]]
//
int16x4_t test_vext_s16(int16x4_t a, int16x4_t b) {
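// vext_* concatenates its operands and extracts a window at the given
// lane offset, so a two-source shufflevector with a contiguous mask
// (here <3, 4, 5, 6>) models it exactly; the removed bitcast pairs were
// the same no-op round-trips as in the dup tests above.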
@@ -4160,11 +3822,7 @@ int16x4_t test_vext_s16(int16x4_t a, int16x4_t b) {
// CHECK-LABEL: define <4 x i16> @test_vext_u16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
// CHECK-NEXT: ret <4 x i16> [[VEXT]]
//
uint16x4_t test_vext_u16(uint16x4_t a, uint16x4_t b) {
@@ -4174,11 +3832,7 @@ uint16x4_t test_vext_u16(uint16x4_t a, uint16x4_t b) {
// CHECK-LABEL: define <4 x i16> @test_vext_p16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
// CHECK-NEXT: ret <4 x i16> [[VEXT]]
//
poly16x4_t test_vext_p16(poly16x4_t a, poly16x4_t b) {
@@ -4188,11 +3842,7 @@ poly16x4_t test_vext_p16(poly16x4_t a, poly16x4_t b) {
// CHECK-LABEL: define <2 x i32> @test_vext_s32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> <i32 1, i32 2>
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> <i32 1, i32 2>
// CHECK-NEXT: ret <2 x i32> [[VEXT]]
//
int32x2_t test_vext_s32(int32x2_t a, int32x2_t b) {
@@ -4202,11 +3852,7 @@ int32x2_t test_vext_s32(int32x2_t a, int32x2_t b) {
// CHECK-LABEL: define <2 x i32> @test_vext_u32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> <i32 1, i32 2>
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> <i32 1, i32 2>
// CHECK-NEXT: ret <2 x i32> [[VEXT]]
//
uint32x2_t test_vext_u32(uint32x2_t a, uint32x2_t b) {
@@ -4216,12 +3862,7 @@ uint32x2_t test_vext_u32(uint32x2_t a, uint32x2_t b) {
// CHECK-LABEL: define <1 x i64> @test_vext_s64(
// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
-// CHECK-NEXT: ret <1 x i64> [[VEXT]]
+// CHECK-NEXT: ret <1 x i64> [[A]]
//
int64x1_t test_vext_s64(int64x1_t a, int64x1_t b) {
return vext_s64(a, b, 0);
@@ -4230,12 +3871,7 @@ int64x1_t test_vext_s64(int64x1_t a, int64x1_t b) {
// CHECK-LABEL: define <1 x i64> @test_vext_u64(
// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
-// CHECK-NEXT: ret <1 x i64> [[VEXT]]
+// CHECK-NEXT: ret <1 x i64> [[A]]
//
uint64x1_t test_vext_u64(uint64x1_t a, uint64x1_t b) {
return vext_u64(a, b, 0);
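// With offset 0 the extracted window starts at the first element of the
// concatenation a:b, which for 1-element vectors is just 'a', so both
// 64-bit tests above fold to an identity return of the first argument.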
@@ -4244,11 +3880,7 @@ uint64x1_t test_vext_u64(uint64x1_t a, uint64x1_t b) {
// CHECK-LABEL: define <2 x float> @test_vext_f32(
// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
-// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP3]], <2 x i32> <i32 1, i32 2>
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[B]], <2 x i32> <i32 1, i32 2>
// CHECK-NEXT: ret <2 x float> [[VEXT]]
//
float32x2_t test_vext_f32(float32x2_t a, float32x2_t b) {
@@ -4288,11 +3920,7 @@ poly8x16_t test_vextq_p8(poly8x16_t a, poly8x16_t b) {
// CHECK-LABEL: define <8 x i16> @test_vextq_s16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
// CHECK-NEXT: ret <8 x i16> [[VEXT]]
//
int16x8_t test_vextq_s16(int16x8_t a, int16x8_t b) {
@@ -4302,11 +3930,7 @@ int16x8_t test_vextq_s16(int16x8_t a, int16x8_t b) {
// CHECK-LABEL: define <8 x i16> @test_vextq_u16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
// CHECK-NEXT: ret <8 x i16> [[VEXT]]
//
uint16x8_t test_vextq_u16(uint16x8_t a, uint16x8_t b) {
@@ -4316,11 +3940,7 @@ uint16x8_t test_vextq_u16(uint16x8_t a, uint16x8_t b) {
// CHECK-LABEL: define <8 x i16> @test_vextq_p16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
// CHECK-NEXT: ret <8 x i16> [[VEXT]]
//
poly16x8_t test_vextq_p16(poly16x8_t a, poly16x8_t b) {
@@ -4330,11 +3950,7 @@ poly16x8_t test_vextq_p16(poly16x8_t a, poly16x8_t b) {
// CHECK-LABEL: define <4 x i32> @test_vextq_s32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
// CHECK-NEXT: ret <4 x i32> [[VEXT]]
//
int32x4_t test_vextq_s32(int32x4_t a, int32x4_t b) {
@@ -4344,11 +3960,7 @@ int32x4_t test_vextq_s32(int32x4_t a, int32x4_t b) {
// CHECK-LABEL: define <4 x i32> @test_vextq_u32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
// CHECK-NEXT: ret <4 x i32> [[VEXT]]
//
uint32x4_t test_vextq_u32(uint32x4_t a, uint32x4_t b) {
@@ -4358,11 +3970,7 @@ uint32x4_t test_vextq_u32(uint32x4_t a, uint32x4_t b) {
// CHECK-LABEL: define <2 x i64> @test_vextq_s64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i32> <i32 1, i32 2>
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> <i32 1, i32 2>
// CHECK-NEXT: ret <2 x i64> [[VEXT]]
//
int64x2_t test_vextq_s64(int64x2_t a, int64x2_t b) {
@@ -4372,11 +3980,7 @@ int64x2_t test_vextq_s64(int64x2_t a, int64x2_t b) {
// CHECK-LABEL: define <2 x i64> @test_vextq_u64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i32> <i32 1, i32 2>
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> <i32 1, i32 2>
// CHECK-NEXT: ret <2 x i64> [[VEXT]]
//
uint64x2_t test_vextq_u64(uint64x2_t a, uint64x2_t b) {
@@ -4386,11 +3990,7 @@ uint64x2_t test_vextq_u64(uint64x2_t a, uint64x2_t b) {
// CHECK-LABEL: define <4 x float> @test_vextq_f32(
// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
-// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
// CHECK-NEXT: ret <4 x float> [[VEXT]]
//
float32x4_t test_vextq_f32(float32x4_t a, float32x4_t b) {
@@ -4400,11 +4000,8 @@ float32x4_t test_vextq_f32(float32x4_t a, float32x4_t b) {
// CHECK-LABEL: define <2 x float> @test_vfma_f32(
// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]], <2 x float> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[C]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[B]], <2 x float> [[C]], <2 x float> [[A]])
-// CHECK-NEXT: ret <2 x float> [[TMP3]]
+// CHECK-NEXT: [[TMP0:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[B]], <2 x float> [[C]], <2 x float> [[A]])
+// CHECK-NEXT: ret <2 x float> [[TMP0]]
//
float32x2_t test_vfma_f32(float32x2_t a, float32x2_t b, float32x2_t c) {
return vfma_f32(a, b, c);
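// vfma_f32(a, b, c) computes a + b*c, which maps directly onto @llvm.fma
// with the addend last: fma(b, c, a). The bitcasts that formerly
// surrounded the call were no-ops on the float vectors and do not
// survive to the checked output; the vfms variants below are the same
// call with the first multiplicand negated via fneg, i.e. a - b*c.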
@@ -4413,11 +4010,8 @@ float32x2_t test_vfma_f32(float32x2_t a, float32x2_t b, float32x2_t c) {
// CHECK-LABEL: define <4 x float> @test_vfmaq_f32(
// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]], <4 x float> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[C]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[B]], <4 x float> [[C]], <4 x float> [[A]])
-// CHECK-NEXT: ret <4 x float> [[TMP3]]
+// CHECK-NEXT: [[TMP0:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[B]], <4 x float> [[C]], <4 x float> [[A]])
+// CHECK-NEXT: ret <4 x float> [[TMP0]]
//
float32x4_t test_vfmaq_f32(float32x4_t a, float32x4_t b, float32x4_t c) {
return vfmaq_f32(a, b, c);
@@ -4427,11 +4021,8 @@ float32x4_t test_vfmaq_f32(float32x4_t a, float32x4_t b, float32x4_t c) {
// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]], <2 x float> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[FNEG_I:%.*]] = fneg <2 x float> [[B]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[FNEG_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[C]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FNEG_I]], <2 x float> [[C]], <2 x float> [[A]])
-// CHECK-NEXT: ret <2 x float> [[TMP3]]
+// CHECK-NEXT: [[TMP0:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FNEG_I]], <2 x float> [[C]], <2 x float> [[A]])
+// CHECK-NEXT: ret <2 x float> [[TMP0]]
//
float32x2_t test_vfms_f32(float32x2_t a, float32x2_t b, float32x2_t c) {
return vfms_f32(a, b, c);
@@ -4441,11 +4032,8 @@ float32x2_t test_vfms_f32(float32x2_t a, float32x2_t b, float32x2_t c) {
// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]], <4 x float> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[FNEG_I:%.*]] = fneg <4 x float> [[B]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[FNEG_I]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[C]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FNEG_I]], <4 x float> [[C]], <4 x float> [[A]])
-// CHECK-NEXT: ret <4 x float> [[TMP3]]
+// CHECK-NEXT: [[TMP0:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FNEG_I]], <4 x float> [[C]], <4 x float> [[A]])
+// CHECK-NEXT: ret <4 x float> [[TMP0]]
//
float32x4_t test_vfmsq_f32(float32x4_t a, float32x4_t b, float32x4_t c) {
return vfmsq_f32(a, b, c);
@@ -4454,7 +4042,7 @@ float32x4_t test_vfmsq_f32(float32x4_t a, float32x4_t b, float32x4_t c) {
// CHECK-LABEL: define <8 x i8> @test_vget_high_s8(
// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[A]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
//
int8x8_t test_vget_high_s8(int8x16_t a) {
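// vget_high_* extracts the upper half of a 128-bit vector, expressed as
// a single-source shuffle selecting the high lane indices; as in the dup
// tests, the unread second shuffle operand is now poison.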
@@ -4464,7 +4052,7 @@ int8x8_t test_vget_high_s8(int8x16_t a) {
// CHECK-LABEL: define <4 x i16> @test_vget_high_s16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
//
int16x4_t test_vget_high_s16(int16x8_t a) {
@@ -4474,7 +4062,7 @@ int16x4_t test_vget_high_s16(int16x8_t a) {
// CHECK-LABEL: define <2 x i32> @test_vget_high_s32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
// CHECK-NEXT: ret <2 x i32> [[SHUFFLE_I]]
//
int32x2_t test_vget_high_s32(int32x4_t a) {
@@ -4484,7 +4072,7 @@ int32x2_t test_vget_high_s32(int32x4_t a) {
// CHECK-LABEL: define <1 x i64> @test_vget_high_s64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[A]], <1 x i32> <i32 1>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> poison, <1 x i32> <i32 1>
// CHECK-NEXT: ret <1 x i64> [[SHUFFLE_I]]
//
int64x1_t test_vget_high_s64(int64x2_t a) {
@@ -4494,7 +4082,7 @@ int64x1_t test_vget_high_s64(int64x2_t a) {
// CHECK-LABEL: define <4 x half> @test_vget_high_f16(
// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT: ret <4 x half> [[SHUFFLE_I]]
//
float16x4_t test_vget_high_f16(float16x8_t a) {
@@ -4504,7 +4092,7 @@ float16x4_t test_vget_high_f16(float16x8_t a) {
// CHECK-LABEL: define <2 x float> @test_vget_high_f32(
// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[A]], <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
// CHECK-NEXT: ret <2 x float> [[SHUFFLE_I]]
//
float32x2_t test_vget_high_f32(float32x4_t a) {
@@ -4514,7 +4102,7 @@ float32x2_t test_vget_high_f32(float32x4_t a) {
// CHECK-LABEL: define <8 x i8> @test_vget_high_u8(
// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[A]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
//
uint8x8_t test_vget_high_u8(uint8x16_t a) {
@@ -4524,7 +4112,7 @@ uint8x8_t test_vget_high_u8(uint8x16_t a) {
// CHECK-LABEL: define <4 x i16> @test_vget_high_u16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
//
uint16x4_t test_vget_high_u16(uint16x8_t a) {
@@ -4534,7 +4122,7 @@ uint16x4_t test_vget_high_u16(uint16x8_t a) {
// CHECK-LABEL: define <2 x i32> @test_vget_high_u32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
// CHECK-NEXT: ret <2 x i32> [[SHUFFLE_I]]
//
uint32x2_t test_vget_high_u32(uint32x4_t a) {
@@ -4544,7 +4132,7 @@ uint32x2_t test_vget_high_u32(uint32x4_t a) {
// CHECK-LABEL: define <1 x i64> @test_vget_high_u64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[A]], <1 x i32> <i32 1>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> poison, <1 x i32> <i32 1>
// CHECK-NEXT: ret <1 x i64> [[SHUFFLE_I]]
//
uint64x1_t test_vget_high_u64(uint64x2_t a) {
@@ -4554,7 +4142,7 @@ uint64x1_t test_vget_high_u64(uint64x2_t a) {
// CHECK-LABEL: define <8 x i8> @test_vget_high_p8(
// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[A]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
//
poly8x8_t test_vget_high_p8(poly8x16_t a) {
@@ -4564,7 +4152,7 @@ poly8x8_t test_vget_high_p8(poly8x16_t a) {
// CHECK-LABEL: define <4 x i16> @test_vget_high_p16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
//
poly16x4_t test_vget_high_p16(poly16x8_t a) {
@@ -4574,7 +4162,7 @@ poly16x4_t test_vget_high_p16(poly16x8_t a) {
// CHECK-LABEL: define zeroext i8 @test_vget_lane_u8(
// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <8 x i8> [[A]], i32 7
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <8 x i8> [[A]], i64 7
// CHECK-NEXT: ret i8 [[VGET_LANE]]
//
uint8_t test_vget_lane_u8(uint8x8_t a) {
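// For the scalar vget_lane_* tests only the index type changes: a
// constant, in-range extractelement index behaves identically whether it
// is written as i32 or i64.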
@@ -4584,7 +4172,7 @@ uint8_t test_vget_lane_u8(uint8x8_t a) {
// CHECK-LABEL: define zeroext i16 @test_vget_lane_u16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[A]], i32 3
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[A]], i64 3
// CHECK-NEXT: ret i16 [[VGET_LANE]]
//
uint16_t test_vget_lane_u16(uint16x4_t a) {
@@ -4594,7 +4182,7 @@ uint16_t test_vget_lane_u16(uint16x4_t a) {
// CHECK-LABEL: define i32 @test_vget_lane_u32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[A]], i32 1
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[A]], i64 1
// CHECK-NEXT: ret i32 [[VGET_LANE]]
//
uint32_t test_vget_lane_u32(uint32x2_t a) {
@@ -4604,7 +4192,7 @@ uint32_t test_vget_lane_u32(uint32x2_t a) {
// CHECK-LABEL: define signext i8 @test_vget_lane_s8(
// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <8 x i8> [[A]], i32 7
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <8 x i8> [[A]], i64 7
// CHECK-NEXT: ret i8 [[VGET_LANE]]
//
int8_t test_vget_lane_s8(int8x8_t a) {
@@ -4614,7 +4202,7 @@ int8_t test_vget_lane_s8(int8x8_t a) {
// CHECK-LABEL: define signext i16 @test_vget_lane_s16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[A]], i32 3
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[A]], i64 3
// CHECK-NEXT: ret i16 [[VGET_LANE]]
//
int16_t test_vget_lane_s16(int16x4_t a) {
@@ -4624,7 +4212,7 @@ int16_t test_vget_lane_s16(int16x4_t a) {
// CHECK-LABEL: define i32 @test_vget_lane_s32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[A]], i32 1
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[A]], i64 1
// CHECK-NEXT: ret i32 [[VGET_LANE]]
//
int32_t test_vget_lane_s32(int32x2_t a) {
@@ -4634,7 +4222,7 @@ int32_t test_vget_lane_s32(int32x2_t a) {
// CHECK-LABEL: define signext i8 @test_vget_lane_p8(
// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <8 x i8> [[A]], i32 7
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <8 x i8> [[A]], i64 7
// CHECK-NEXT: ret i8 [[VGET_LANE]]
//
poly8_t test_vget_lane_p8(poly8x8_t a) {
@@ -4644,7 +4232,7 @@ poly8_t test_vget_lane_p8(poly8x8_t a) {
// CHECK-LABEL: define signext i16 @test_vget_lane_p16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[A]], i32 3
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[A]], i64 3
// CHECK-NEXT: ret i16 [[VGET_LANE]]
//
poly16_t test_vget_lane_p16(poly16x4_t a) {
@@ -4654,7 +4242,7 @@ poly16_t test_vget_lane_p16(poly16x4_t a) {
// CHECK-LABEL: define float @test_vget_lane_f32(
// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x float> [[A]], i32 1
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x float> [[A]], i64 1
// CHECK-NEXT: ret float [[VGET_LANE]]
//
float32_t test_vget_lane_f32(float32x2_t a) {
@@ -4664,14 +4252,8 @@ float32_t test_vget_lane_f32(float32x2_t a) {
// CHECK-LABEL: define float @test_vget_lane_f16(
// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__REINT_831:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT: [[__REINT1_831:%.*]] = alloca i16, align 2
-// CHECK-NEXT: store <4 x half> [[A]], ptr [[__REINT_831]], align 8
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__REINT_831]], align 8
-// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP0]], i32 1
-// CHECK-NEXT: store i16 [[VGET_LANE]], ptr [[__REINT1_831]], align 2
-// CHECK-NEXT: [[TMP1:%.*]] = load half, ptr [[__REINT1_831]], align 2
-// CHECK-NEXT: [[CONV:%.*]] = fpext half [[TMP1]] to float
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x half> [[A]], i64 1
+// CHECK-NEXT: [[CONV:%.*]] = fpext half [[TMP0]] to float
// CHECK-NEXT: ret float [[CONV]]
//
float32_t test_vget_lane_f16(float16x4_t a) {
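Two independent improvements show up in the vget_lane/vgetq_lane hunks: constant lane indices are now emitted as i64 instead of i32, and the f16 variants no longer spill the vector to an alloca, reload it as an integer vector, and bit-copy the lane through memory; they extract the half element directly and fpext it. A minimal sketch, with an assumed function name:

#include <arm_neon.h>

// Hypothetical example mirroring test_vget_lane_f16; the body now
// compiles to just:
//   %e    = extractelement <4 x half> %v, i64 1
//   %conv = fpext half %e to float
float32_t lane1_as_float(float16x4_t v) {
  return vget_lane_f16(v, 1);
}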
@@ -4681,7 +4263,7 @@ float32_t test_vget_lane_f16(float16x4_t a) {
// CHECK-LABEL: define zeroext i8 @test_vgetq_lane_u8(
// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <16 x i8> [[A]], i32 15
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <16 x i8> [[A]], i64 15
// CHECK-NEXT: ret i8 [[VGET_LANE]]
//
uint8_t test_vgetq_lane_u8(uint8x16_t a) {
@@ -4691,7 +4273,7 @@ uint8_t test_vgetq_lane_u8(uint8x16_t a) {
// CHECK-LABEL: define zeroext i16 @test_vgetq_lane_u16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <8 x i16> [[A]], i32 7
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <8 x i16> [[A]], i64 7
// CHECK-NEXT: ret i16 [[VGET_LANE]]
//
uint16_t test_vgetq_lane_u16(uint16x8_t a) {
@@ -4701,7 +4283,7 @@ uint16_t test_vgetq_lane_u16(uint16x8_t a) {
// CHECK-LABEL: define i32 @test_vgetq_lane_u32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i32> [[A]], i32 3
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i32> [[A]], i64 3
// CHECK-NEXT: ret i32 [[VGET_LANE]]
//
uint32_t test_vgetq_lane_u32(uint32x4_t a) {
@@ -4711,7 +4293,7 @@ uint32_t test_vgetq_lane_u32(uint32x4_t a) {
// CHECK-LABEL: define signext i8 @test_vgetq_lane_s8(
// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <16 x i8> [[A]], i32 15
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <16 x i8> [[A]], i64 15
// CHECK-NEXT: ret i8 [[VGET_LANE]]
//
int8_t test_vgetq_lane_s8(int8x16_t a) {
@@ -4721,7 +4303,7 @@ int8_t test_vgetq_lane_s8(int8x16_t a) {
// CHECK-LABEL: define signext i16 @test_vgetq_lane_s16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <8 x i16> [[A]], i32 7
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <8 x i16> [[A]], i64 7
// CHECK-NEXT: ret i16 [[VGET_LANE]]
//
int16_t test_vgetq_lane_s16(int16x8_t a) {
@@ -4731,7 +4313,7 @@ int16_t test_vgetq_lane_s16(int16x8_t a) {
// CHECK-LABEL: define i32 @test_vgetq_lane_s32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i32> [[A]], i32 3
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i32> [[A]], i64 3
// CHECK-NEXT: ret i32 [[VGET_LANE]]
//
int32_t test_vgetq_lane_s32(int32x4_t a) {
@@ -4741,7 +4323,7 @@ int32_t test_vgetq_lane_s32(int32x4_t a) {
// CHECK-LABEL: define signext i8 @test_vgetq_lane_p8(
// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <16 x i8> [[A]], i32 15
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <16 x i8> [[A]], i64 15
// CHECK-NEXT: ret i8 [[VGET_LANE]]
//
poly8_t test_vgetq_lane_p8(poly8x16_t a) {
@@ -4751,7 +4333,7 @@ poly8_t test_vgetq_lane_p8(poly8x16_t a) {
// CHECK-LABEL: define signext i16 @test_vgetq_lane_p16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <8 x i16> [[A]], i32 7
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <8 x i16> [[A]], i64 7
// CHECK-NEXT: ret i16 [[VGET_LANE]]
//
poly16_t test_vgetq_lane_p16(poly16x8_t a) {
@@ -4761,7 +4343,7 @@ poly16_t test_vgetq_lane_p16(poly16x8_t a) {
// CHECK-LABEL: define float @test_vgetq_lane_f32(
// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x float> [[A]], i32 3
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x float> [[A]], i64 3
// CHECK-NEXT: ret float [[VGET_LANE]]
//
float32_t test_vgetq_lane_f32(float32x4_t a) {
@@ -4771,14 +4353,8 @@ float32_t test_vgetq_lane_f32(float32x4_t a) {
// CHECK-LABEL: define float @test_vgetq_lane_f16(
// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__REINT_834:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT: [[__REINT1_834:%.*]] = alloca i16, align 2
-// CHECK-NEXT: store <8 x half> [[A]], ptr [[__REINT_834]], align 16
-// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[__REINT_834]], align 16
-// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <8 x i16> [[TMP0]], i32 3
-// CHECK-NEXT: store i16 [[VGET_LANE]], ptr [[__REINT1_834]], align 2
-// CHECK-NEXT: [[TMP1:%.*]] = load half, ptr [[__REINT1_834]], align 2
-// CHECK-NEXT: [[CONV:%.*]] = fpext half [[TMP1]] to float
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <8 x half> [[A]], i64 3
+// CHECK-NEXT: [[CONV:%.*]] = fpext half [[TMP0]] to float
// CHECK-NEXT: ret float [[CONV]]
//
float32_t test_vgetq_lane_f16(float16x8_t a) {
@@ -4788,7 +4364,7 @@ float32_t test_vgetq_lane_f16(float16x8_t a) {
// CHECK-LABEL: define i64 @test_vget_lane_s64(
// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <1 x i64> [[A]], i32 0
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <1 x i64> [[A]], i64 0
// CHECK-NEXT: ret i64 [[VGET_LANE]]
//
int64_t test_vget_lane_s64(int64x1_t a) {
@@ -4798,7 +4374,7 @@ int64_t test_vget_lane_s64(int64x1_t a) {
// CHECK-LABEL: define i64 @test_vget_lane_u64(
// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <1 x i64> [[A]], i32 0
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <1 x i64> [[A]], i64 0
// CHECK-NEXT: ret i64 [[VGET_LANE]]
//
uint64_t test_vget_lane_u64(uint64x1_t a) {
@@ -4808,7 +4384,7 @@ uint64_t test_vget_lane_u64(uint64x1_t a) {
// CHECK-LABEL: define i64 @test_vgetq_lane_s64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i64> [[A]], i32 1
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i64> [[A]], i64 1
// CHECK-NEXT: ret i64 [[VGET_LANE]]
//
int64_t test_vgetq_lane_s64(int64x2_t a) {
@@ -4818,7 +4394,7 @@ int64_t test_vgetq_lane_s64(int64x2_t a) {
// CHECK-LABEL: define i64 @test_vgetq_lane_u64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i64> [[A]], i32 1
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i64> [[A]], i64 1
// CHECK-NEXT: ret i64 [[VGET_LANE]]
//
uint64_t test_vgetq_lane_u64(uint64x2_t a) {
@@ -4828,7 +4404,7 @@ uint64_t test_vgetq_lane_u64(uint64x2_t a) {
// CHECK-LABEL: define <8 x i8> @test_vget_low_s8(
// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[A]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
//
int8x8_t test_vget_low_s8(int8x16_t a) {
@@ -4838,7 +4414,7 @@ int8x8_t test_vget_low_s8(int8x16_t a) {
// CHECK-LABEL: define <4 x i16> @test_vget_low_s16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[A]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
//
int16x4_t test_vget_low_s16(int16x8_t a) {
@@ -4848,7 +4424,7 @@ int16x4_t test_vget_low_s16(int16x8_t a) {
// CHECK-LABEL: define <2 x i32> @test_vget_low_s32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[A]], <2 x i32> <i32 0, i32 1>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <2 x i32> <i32 0, i32 1>
// CHECK-NEXT: ret <2 x i32> [[SHUFFLE_I]]
//
int32x2_t test_vget_low_s32(int32x4_t a) {
@@ -4858,7 +4434,7 @@ int32x2_t test_vget_low_s32(int32x4_t a) {
// CHECK-LABEL: define <1 x i64> @test_vget_low_s64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[A]], <1 x i32> zeroinitializer
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> poison, <1 x i32> zeroinitializer
// CHECK-NEXT: ret <1 x i64> [[SHUFFLE_I]]
//
int64x1_t test_vget_low_s64(int64x2_t a) {
@@ -4868,7 +4444,7 @@ int64x1_t test_vget_low_s64(int64x2_t a) {
// CHECK-LABEL: define <4 x half> @test_vget_low_f16(
// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[A]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK-NEXT: ret <4 x half> [[SHUFFLE_I]]
//
float16x4_t test_vget_low_f16(float16x8_t a) {
@@ -4878,7 +4454,7 @@ float16x4_t test_vget_low_f16(float16x8_t a) {
// CHECK-LABEL: define <2 x float> @test_vget_low_f32(
// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[A]], <2 x i32> <i32 0, i32 1>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
// CHECK-NEXT: ret <2 x float> [[SHUFFLE_I]]
//
float32x2_t test_vget_low_f32(float32x4_t a) {
@@ -4888,7 +4464,7 @@ float32x2_t test_vget_low_f32(float32x4_t a) {
// CHECK-LABEL: define <8 x i8> @test_vget_low_u8(
// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[A]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
//
uint8x8_t test_vget_low_u8(uint8x16_t a) {
@@ -4898,7 +4474,7 @@ uint8x8_t test_vget_low_u8(uint8x16_t a) {
// CHECK-LABEL: define <4 x i16> @test_vget_low_u16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[A]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
//
uint16x4_t test_vget_low_u16(uint16x8_t a) {
@@ -4908,7 +4484,7 @@ uint16x4_t test_vget_low_u16(uint16x8_t a) {
// CHECK-LABEL: define <2 x i32> @test_vget_low_u32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[A]], <2 x i32> <i32 0, i32 1>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <2 x i32> <i32 0, i32 1>
// CHECK-NEXT: ret <2 x i32> [[SHUFFLE_I]]
//
uint32x2_t test_vget_low_u32(uint32x4_t a) {
@@ -4918,7 +4494,7 @@ uint32x2_t test_vget_low_u32(uint32x4_t a) {
// CHECK-LABEL: define <1 x i64> @test_vget_low_u64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[A]], <1 x i32> zeroinitializer
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> poison, <1 x i32> zeroinitializer
// CHECK-NEXT: ret <1 x i64> [[SHUFFLE_I]]
//
uint64x1_t test_vget_low_u64(uint64x2_t a) {
@@ -4928,7 +4504,7 @@ uint64x1_t test_vget_low_u64(uint64x2_t a) {
// CHECK-LABEL: define <8 x i8> @test_vget_low_p8(
// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[A]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
//
poly8x8_t test_vget_low_p8(poly8x16_t a) {
@@ -4938,7 +4514,7 @@ poly8x8_t test_vget_low_p8(poly8x16_t a) {
// CHECK-LABEL: define <4 x i16> @test_vget_low_p16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[A]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
//
poly16x4_t test_vget_low_p16(poly16x8_t a) {
@@ -4958,10 +4534,7 @@ int8x8_t test_vhadd_s8(int8x8_t a, int8x8_t b) {
// CHECK-LABEL: define <4 x i16> @test_vhadd_s16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
// CHECK-NEXT: [[VHADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vhadds.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
-// CHECK-NEXT: [[VHADD_V3_I:%.*]] = bitcast <4 x i16> [[VHADD_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <4 x i16> [[VHADD_V2_I]]
//
int16x4_t test_vhadd_s16(int16x4_t a, int16x4_t b) {
@@ -4971,10 +4544,7 @@ int16x4_t test_vhadd_s16(int16x4_t a, int16x4_t b) {
// CHECK-LABEL: define <2 x i32> @test_vhadd_s32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
// CHECK-NEXT: [[VHADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vhadds.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
-// CHECK-NEXT: [[VHADD_V3_I:%.*]] = bitcast <2 x i32> [[VHADD_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <2 x i32> [[VHADD_V2_I]]
//
int32x2_t test_vhadd_s32(int32x2_t a, int32x2_t b) {
@@ -4994,10 +4564,7 @@ uint8x8_t test_vhadd_u8(uint8x8_t a, uint8x8_t b) {
// CHECK-LABEL: define <4 x i16> @test_vhadd_u16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
// CHECK-NEXT: [[VHADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vhaddu.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
-// CHECK-NEXT: [[VHADD_V3_I:%.*]] = bitcast <4 x i16> [[VHADD_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <4 x i16> [[VHADD_V2_I]]
//
uint16x4_t test_vhadd_u16(uint16x4_t a, uint16x4_t b) {
@@ -5007,10 +4574,7 @@ uint16x4_t test_vhadd_u16(uint16x4_t a, uint16x4_t b) {
// CHECK-LABEL: define <2 x i32> @test_vhadd_u32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
// CHECK-NEXT: [[VHADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vhaddu.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
-// CHECK-NEXT: [[VHADD_V3_I:%.*]] = bitcast <2 x i32> [[VHADD_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <2 x i32> [[VHADD_V2_I]]
//
uint32x2_t test_vhadd_u32(uint32x2_t a, uint32x2_t b) {
@@ -5030,10 +4594,7 @@ int8x16_t test_vhaddq_s8(int8x16_t a, int8x16_t b) {
// CHECK-LABEL: define <8 x i16> @test_vhaddq_s16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
// CHECK-NEXT: [[VHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
-// CHECK-NEXT: [[VHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VHADDQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <8 x i16> [[VHADDQ_V2_I]]
//
int16x8_t test_vhaddq_s16(int16x8_t a, int16x8_t b) {
@@ -5043,10 +4604,7 @@ int16x8_t test_vhaddq_s16(int16x8_t a, int16x8_t b) {
// CHECK-LABEL: define <4 x i32> @test_vhaddq_s32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
// CHECK-NEXT: [[VHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vhadds.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
-// CHECK-NEXT: [[VHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VHADDQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VHADDQ_V2_I]]
//
int32x4_t test_vhaddq_s32(int32x4_t a, int32x4_t b) {
@@ -5066,10 +4624,7 @@ uint8x16_t test_vhaddq_u8(uint8x16_t a, uint8x16_t b) {
// CHECK-LABEL: define <8 x i16> @test_vhaddq_u16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
// CHECK-NEXT: [[VHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
-// CHECK-NEXT: [[VHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VHADDQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <8 x i16> [[VHADDQ_V2_I]]
//
uint16x8_t test_vhaddq_u16(uint16x8_t a, uint16x8_t b) {
@@ -5079,10 +4634,7 @@ uint16x8_t test_vhaddq_u16(uint16x8_t a, uint16x8_t b) {
// CHECK-LABEL: define <4 x i32> @test_vhaddq_u32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
// CHECK-NEXT: [[VHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vhaddu.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
-// CHECK-NEXT: [[VHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VHADDQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VHADDQ_V2_I]]
//
uint32x4_t test_vhaddq_u32(uint32x4_t a, uint32x4_t b) {
@@ -5102,10 +4654,7 @@ int8x8_t test_vhsub_s8(int8x8_t a, int8x8_t b) {
// CHECK-LABEL: define <4 x i16> @test_vhsub_s16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
// CHECK-NEXT: [[VHSUB_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vhsubs.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
-// CHECK-NEXT: [[VHSUB_V3_I:%.*]] = bitcast <4 x i16> [[VHSUB_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <4 x i16> [[VHSUB_V2_I]]
//
int16x4_t test_vhsub_s16(int16x4_t a, int16x4_t b) {
@@ -5115,10 +4664,7 @@ int16x4_t test_vhsub_s16(int16x4_t a, int16x4_t b) {
// CHECK-LABEL: define <2 x i32> @test_vhsub_s32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
// CHECK-NEXT: [[VHSUB_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vhsubs.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
-// CHECK-NEXT: [[VHSUB_V3_I:%.*]] = bitcast <2 x i32> [[VHSUB_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <2 x i32> [[VHSUB_V2_I]]
//
int32x2_t test_vhsub_s32(int32x2_t a, int32x2_t b) {
@@ -5138,10 +4684,7 @@ uint8x8_t test_vhsub_u8(uint8x8_t a, uint8x8_t b) {
// CHECK-LABEL: define <4 x i16> @test_vhsub_u16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
// CHECK-NEXT: [[VHSUB_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vhsubu.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
-// CHECK-NEXT: [[VHSUB_V3_I:%.*]] = bitcast <4 x i16> [[VHSUB_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <4 x i16> [[VHSUB_V2_I]]
//
uint16x4_t test_vhsub_u16(uint16x4_t a, uint16x4_t b) {
@@ -5151,10 +4694,7 @@ uint16x4_t test_vhsub_u16(uint16x4_t a, uint16x4_t b) {
// CHECK-LABEL: define <2 x i32> @test_vhsub_u32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
// CHECK-NEXT: [[VHSUB_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vhsubu.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
-// CHECK-NEXT: [[VHSUB_V3_I:%.*]] = bitcast <2 x i32> [[VHSUB_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <2 x i32> [[VHSUB_V2_I]]
//
uint32x2_t test_vhsub_u32(uint32x2_t a, uint32x2_t b) {
@@ -5174,10 +4714,7 @@ int8x16_t test_vhsubq_s8(int8x16_t a, int8x16_t b) {
// CHECK-LABEL: define <8 x i16> @test_vhsubq_s16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
// CHECK-NEXT: [[VHSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vhsubs.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
-// CHECK-NEXT: [[VHSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VHSUBQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <8 x i16> [[VHSUBQ_V2_I]]
//
int16x8_t test_vhsubq_s16(int16x8_t a, int16x8_t b) {
@@ -5187,10 +4724,7 @@ int16x8_t test_vhsubq_s16(int16x8_t a, int16x8_t b) {
// CHECK-LABEL: define <4 x i32> @test_vhsubq_s32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
// CHECK-NEXT: [[VHSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vhsubs.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
-// CHECK-NEXT: [[VHSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VHSUBQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VHSUBQ_V2_I]]
//
int32x4_t test_vhsubq_s32(int32x4_t a, int32x4_t b) {
@@ -5210,10 +4744,7 @@ uint8x16_t test_vhsubq_u8(uint8x16_t a, uint8x16_t b) {
// CHECK-LABEL: define <8 x i16> @test_vhsubq_u16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
// CHECK-NEXT: [[VHSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vhsubu.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
-// CHECK-NEXT: [[VHSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VHSUBQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <8 x i16> [[VHSUBQ_V2_I]]
//
uint16x8_t test_vhsubq_u16(uint16x8_t a, uint16x8_t b) {
@@ -5223,10 +4754,7 @@ uint16x8_t test_vhsubq_u16(uint16x8_t a, uint16x8_t b) {
// CHECK-LABEL: define <4 x i32> @test_vhsubq_u32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
// CHECK-NEXT: [[VHSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vhsubu.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
-// CHECK-NEXT: [[VHSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VHSUBQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VHSUBQ_V2_I]]
//
uint32x4_t test_vhsubq_u32(uint32x4_t a, uint32x4_t b) {
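Every vhadd/vhsub hunk above makes the same deletions: the <8 x i8>/<16 x i8> bitcasts of the arguments and of the result were dead (the intrinsic call consumes the typed vectors and its result is returned as-is), so the regenerated checks stop matching them. A reduced sketch of what remains, under an assumed name:

#include <arm_neon.h>

// Hypothetical example; after this patch the checked IR is only the
// call to @llvm.arm.neon.vhadds.v4i16 and the ret.
int16x4_t halving_add(int16x4_t a, int16x4_t b) {
  return vhadd_s16(a, b);
}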
@@ -5236,7 +4764,7 @@ uint32x4_t test_vhsubq_u32(uint32x4_t a, uint32x4_t b) {
// CHECK-LABEL: define <16 x i8> @test_vld1q_u8(
// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VLD1:%.*]] = call <16 x i8> @llvm.arm.neon.vld1.v16i8.p0(ptr [[A]], i32 1)
+// CHECK-NEXT: [[VLD1:%.*]] = load <16 x i8>, ptr [[A]], align 1
// CHECK-NEXT: ret <16 x i8> [[VLD1]]
//
uint8x16_t test_vld1q_u8(uint8_t const * a) {
@@ -5246,7 +4774,7 @@ uint8x16_t test_vld1q_u8(uint8_t const * a) {
// CHECK-LABEL: define <8 x i16> @test_vld1q_u16(
// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VLD1:%.*]] = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0(ptr [[A]], i32 2)
+// CHECK-NEXT: [[VLD1:%.*]] = load <8 x i16>, ptr [[A]], align 2
// CHECK-NEXT: ret <8 x i16> [[VLD1]]
//
uint16x8_t test_vld1q_u16(uint16_t const * a) {
@@ -5256,7 +4784,7 @@ uint16x8_t test_vld1q_u16(uint16_t const * a) {
// CHECK-LABEL: define <4 x i32> @test_vld1q_u32(
// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VLD1:%.*]] = call <4 x i32> @llvm.arm.neon.vld1.v4i32.p0(ptr [[A]], i32 4)
+// CHECK-NEXT: [[VLD1:%.*]] = load <4 x i32>, ptr [[A]], align 4
// CHECK-NEXT: ret <4 x i32> [[VLD1]]
//
uint32x4_t test_vld1q_u32(uint32_t const * a) {
@@ -5266,7 +4794,7 @@ uint32x4_t test_vld1q_u32(uint32_t const * a) {
// CHECK-LABEL: define <2 x i64> @test_vld1q_u64(
// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VLD1:%.*]] = call <2 x i64> @llvm.arm.neon.vld1.v2i64.p0(ptr [[A]], i32 4)
+// CHECK-NEXT: [[VLD1:%.*]] = load <2 x i64>, ptr [[A]], align 4
// CHECK-NEXT: ret <2 x i64> [[VLD1]]
//
uint64x2_t test_vld1q_u64(uint64_t const * a) {
@@ -5276,7 +4804,7 @@ uint64x2_t test_vld1q_u64(uint64_t const * a) {
// CHECK-LABEL: define <16 x i8> @test_vld1q_s8(
// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VLD1:%.*]] = call <16 x i8> @llvm.arm.neon.vld1.v16i8.p0(ptr [[A]], i32 1)
+// CHECK-NEXT: [[VLD1:%.*]] = load <16 x i8>, ptr [[A]], align 1
// CHECK-NEXT: ret <16 x i8> [[VLD1]]
//
int8x16_t test_vld1q_s8(int8_t const * a) {
@@ -5286,7 +4814,7 @@ int8x16_t test_vld1q_s8(int8_t const * a) {
// CHECK-LABEL: define <8 x i16> @test_vld1q_s16(
// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VLD1:%.*]] = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0(ptr [[A]], i32 2)
+// CHECK-NEXT: [[VLD1:%.*]] = load <8 x i16>, ptr [[A]], align 2
// CHECK-NEXT: ret <8 x i16> [[VLD1]]
//
int16x8_t test_vld1q_s16(int16_t const * a) {
@@ -5296,7 +4824,7 @@ int16x8_t test_vld1q_s16(int16_t const * a) {
// CHECK-LABEL: define <4 x i32> @test_vld1q_s32(
// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VLD1:%.*]] = call <4 x i32> @llvm.arm.neon.vld1.v4i32.p0(ptr [[A]], i32 4)
+// CHECK-NEXT: [[VLD1:%.*]] = load <4 x i32>, ptr [[A]], align 4
// CHECK-NEXT: ret <4 x i32> [[VLD1]]
//
int32x4_t test_vld1q_s32(int32_t const * a) {
@@ -5306,7 +4834,7 @@ int32x4_t test_vld1q_s32(int32_t const * a) {
// CHECK-LABEL: define <2 x i64> @test_vld1q_s64(
// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VLD1:%.*]] = call <2 x i64> @llvm.arm.neon.vld1.v2i64.p0(ptr [[A]], i32 4)
+// CHECK-NEXT: [[VLD1:%.*]] = load <2 x i64>, ptr [[A]], align 4
// CHECK-NEXT: ret <2 x i64> [[VLD1]]
//
int64x2_t test_vld1q_s64(int64_t const * a) {
@@ -5316,7 +4844,7 @@ int64x2_t test_vld1q_s64(int64_t const * a) {
// CHECK-LABEL: define <8 x half> @test_vld1q_f16(
// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VLD1:%.*]] = call <8 x half> @llvm.arm.neon.vld1.v8f16.p0(ptr [[A]], i32 2)
+// CHECK-NEXT: [[VLD1:%.*]] = load <8 x half>, ptr [[A]], align 2
// CHECK-NEXT: ret <8 x half> [[VLD1]]
//
float16x8_t test_vld1q_f16(float16_t const * a) {
@@ -5326,7 +4854,7 @@ float16x8_t test_vld1q_f16(float16_t const * a) {
// CHECK-LABEL: define <4 x float> @test_vld1q_f32(
// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VLD1:%.*]] = call <4 x float> @llvm.arm.neon.vld1.v4f32.p0(ptr [[A]], i32 4)
+// CHECK-NEXT: [[VLD1:%.*]] = load <4 x float>, ptr [[A]], align 4
// CHECK-NEXT: ret <4 x float> [[VLD1]]
//
float32x4_t test_vld1q_f32(float32_t const * a) {
@@ -5336,7 +4864,7 @@ float32x4_t test_vld1q_f32(float32_t const * a) {
// CHECK-LABEL: define <16 x i8> @test_vld1q_p8(
// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VLD1:%.*]] = call <16 x i8> @llvm.arm.neon.vld1.v16i8.p0(ptr [[A]], i32 1)
+// CHECK-NEXT: [[VLD1:%.*]] = load <16 x i8>, ptr [[A]], align 1
// CHECK-NEXT: ret <16 x i8> [[VLD1]]
//
poly8x16_t test_vld1q_p8(poly8_t const * a) {
@@ -5346,7 +4874,7 @@ poly8x16_t test_vld1q_p8(poly8_t const * a) {
// CHECK-LABEL: define <8 x i16> @test_vld1q_p16(
// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VLD1:%.*]] = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0(ptr [[A]], i32 2)
+// CHECK-NEXT: [[VLD1:%.*]] = load <8 x i16>, ptr [[A]], align 2
// CHECK-NEXT: ret <8 x i16> [[VLD1]]
//
poly16x8_t test_vld1q_p16(poly16_t const * a) {
@@ -5356,7 +4884,7 @@ poly16x8_t test_vld1q_p16(poly16_t const * a) {
// CHECK-LABEL: define <8 x i8> @test_vld1_u8(
// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VLD1:%.*]] = call <8 x i8> @llvm.arm.neon.vld1.v8i8.p0(ptr [[A]], i32 1)
+// CHECK-NEXT: [[VLD1:%.*]] = load <8 x i8>, ptr [[A]], align 1
// CHECK-NEXT: ret <8 x i8> [[VLD1]]
//
uint8x8_t test_vld1_u8(uint8_t const * a) {
@@ -5366,7 +4894,7 @@ uint8x8_t test_vld1_u8(uint8_t const * a) {
// CHECK-LABEL: define <4 x i16> @test_vld1_u16(
// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VLD1:%.*]] = call <4 x i16> @llvm.arm.neon.vld1.v4i16.p0(ptr [[A]], i32 2)
+// CHECK-NEXT: [[VLD1:%.*]] = load <4 x i16>, ptr [[A]], align 2
// CHECK-NEXT: ret <4 x i16> [[VLD1]]
//
uint16x4_t test_vld1_u16(uint16_t const * a) {
@@ -5376,7 +4904,7 @@ uint16x4_t test_vld1_u16(uint16_t const * a) {
// CHECK-LABEL: define <2 x i32> @test_vld1_u32(
// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VLD1:%.*]] = call <2 x i32> @llvm.arm.neon.vld1.v2i32.p0(ptr [[A]], i32 4)
+// CHECK-NEXT: [[VLD1:%.*]] = load <2 x i32>, ptr [[A]], align 4
// CHECK-NEXT: ret <2 x i32> [[VLD1]]
//
uint32x2_t test_vld1_u32(uint32_t const * a) {
@@ -5386,7 +4914,7 @@ uint32x2_t test_vld1_u32(uint32_t const * a) {
// CHECK-LABEL: define <1 x i64> @test_vld1_u64(
// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VLD1:%.*]] = call <1 x i64> @llvm.arm.neon.vld1.v1i64.p0(ptr [[A]], i32 4)
+// CHECK-NEXT: [[VLD1:%.*]] = load <1 x i64>, ptr [[A]], align 4
// CHECK-NEXT: ret <1 x i64> [[VLD1]]
//
uint64x1_t test_vld1_u64(uint64_t const * a) {
@@ -5396,7 +4924,7 @@ uint64x1_t test_vld1_u64(uint64_t const * a) {
// CHECK-LABEL: define <8 x i8> @test_vld1_s8(
// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VLD1:%.*]] = call <8 x i8> @llvm.arm.neon.vld1.v8i8.p0(ptr [[A]], i32 1)
+// CHECK-NEXT: [[VLD1:%.*]] = load <8 x i8>, ptr [[A]], align 1
// CHECK-NEXT: ret <8 x i8> [[VLD1]]
//
int8x8_t test_vld1_s8(int8_t const * a) {
@@ -5406,7 +4934,7 @@ int8x8_t test_vld1_s8(int8_t const * a) {
// CHECK-LABEL: define <4 x i16> @test_vld1_s16(
// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VLD1:%.*]] = call <4 x i16> @llvm.arm.neon.vld1.v4i16.p0(ptr [[A]], i32 2)
+// CHECK-NEXT: [[VLD1:%.*]] = load <4 x i16>, ptr [[A]], align 2
// CHECK-NEXT: ret <4 x i16> [[VLD1]]
//
int16x4_t test_vld1_s16(int16_t const * a) {
@@ -5416,7 +4944,7 @@ int16x4_t test_vld1_s16(int16_t const * a) {
// CHECK-LABEL: define <2 x i32> @test_vld1_s32(
// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VLD1:%.*]] = call <2 x i32> @llvm.arm.neon.vld1.v2i32.p0(ptr [[A]], i32 4)
+// CHECK-NEXT: [[VLD1:%.*]] = load <2 x i32>, ptr [[A]], align 4
// CHECK-NEXT: ret <2 x i32> [[VLD1]]
//
int32x2_t test_vld1_s32(int32_t const * a) {
@@ -5426,7 +4954,7 @@ int32x2_t test_vld1_s32(int32_t const * a) {
// CHECK-LABEL: define <1 x i64> @test_vld1_s64(
// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VLD1:%.*]] = call <1 x i64> @llvm.arm.neon.vld1.v1i64.p0(ptr [[A]], i32 4)
+// CHECK-NEXT: [[VLD1:%.*]] = load <1 x i64>, ptr [[A]], align 4
// CHECK-NEXT: ret <1 x i64> [[VLD1]]
//
int64x1_t test_vld1_s64(int64_t const * a) {
@@ -5436,7 +4964,7 @@ int64x1_t test_vld1_s64(int64_t const * a) {
// CHECK-LABEL: define <4 x half> @test_vld1_f16(
// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VLD1:%.*]] = call <4 x half> @llvm.arm.neon.vld1.v4f16.p0(ptr [[A]], i32 2)
+// CHECK-NEXT: [[VLD1:%.*]] = load <4 x half>, ptr [[A]], align 2
// CHECK-NEXT: ret <4 x half> [[VLD1]]
//
float16x4_t test_vld1_f16(float16_t const * a) {
@@ -5446,7 +4974,7 @@ float16x4_t test_vld1_f16(float16_t const * a) {
// CHECK-LABEL: define <2 x float> @test_vld1_f32(
// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VLD1:%.*]] = call <2 x float> @llvm.arm.neon.vld1.v2f32.p0(ptr [[A]], i32 4)
+// CHECK-NEXT: [[VLD1:%.*]] = load <2 x float>, ptr [[A]], align 4
// CHECK-NEXT: ret <2 x float> [[VLD1]]
//
float32x2_t test_vld1_f32(float32_t const * a) {
@@ -5456,7 +4984,7 @@ float32x2_t test_vld1_f32(float32_t const * a) {
// CHECK-LABEL: define <8 x i8> @test_vld1_p8(
// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VLD1:%.*]] = call <8 x i8> @llvm.arm.neon.vld1.v8i8.p0(ptr [[A]], i32 1)
+// CHECK-NEXT: [[VLD1:%.*]] = load <8 x i8>, ptr [[A]], align 1
// CHECK-NEXT: ret <8 x i8> [[VLD1]]
//
poly8x8_t test_vld1_p8(poly8_t const * a) {
@@ -5466,7 +4994,7 @@ poly8x8_t test_vld1_p8(poly8_t const * a) {
// CHECK-LABEL: define <4 x i16> @test_vld1_p16(
// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VLD1:%.*]] = call <4 x i16> @llvm.arm.neon.vld1.v4i16.p0(ptr [[A]], i32 2)
+// CHECK-NEXT: [[VLD1:%.*]] = load <4 x i16>, ptr [[A]], align 2
// CHECK-NEXT: ret <4 x i16> [[VLD1]]
//
poly16x4_t test_vld1_p16(poly16_t const * a) {
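All of the vld1/vld1q hunks above replace the @llvm.arm.neon.vld1.* calls with ordinary IR loads: the intrinsic's alignment operand matched the natural element alignment, so a plain `load <N x T>, ptr %a, align <elt>` expresses the same access. A sketch, with an assumed name:

#include <arm_neon.h>

// Hypothetical example mirroring test_vld1q_u32; the body is now
//   %v = load <4 x i32>, ptr %p, align 4
// rather than a call to @llvm.arm.neon.vld1.v4i32.p0(ptr %p, i32 4).
uint32x4_t load_four(const uint32_t *p) {
  return vld1q_u32(p);
}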
@@ -5477,8 +5005,8 @@ poly16x4_t test_vld1_p16(poly16_t const * a) {
// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1
-// CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x i8> poison, i8 [[TMP0]], i32 0
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x i8> poison, i8 [[TMP0]], i64 0
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> poison, <16 x i32> zeroinitializer
// CHECK-NEXT: ret <16 x i8> [[LANE]]
//
uint8x16_t test_vld1q_dup_u8(uint8_t const * a) {
@@ -5489,8 +5017,8 @@ uint8x16_t test_vld1q_dup_u8(uint8_t const * a) {
// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A]], align 2
-// CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i16> poison, i16 [[TMP0]], i32 0
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i16> poison, i16 [[TMP0]], i64 0
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> zeroinitializer
// CHECK-NEXT: ret <8 x i16> [[LANE]]
//
uint16x8_t test_vld1q_dup_u16(uint16_t const * a) {
@@ -5501,8 +5029,8 @@ uint16x8_t test_vld1q_dup_u16(uint16_t const * a) {
// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A]], align 4
-// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i32 0
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i64 0
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: ret <4 x i32> [[LANE]]
//
uint32x4_t test_vld1q_dup_u32(uint32_t const * a) {
@@ -5513,8 +5041,8 @@ uint32x4_t test_vld1q_dup_u32(uint32_t const * a) {
// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A]], align 4
-// CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> poison, i64 [[TMP0]], i32 0
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> poison, i64 [[TMP0]], i64 0
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: ret <2 x i64> [[LANE]]
//
uint64x2_t test_vld1q_dup_u64(uint64_t const * a) {
@@ -5525,8 +5053,8 @@ uint64x2_t test_vld1q_dup_u64(uint64_t const * a) {
// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1
-// CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x i8> poison, i8 [[TMP0]], i32 0
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x i8> poison, i8 [[TMP0]], i64 0
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> poison, <16 x i32> zeroinitializer
// CHECK-NEXT: ret <16 x i8> [[LANE]]
//
int8x16_t test_vld1q_dup_s8(int8_t const * a) {
@@ -5537,8 +5065,8 @@ int8x16_t test_vld1q_dup_s8(int8_t const * a) {
// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A]], align 2
-// CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i16> poison, i16 [[TMP0]], i32 0
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i16> poison, i16 [[TMP0]], i64 0
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> zeroinitializer
// CHECK-NEXT: ret <8 x i16> [[LANE]]
//
int16x8_t test_vld1q_dup_s16(int16_t const * a) {
@@ -5549,8 +5077,8 @@ int16x8_t test_vld1q_dup_s16(int16_t const * a) {
// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A]], align 4
-// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i32 0
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i64 0
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: ret <4 x i32> [[LANE]]
//
int32x4_t test_vld1q_dup_s32(int32_t const * a) {
@@ -5561,8 +5089,8 @@ int32x4_t test_vld1q_dup_s32(int32_t const * a) {
// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A]], align 4
-// CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> poison, i64 [[TMP0]], i32 0
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> poison, i64 [[TMP0]], i64 0
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: ret <2 x i64> [[LANE]]
//
int64x2_t test_vld1q_dup_s64(int64_t const * a) {
@@ -5573,8 +5101,8 @@ int64x2_t test_vld1q_dup_s64(int64_t const * a) {
// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[TMP0:%.*]] = load half, ptr [[A]], align 2
-// CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x half> poison, half [[TMP0]], i32 0
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x half> poison, half [[TMP0]], i64 0
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> poison, <8 x i32> zeroinitializer
// CHECK-NEXT: ret <8 x half> [[LANE]]
//
float16x8_t test_vld1q_dup_f16(float16_t const * a) {
@@ -5585,8 +5113,8 @@ float16x8_t test_vld1q_dup_f16(float16_t const * a) {
// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[A]], align 4
-// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i32 0
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i64 0
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: ret <4 x float> [[LANE]]
//
float32x4_t test_vld1q_dup_f32(float32_t const * a) {
@@ -5597,8 +5125,8 @@ float32x4_t test_vld1q_dup_f32(float32_t const * a) {
// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1
-// CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x i8> poison, i8 [[TMP0]], i32 0
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x i8> poison, i8 [[TMP0]], i64 0
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> poison, <16 x i32> zeroinitializer
// CHECK-NEXT: ret <16 x i8> [[LANE]]
//
poly8x16_t test_vld1q_dup_p8(poly8_t const * a) {
@@ -5609,8 +5137,8 @@ poly8x16_t test_vld1q_dup_p8(poly8_t const * a) {
// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A]], align 2
-// CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i16> poison, i16 [[TMP0]], i32 0
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i16> poison, i16 [[TMP0]], i64 0
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> zeroinitializer
// CHECK-NEXT: ret <8 x i16> [[LANE]]
//
poly16x8_t test_vld1q_dup_p16(poly16_t const * a) {
@@ -5621,8 +5149,8 @@ poly16x8_t test_vld1q_dup_p16(poly16_t const * a) {
// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1
-// CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[TMP0]], i32 0
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[TMP0]], i64 0
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> poison, <8 x i32> zeroinitializer
// CHECK-NEXT: ret <8 x i8> [[LANE]]
//
uint8x8_t test_vld1_dup_u8(uint8_t const * a) {
@@ -5633,8 +5161,8 @@ uint8x8_t test_vld1_dup_u8(uint8_t const * a) {
// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A]], align 2
-// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[TMP0]], i32 0
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[TMP0]], i64 0
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: ret <4 x i16> [[LANE]]
//
uint16x4_t test_vld1_dup_u16(uint16_t const * a) {
@@ -5645,8 +5173,8 @@ uint16x4_t test_vld1_dup_u16(uint16_t const * a) {
// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A]], align 4
-// CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i32 0
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i64 0
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: ret <2 x i32> [[LANE]]
//
uint32x2_t test_vld1_dup_u32(uint32_t const * a) {
@@ -5657,9 +5185,8 @@ uint32x2_t test_vld1_dup_u32(uint32_t const * a) {
// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A]], align 4
-// CHECK-NEXT: [[TMP1:%.*]] = insertelement <1 x i64> poison, i64 [[TMP0]], i32 0
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP1]], <1 x i64> [[TMP1]], <1 x i32> zeroinitializer
-// CHECK-NEXT: ret <1 x i64> [[LANE]]
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <1 x i64> poison, i64 [[TMP0]], i64 0
+// CHECK-NEXT: ret <1 x i64> [[TMP1]]
//
uint64x1_t test_vld1_dup_u64(uint64_t const * a) {
return vld1_dup_u64(a);
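The vld1_dup hunks keep the scalar load plus insertelement/shufflevector splat (now with an i64 index and a poison second operand); for the one-element <1 x i64> results the splat shuffle is a no-op, so the insertelement is returned directly. A sketch, with an assumed name:

#include <arm_neon.h>

// Hypothetical example; a <1 x i64> "splat" needs no shuffle:
//   %s = load i64, ptr %p, align 4
//   %v = insertelement <1 x i64> poison, i64 %s, i64 0
//   ret <1 x i64> %v
uint64x1_t dup_one(const uint64_t *p) {
  return vld1_dup_u64(p);
}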
@@ -5669,8 +5196,8 @@ uint64x1_t test_vld1_dup_u64(uint64_t const * a) {
// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1
-// CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[TMP0]], i32 0
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[TMP0]], i64 0
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> poison, <8 x i32> zeroinitializer
// CHECK-NEXT: ret <8 x i8> [[LANE]]
//
int8x8_t test_vld1_dup_s8(int8_t const * a) {
@@ -5681,8 +5208,8 @@ int8x8_t test_vld1_dup_s8(int8_t const * a) {
// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A]], align 2
-// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[TMP0]], i32 0
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[TMP0]], i64 0
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: ret <4 x i16> [[LANE]]
//
int16x4_t test_vld1_dup_s16(int16_t const * a) {
@@ -5693,8 +5220,8 @@ int16x4_t test_vld1_dup_s16(int16_t const * a) {
// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A]], align 4
-// CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i32 0
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i64 0
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: ret <2 x i32> [[LANE]]
//
int32x2_t test_vld1_dup_s32(int32_t const * a) {
@@ -5705,9 +5232,8 @@ int32x2_t test_vld1_dup_s32(int32_t const * a) {
// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A]], align 4
-// CHECK-NEXT: [[TMP1:%.*]] = insertelement <1 x i64> poison, i64 [[TMP0]], i32 0
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP1]], <1 x i64> [[TMP1]], <1 x i32> zeroinitializer
-// CHECK-NEXT: ret <1 x i64> [[LANE]]
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <1 x i64> poison, i64 [[TMP0]], i64 0
+// CHECK-NEXT: ret <1 x i64> [[TMP1]]
//
int64x1_t test_vld1_dup_s64(int64_t const * a) {
return vld1_dup_s64(a);
@@ -5717,8 +5243,8 @@ int64x1_t test_vld1_dup_s64(int64_t const * a) {
// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[TMP0:%.*]] = load half, ptr [[A]], align 2
-// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x half> poison, half [[TMP0]], i32 0
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x half> poison, half [[TMP0]], i64 0
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: ret <4 x half> [[LANE]]
//
float16x4_t test_vld1_dup_f16(float16_t const * a) {
@@ -5729,8 +5255,8 @@ float16x4_t test_vld1_dup_f16(float16_t const * a) {
// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[A]], align 4
-// CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i64 0
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: ret <2 x float> [[LANE]]
//
float32x2_t test_vld1_dup_f32(float32_t const * a) {
@@ -5741,8 +5267,8 @@ float32x2_t test_vld1_dup_f32(float32_t const * a) {
// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1
-// CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[TMP0]], i32 0
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[TMP0]], i64 0
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> poison, <8 x i32> zeroinitializer
// CHECK-NEXT: ret <8 x i8> [[LANE]]
//
poly8x8_t test_vld1_dup_p8(poly8_t const * a) {
@@ -5753,8 +5279,8 @@ poly8x8_t test_vld1_dup_p8(poly8_t const * a) {
// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A]], align 2
-// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[TMP0]], i32 0
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[TMP0]], i64 0
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: ret <4 x i16> [[LANE]]
//
poly16x4_t test_vld1_dup_p16(poly16_t const * a) {
@@ -5765,7 +5291,7 @@ poly16x4_t test_vld1_dup_p16(poly16_t const * a) {
// CHECK-SAME: ptr noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1
-// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <16 x i8> [[B]], i8 [[TMP0]], i32 15
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <16 x i8> [[B]], i8 [[TMP0]], i64 15
// CHECK-NEXT: ret <16 x i8> [[VLD1_LANE]]
//
uint8x16_t test_vld1q_lane_u8(uint8_t const * a, uint8x16_t b) {
@@ -5775,10 +5301,8 @@ uint8x16_t test_vld1q_lane_u8(uint8_t const * a, uint8x16_t b) {
// CHECK-LABEL: define <8 x i16> @test_vld1q_lane_u16(
// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr [[A]], align 2
-// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP1]], i16 [[TMP2]], i32 7
+// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A]], align 2
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[B]], i16 [[TMP0]], i64 7
// CHECK-NEXT: ret <8 x i16> [[VLD1_LANE]]
//
uint16x8_t test_vld1q_lane_u16(uint16_t const * a, uint16x8_t b) {
@@ -5788,10 +5312,8 @@ uint16x8_t test_vld1q_lane_u16(uint16_t const * a, uint16x8_t b) {
// CHECK-LABEL: define <4 x i32> @test_vld1q_lane_u32(
// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[A]], align 4
-// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[TMP2]], i32 3
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <4 x i32> [[B]], i32 [[TMP0]], i64 3
// CHECK-NEXT: ret <4 x i32> [[VLD1_LANE]]
//
uint32x4_t test_vld1q_lane_u32(uint32_t const * a, uint32x4_t b) {
@@ -5801,11 +5323,9 @@ uint32x4_t test_vld1q_lane_u32(uint32_t const * a, uint32x4_t b) {
// CHECK-LABEL: define <2 x i64> @test_vld1q_lane_u64(
// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP1]], <1 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP3:%.*]] = call <1 x i64> @llvm.arm.neon.vld1.v1i64.p0(ptr [[A]], i32 4)
-// CHECK-NEXT: [[VLD1Q_LANE:%.*]] = shufflevector <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <2 x i32> <i32 0, i32 1>
+// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <2 x i64> [[B]], <2 x i64> poison, <1 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr [[A]], align 4
+// CHECK-NEXT: [[VLD1Q_LANE:%.*]] = shufflevector <1 x i64> [[TMP0]], <1 x i64> [[TMP1]], <2 x i32> <i32 0, i32 1>
// CHECK-NEXT: ret <2 x i64> [[VLD1Q_LANE]]
//
uint64x2_t test_vld1q_lane_u64(uint64_t const * a, uint64x2_t b) {
@@ -5816,7 +5336,7 @@ uint64x2_t test_vld1q_lane_u64(uint64_t const * a, uint64x2_t b) {
// CHECK-SAME: ptr noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1
-// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <16 x i8> [[B]], i8 [[TMP0]], i32 15
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <16 x i8> [[B]], i8 [[TMP0]], i64 15
// CHECK-NEXT: ret <16 x i8> [[VLD1_LANE]]
//
int8x16_t test_vld1q_lane_s8(int8_t const * a, int8x16_t b) {
@@ -5826,10 +5346,8 @@ int8x16_t test_vld1q_lane_s8(int8_t const * a, int8x16_t b) {
// CHECK-LABEL: define <8 x i16> @test_vld1q_lane_s16(
// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr [[A]], align 2
-// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP1]], i16 [[TMP2]], i32 7
+// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A]], align 2
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[B]], i16 [[TMP0]], i64 7
// CHECK-NEXT: ret <8 x i16> [[VLD1_LANE]]
//
int16x8_t test_vld1q_lane_s16(int16_t const * a, int16x8_t b) {
@@ -5839,10 +5357,8 @@ int16x8_t test_vld1q_lane_s16(int16_t const * a, int16x8_t b) {
// CHECK-LABEL: define <4 x i32> @test_vld1q_lane_s32(
// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[A]], align 4
-// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[TMP2]], i32 3
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <4 x i32> [[B]], i32 [[TMP0]], i64 3
// CHECK-NEXT: ret <4 x i32> [[VLD1_LANE]]
//
int32x4_t test_vld1q_lane_s32(int32_t const * a, int32x4_t b) {
@@ -5852,11 +5368,9 @@ int32x4_t test_vld1q_lane_s32(int32_t const * a, int32x4_t b) {
// CHECK-LABEL: define <2 x i64> @test_vld1q_lane_s64(
// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP1]], <1 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP3:%.*]] = call <1 x i64> @llvm.arm.neon.vld1.v1i64.p0(ptr [[A]], i32 4)
-// CHECK-NEXT: [[VLD1Q_LANE:%.*]] = shufflevector <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <2 x i32> <i32 0, i32 1>
+// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <2 x i64> [[B]], <2 x i64> poison, <1 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr [[A]], align 4
+// CHECK-NEXT: [[VLD1Q_LANE:%.*]] = shufflevector <1 x i64> [[TMP0]], <1 x i64> [[TMP1]], <2 x i32> <i32 0, i32 1>
// CHECK-NEXT: ret <2 x i64> [[VLD1Q_LANE]]
//
int64x2_t test_vld1q_lane_s64(int64_t const * a, int64x2_t b) {
@@ -5866,10 +5380,8 @@ int64x2_t test_vld1q_lane_s64(int64_t const * a, int64x2_t b) {
// CHECK-LABEL: define <8 x half> @test_vld1q_lane_f16(
// CHECK-SAME: ptr noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
-// CHECK-NEXT: [[TMP2:%.*]] = load half, ptr [[A]], align 2
-// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <8 x half> [[TMP1]], half [[TMP2]], i32 7
+// CHECK-NEXT: [[TMP0:%.*]] = load half, ptr [[A]], align 2
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <8 x half> [[B]], half [[TMP0]], i64 7
// CHECK-NEXT: ret <8 x half> [[VLD1_LANE]]
//
float16x8_t test_vld1q_lane_f16(float16_t const * a, float16x8_t b) {
@@ -5879,10 +5391,8 @@ float16x8_t test_vld1q_lane_f16(float16_t const * a, float16x8_t b) {
// CHECK-LABEL: define <4 x float> @test_vld1q_lane_f32(
// CHECK-SAME: ptr noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[B]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[A]], align 4
-// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <4 x float> [[TMP1]], float [[TMP2]], i32 3
+// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[A]], align 4
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <4 x float> [[B]], float [[TMP0]], i64 3
// CHECK-NEXT: ret <4 x float> [[VLD1_LANE]]
//
float32x4_t test_vld1q_lane_f32(float32_t const * a, float32x4_t b) {
@@ -5893,7 +5403,7 @@ float32x4_t test_vld1q_lane_f32(float32_t const * a, float32x4_t b) {
// CHECK-SAME: ptr noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1
-// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <16 x i8> [[B]], i8 [[TMP0]], i32 15
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <16 x i8> [[B]], i8 [[TMP0]], i64 15
// CHECK-NEXT: ret <16 x i8> [[VLD1_LANE]]
//
poly8x16_t test_vld1q_lane_p8(poly8_t const * a, poly8x16_t b) {
@@ -5903,10 +5413,8 @@ poly8x16_t test_vld1q_lane_p8(poly8_t const * a, poly8x16_t b) {
// CHECK-LABEL: define <8 x i16> @test_vld1q_lane_p16(
// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr [[A]], align 2
-// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP1]], i16 [[TMP2]], i32 7
+// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A]], align 2
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[B]], i16 [[TMP0]], i64 7
// CHECK-NEXT: ret <8 x i16> [[VLD1_LANE]]
//
poly16x8_t test_vld1q_lane_p16(poly16_t const * a, poly16x8_t b) {
@@ -5917,7 +5425,7 @@ poly16x8_t test_vld1q_lane_p16(poly16_t const * a, poly16x8_t b) {
// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1
-// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <8 x i8> [[B]], i8 [[TMP0]], i32 7
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <8 x i8> [[B]], i8 [[TMP0]], i64 7
// CHECK-NEXT: ret <8 x i8> [[VLD1_LANE]]
//
uint8x8_t test_vld1_lane_u8(uint8_t const * a, uint8x8_t b) {
@@ -5927,10 +5435,8 @@ uint8x8_t test_vld1_lane_u8(uint8_t const * a, uint8x8_t b) {
// CHECK-LABEL: define <4 x i16> @test_vld1_lane_u16(
// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr [[A]], align 2
-// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP1]], i16 [[TMP2]], i32 3
+// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A]], align 2
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[B]], i16 [[TMP0]], i64 3
// CHECK-NEXT: ret <4 x i16> [[VLD1_LANE]]
//
uint16x4_t test_vld1_lane_u16(uint16_t const * a, uint16x4_t b) {
@@ -5940,10 +5446,8 @@ uint16x4_t test_vld1_lane_u16(uint16_t const * a, uint16x4_t b) {
// CHECK-LABEL: define <2 x i32> @test_vld1_lane_u32(
// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[A]], align 4
-// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[TMP2]], i32 1
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <2 x i32> [[B]], i32 [[TMP0]], i64 1
// CHECK-NEXT: ret <2 x i32> [[VLD1_LANE]]
//
uint32x2_t test_vld1_lane_u32(uint32_t const * a, uint32x2_t b) {
@@ -5953,10 +5457,8 @@ uint32x2_t test_vld1_lane_u32(uint32_t const * a, uint32x2_t b) {
// CHECK-LABEL: define <1 x i64> @test_vld1_lane_u64(
// CHECK-SAME: ptr noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr [[A]], align 4
-// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <1 x i64> [[TMP1]], i64 [[TMP2]], i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A]], align 4
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <1 x i64> poison, i64 [[TMP0]], i64 0
// CHECK-NEXT: ret <1 x i64> [[VLD1_LANE]]
//
uint64x1_t test_vld1_lane_u64(uint64_t const * a, uint64x1_t b) {
@@ -5967,7 +5469,7 @@ uint64x1_t test_vld1_lane_u64(uint64_t const * a, uint64x1_t b) {
// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1
-// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <8 x i8> [[B]], i8 [[TMP0]], i32 7
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <8 x i8> [[B]], i8 [[TMP0]], i64 7
// CHECK-NEXT: ret <8 x i8> [[VLD1_LANE]]
//
int8x8_t test_vld1_lane_s8(int8_t const * a, int8x8_t b) {
@@ -5977,10 +5479,8 @@ int8x8_t test_vld1_lane_s8(int8_t const * a, int8x8_t b) {
// CHECK-LABEL: define <4 x i16> @test_vld1_lane_s16(
// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr [[A]], align 2
-// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP1]], i16 [[TMP2]], i32 3
+// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A]], align 2
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[B]], i16 [[TMP0]], i64 3
// CHECK-NEXT: ret <4 x i16> [[VLD1_LANE]]
//
int16x4_t test_vld1_lane_s16(int16_t const * a, int16x4_t b) {
@@ -5990,10 +5490,8 @@ int16x4_t test_vld1_lane_s16(int16_t const * a, int16x4_t b) {
// CHECK-LABEL: define <2 x i32> @test_vld1_lane_s32(
// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[A]], align 4
-// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[TMP2]], i32 1
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <2 x i32> [[B]], i32 [[TMP0]], i64 1
// CHECK-NEXT: ret <2 x i32> [[VLD1_LANE]]
//
int32x2_t test_vld1_lane_s32(int32_t const * a, int32x2_t b) {
@@ -6003,10 +5501,8 @@ int32x2_t test_vld1_lane_s32(int32_t const * a, int32x2_t b) {
// CHECK-LABEL: define <1 x i64> @test_vld1_lane_s64(
// CHECK-SAME: ptr noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr [[A]], align 4
-// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <1 x i64> [[TMP1]], i64 [[TMP2]], i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A]], align 4
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <1 x i64> poison, i64 [[TMP0]], i64 0
// CHECK-NEXT: ret <1 x i64> [[VLD1_LANE]]
//
int64x1_t test_vld1_lane_s64(int64_t const * a, int64x1_t b) {
@@ -6016,10 +5512,8 @@ int64x1_t test_vld1_lane_s64(int64_t const * a, int64x1_t b) {
// CHECK-LABEL: define <4 x half> @test_vld1_lane_f16(
// CHECK-SAME: ptr noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
-// CHECK-NEXT: [[TMP2:%.*]] = load half, ptr [[A]], align 2
-// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <4 x half> [[TMP1]], half [[TMP2]], i32 3
+// CHECK-NEXT: [[TMP0:%.*]] = load half, ptr [[A]], align 2
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <4 x half> [[B]], half [[TMP0]], i64 3
// CHECK-NEXT: ret <4 x half> [[VLD1_LANE]]
//
float16x4_t test_vld1_lane_f16(float16_t const * a, float16x4_t b) {
@@ -6029,10 +5523,8 @@ float16x4_t test_vld1_lane_f16(float16_t const * a, float16x4_t b) {
// CHECK-LABEL: define <2 x float> @test_vld1_lane_f32(
// CHECK-SAME: ptr noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[A]], align 4
-// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <2 x float> [[TMP1]], float [[TMP2]], i32 1
+// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[A]], align 4
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <2 x float> [[B]], float [[TMP0]], i64 1
// CHECK-NEXT: ret <2 x float> [[VLD1_LANE]]
//
float32x2_t test_vld1_lane_f32(float32_t const * a, float32x2_t b) {
@@ -6043,7 +5535,7 @@ float32x2_t test_vld1_lane_f32(float32_t const * a, float32x2_t b) {
// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1
-// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <8 x i8> [[B]], i8 [[TMP0]], i32 7
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <8 x i8> [[B]], i8 [[TMP0]], i64 7
// CHECK-NEXT: ret <8 x i8> [[VLD1_LANE]]
//
poly8x8_t test_vld1_lane_p8(poly8_t const * a, poly8x8_t b) {
@@ -6053,10 +5545,8 @@ poly8x8_t test_vld1_lane_p8(poly8_t const * a, poly8x8_t b) {
// CHECK-LABEL: define <4 x i16> @test_vld1_lane_p16(
// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr [[A]], align 2
-// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP1]], i16 [[TMP2]], i32 3
+// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A]], align 2
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[B]], i16 [[TMP0]], i64 3
// CHECK-NEXT: ret <4 x i16> [[VLD1_LANE]]
//
poly16x4_t test_vld1_lane_p16(poly16_t const * a, poly16x4_t b) {
@@ -6066,10 +5556,12 @@ poly16x4_t test_vld1_lane_p16(poly16_t const * a, poly16x4_t b) {
// CHECK-LABEL: define void @test_vld2q_u8(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT8X16X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT8X16X2_T]], align 16
// CHECK-NEXT: [[VLD2Q_V:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2.v16i8.p0(ptr [[A]], i32 1)
-// CHECK-NEXT: store { <16 x i8>, <16 x i8> } [[VLD2Q_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 32, i1 false)
+// CHECK-NEXT: [[VLD2Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[VLD2Q_V]], 0
+// CHECK-NEXT: [[VLD2Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[VLD2Q_V]], 1
+// CHECK-NEXT: store <16 x i8> [[VLD2Q_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <16 x i8> [[VLD2Q_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16
// CHECK-NEXT: ret void
//
uint8x16x2_t test_vld2q_u8(uint8_t const * a) {
@@ -6079,10 +5571,12 @@ uint8x16x2_t test_vld2q_u8(uint8_t const * a) {
// CHECK-LABEL: define void @test_vld2q_u16(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT16X8X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT16X8X2_T]], align 16
// CHECK-NEXT: [[VLD2Q_V:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2.v8i16.p0(ptr [[A]], i32 2)
-// CHECK-NEXT: store { <8 x i16>, <8 x i16> } [[VLD2Q_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 32, i1 false)
+// CHECK-NEXT: [[VLD2Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[VLD2Q_V]], 0
+// CHECK-NEXT: [[VLD2Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[VLD2Q_V]], 1
+// CHECK-NEXT: store <8 x i16> [[VLD2Q_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <8 x i16> [[VLD2Q_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16
// CHECK-NEXT: ret void
//
uint16x8x2_t test_vld2q_u16(uint16_t const * a) {
@@ -6092,10 +5586,12 @@ uint16x8x2_t test_vld2q_u16(uint16_t const * a) {
// CHECK-LABEL: define void @test_vld2q_u32(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT32X4X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT32X4X2_T]], align 16
// CHECK-NEXT: [[VLD2Q_V:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0(ptr [[A]], i32 4)
-// CHECK-NEXT: store { <4 x i32>, <4 x i32> } [[VLD2Q_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 32, i1 false)
+// CHECK-NEXT: [[VLD2Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLD2Q_V]], 0
+// CHECK-NEXT: [[VLD2Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLD2Q_V]], 1
+// CHECK-NEXT: store <4 x i32> [[VLD2Q_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <4 x i32> [[VLD2Q_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16
// CHECK-NEXT: ret void
//
uint32x4x2_t test_vld2q_u32(uint32_t const * a) {
@@ -6105,10 +5601,12 @@ uint32x4x2_t test_vld2q_u32(uint32_t const * a) {
// CHECK-LABEL: define void @test_vld2q_s8(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT8X16X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT8X16X2_T]], align 16
// CHECK-NEXT: [[VLD2Q_V:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2.v16i8.p0(ptr [[A]], i32 1)
-// CHECK-NEXT: store { <16 x i8>, <16 x i8> } [[VLD2Q_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 32, i1 false)
+// CHECK-NEXT: [[VLD2Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[VLD2Q_V]], 0
+// CHECK-NEXT: [[VLD2Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[VLD2Q_V]], 1
+// CHECK-NEXT: store <16 x i8> [[VLD2Q_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <16 x i8> [[VLD2Q_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16
// CHECK-NEXT: ret void
//
int8x16x2_t test_vld2q_s8(int8_t const * a) {
@@ -6118,10 +5616,12 @@ int8x16x2_t test_vld2q_s8(int8_t const * a) {
// CHECK-LABEL: define void @test_vld2q_s16(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT16X8X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT16X8X2_T]], align 16
// CHECK-NEXT: [[VLD2Q_V:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2.v8i16.p0(ptr [[A]], i32 2)
-// CHECK-NEXT: store { <8 x i16>, <8 x i16> } [[VLD2Q_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 32, i1 false)
+// CHECK-NEXT: [[VLD2Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[VLD2Q_V]], 0
+// CHECK-NEXT: [[VLD2Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[VLD2Q_V]], 1
+// CHECK-NEXT: store <8 x i16> [[VLD2Q_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <8 x i16> [[VLD2Q_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16
// CHECK-NEXT: ret void
//
int16x8x2_t test_vld2q_s16(int16_t const * a) {
@@ -6131,10 +5631,12 @@ int16x8x2_t test_vld2q_s16(int16_t const * a) {
// CHECK-LABEL: define void @test_vld2q_s32(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT32X4X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT32X4X2_T]], align 16
// CHECK-NEXT: [[VLD2Q_V:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0(ptr [[A]], i32 4)
-// CHECK-NEXT: store { <4 x i32>, <4 x i32> } [[VLD2Q_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 32, i1 false)
+// CHECK-NEXT: [[VLD2Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLD2Q_V]], 0
+// CHECK-NEXT: [[VLD2Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLD2Q_V]], 1
+// CHECK-NEXT: store <4 x i32> [[VLD2Q_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <4 x i32> [[VLD2Q_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16
// CHECK-NEXT: ret void
//
int32x4x2_t test_vld2q_s32(int32_t const * a) {
@@ -6144,10 +5646,12 @@ int32x4x2_t test_vld2q_s32(int32_t const * a) {
// CHECK-LABEL: define void @test_vld2q_f16(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT16X8X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT16X8X2_T]], align 16
// CHECK-NEXT: [[VLD2Q_V:%.*]] = call { <8 x half>, <8 x half> } @llvm.arm.neon.vld2.v8f16.p0(ptr [[A]], i32 2)
-// CHECK-NEXT: store { <8 x half>, <8 x half> } [[VLD2Q_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 32, i1 false)
+// CHECK-NEXT: [[VLD2Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x half>, <8 x half> } [[VLD2Q_V]], 0
+// CHECK-NEXT: [[VLD2Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x half>, <8 x half> } [[VLD2Q_V]], 1
+// CHECK-NEXT: store <8 x half> [[VLD2Q_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <8 x half> [[VLD2Q_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16
// CHECK-NEXT: ret void
//
float16x8x2_t test_vld2q_f16(float16_t const * a) {
@@ -6157,10 +5661,12 @@ float16x8x2_t test_vld2q_f16(float16_t const * a) {
// CHECK-LABEL: define void @test_vld2q_f32(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT32X4X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT32X4X2_T]], align 16
// CHECK-NEXT: [[VLD2Q_V:%.*]] = call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2.v4f32.p0(ptr [[A]], i32 4)
-// CHECK-NEXT: store { <4 x float>, <4 x float> } [[VLD2Q_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 32, i1 false)
+// CHECK-NEXT: [[VLD2Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x float>, <4 x float> } [[VLD2Q_V]], 0
+// CHECK-NEXT: [[VLD2Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x float>, <4 x float> } [[VLD2Q_V]], 1
+// CHECK-NEXT: store <4 x float> [[VLD2Q_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <4 x float> [[VLD2Q_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16
// CHECK-NEXT: ret void
//
float32x4x2_t test_vld2q_f32(float32_t const * a) {
@@ -6170,10 +5676,12 @@ float32x4x2_t test_vld2q_f32(float32_t const * a) {
// CHECK-LABEL: define void @test_vld2q_p8(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY8X16X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY8X16X2_T]], align 16
// CHECK-NEXT: [[VLD2Q_V:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2.v16i8.p0(ptr [[A]], i32 1)
-// CHECK-NEXT: store { <16 x i8>, <16 x i8> } [[VLD2Q_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 32, i1 false)
+// CHECK-NEXT: [[VLD2Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[VLD2Q_V]], 0
+// CHECK-NEXT: [[VLD2Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[VLD2Q_V]], 1
+// CHECK-NEXT: store <16 x i8> [[VLD2Q_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <16 x i8> [[VLD2Q_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16
// CHECK-NEXT: ret void
//
poly8x16x2_t test_vld2q_p8(poly8_t const * a) {
@@ -6183,10 +5691,12 @@ poly8x16x2_t test_vld2q_p8(poly8_t const * a) {
// CHECK-LABEL: define void @test_vld2q_p16(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY16X8X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY16X8X2_T]], align 16
// CHECK-NEXT: [[VLD2Q_V:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2.v8i16.p0(ptr [[A]], i32 2)
-// CHECK-NEXT: store { <8 x i16>, <8 x i16> } [[VLD2Q_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 32, i1 false)
+// CHECK-NEXT: [[VLD2Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[VLD2Q_V]], 0
+// CHECK-NEXT: [[VLD2Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[VLD2Q_V]], 1
+// CHECK-NEXT: store <8 x i16> [[VLD2Q_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <8 x i16> [[VLD2Q_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16
// CHECK-NEXT: ret void
//
poly16x8x2_t test_vld2q_p16(poly16_t const * a) {
@@ -6196,10 +5706,12 @@ poly16x8x2_t test_vld2q_p16(poly16_t const * a) {
// CHECK-LABEL: define void @test_vld2_u8(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT8X8X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT8X8X2_T]], align 8
// CHECK-NEXT: [[VLD2_V:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2.v8i8.p0(ptr [[A]], i32 1)
-// CHECK-NEXT: store { <8 x i8>, <8 x i8> } [[VLD2_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 16, i1 false)
+// CHECK-NEXT: [[VLD2_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLD2_V]], 0
+// CHECK-NEXT: [[VLD2_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLD2_V]], 1
+// CHECK-NEXT: store <8 x i8> [[VLD2_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <8 x i8> [[VLD2_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
// CHECK-NEXT: ret void
//
uint8x8x2_t test_vld2_u8(uint8_t const * a) {
@@ -6209,10 +5721,12 @@ uint8x8x2_t test_vld2_u8(uint8_t const * a) {
// CHECK-LABEL: define void @test_vld2_u16(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT16X4X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT16X4X2_T]], align 8
// CHECK-NEXT: [[VLD2_V:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2.v4i16.p0(ptr [[A]], i32 2)
-// CHECK-NEXT: store { <4 x i16>, <4 x i16> } [[VLD2_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 16, i1 false)
+// CHECK-NEXT: [[VLD2_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[VLD2_V]], 0
+// CHECK-NEXT: [[VLD2_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[VLD2_V]], 1
+// CHECK-NEXT: store <4 x i16> [[VLD2_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <4 x i16> [[VLD2_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
// CHECK-NEXT: ret void
//
uint16x4x2_t test_vld2_u16(uint16_t const * a) {
@@ -6222,10 +5736,12 @@ uint16x4x2_t test_vld2_u16(uint16_t const * a) {
// CHECK-LABEL: define void @test_vld2_u32(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT32X2X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT32X2X2_T]], align 8
// CHECK-NEXT: [[VLD2_V:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2.v2i32.p0(ptr [[A]], i32 4)
-// CHECK-NEXT: store { <2 x i32>, <2 x i32> } [[VLD2_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 16, i1 false)
+// CHECK-NEXT: [[VLD2_V_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32> } [[VLD2_V]], 0
+// CHECK-NEXT: [[VLD2_V_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32> } [[VLD2_V]], 1
+// CHECK-NEXT: store <2 x i32> [[VLD2_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <2 x i32> [[VLD2_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
// CHECK-NEXT: ret void
//
uint32x2x2_t test_vld2_u32(uint32_t const * a) {
@@ -6235,10 +5751,12 @@ uint32x2x2_t test_vld2_u32(uint32_t const * a) {
// CHECK-LABEL: define void @test_vld2_u64(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT64X1X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT64X1X2_T]], align 8
// CHECK-NEXT: [[VLD2_V:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2.v1i64.p0(ptr [[A]], i32 4)
-// CHECK-NEXT: store { <1 x i64>, <1 x i64> } [[VLD2_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 16, i1 false)
+// CHECK-NEXT: [[VLD2_V_FCA_0_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64> } [[VLD2_V]], 0
+// CHECK-NEXT: [[VLD2_V_FCA_1_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64> } [[VLD2_V]], 1
+// CHECK-NEXT: store <1 x i64> [[VLD2_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <1 x i64> [[VLD2_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
// CHECK-NEXT: ret void
//
uint64x1x2_t test_vld2_u64(uint64_t const * a) {
@@ -6248,10 +5766,12 @@ uint64x1x2_t test_vld2_u64(uint64_t const * a) {
// CHECK-LABEL: define void @test_vld2_s8(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT8X8X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT8X8X2_T]], align 8
// CHECK-NEXT: [[VLD2_V:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2.v8i8.p0(ptr [[A]], i32 1)
-// CHECK-NEXT: store { <8 x i8>, <8 x i8> } [[VLD2_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 16, i1 false)
+// CHECK-NEXT: [[VLD2_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLD2_V]], 0
+// CHECK-NEXT: [[VLD2_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLD2_V]], 1
+// CHECK-NEXT: store <8 x i8> [[VLD2_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <8 x i8> [[VLD2_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
// CHECK-NEXT: ret void
//
int8x8x2_t test_vld2_s8(int8_t const * a) {
@@ -6261,10 +5781,12 @@ int8x8x2_t test_vld2_s8(int8_t const * a) {
// CHECK-LABEL: define void @test_vld2_s16(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT16X4X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT16X4X2_T]], align 8
// CHECK-NEXT: [[VLD2_V:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2.v4i16.p0(ptr [[A]], i32 2)
-// CHECK-NEXT: store { <4 x i16>, <4 x i16> } [[VLD2_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 16, i1 false)
+// CHECK-NEXT: [[VLD2_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[VLD2_V]], 0
+// CHECK-NEXT: [[VLD2_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[VLD2_V]], 1
+// CHECK-NEXT: store <4 x i16> [[VLD2_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <4 x i16> [[VLD2_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
// CHECK-NEXT: ret void
//
int16x4x2_t test_vld2_s16(int16_t const * a) {
@@ -6274,10 +5796,12 @@ int16x4x2_t test_vld2_s16(int16_t const * a) {
// CHECK-LABEL: define void @test_vld2_s32(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT32X2X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT32X2X2_T]], align 8
// CHECK-NEXT: [[VLD2_V:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2.v2i32.p0(ptr [[A]], i32 4)
-// CHECK-NEXT: store { <2 x i32>, <2 x i32> } [[VLD2_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 16, i1 false)
+// CHECK-NEXT: [[VLD2_V_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32> } [[VLD2_V]], 0
+// CHECK-NEXT: [[VLD2_V_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32> } [[VLD2_V]], 1
+// CHECK-NEXT: store <2 x i32> [[VLD2_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <2 x i32> [[VLD2_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
// CHECK-NEXT: ret void
//
int32x2x2_t test_vld2_s32(int32_t const * a) {
@@ -6287,10 +5811,12 @@ int32x2x2_t test_vld2_s32(int32_t const * a) {
// CHECK-LABEL: define void @test_vld2_s64(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT64X1X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT64X1X2_T]], align 8
// CHECK-NEXT: [[VLD2_V:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2.v1i64.p0(ptr [[A]], i32 4)
-// CHECK-NEXT: store { <1 x i64>, <1 x i64> } [[VLD2_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 16, i1 false)
+// CHECK-NEXT: [[VLD2_V_FCA_0_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64> } [[VLD2_V]], 0
+// CHECK-NEXT: [[VLD2_V_FCA_1_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64> } [[VLD2_V]], 1
+// CHECK-NEXT: store <1 x i64> [[VLD2_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <1 x i64> [[VLD2_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
// CHECK-NEXT: ret void
//
int64x1x2_t test_vld2_s64(int64_t const * a) {
@@ -6300,10 +5826,12 @@ int64x1x2_t test_vld2_s64(int64_t const * a) {
// CHECK-LABEL: define void @test_vld2_f16(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT16X4X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT16X4X2_T]], align 8
// CHECK-NEXT: [[VLD2_V:%.*]] = call { <4 x half>, <4 x half> } @llvm.arm.neon.vld2.v4f16.p0(ptr [[A]], i32 2)
-// CHECK-NEXT: store { <4 x half>, <4 x half> } [[VLD2_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 16, i1 false)
+// CHECK-NEXT: [[VLD2_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x half>, <4 x half> } [[VLD2_V]], 0
+// CHECK-NEXT: [[VLD2_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x half>, <4 x half> } [[VLD2_V]], 1
+// CHECK-NEXT: store <4 x half> [[VLD2_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <4 x half> [[VLD2_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
// CHECK-NEXT: ret void
//
float16x4x2_t test_vld2_f16(float16_t const * a) {
@@ -6313,10 +5841,12 @@ float16x4x2_t test_vld2_f16(float16_t const * a) {
// CHECK-LABEL: define void @test_vld2_f32(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT32X2X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT32X2X2_T]], align 8
// CHECK-NEXT: [[VLD2_V:%.*]] = call { <2 x float>, <2 x float> } @llvm.arm.neon.vld2.v2f32.p0(ptr [[A]], i32 4)
-// CHECK-NEXT: store { <2 x float>, <2 x float> } [[VLD2_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 16, i1 false)
+// CHECK-NEXT: [[VLD2_V_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x float>, <2 x float> } [[VLD2_V]], 0
+// CHECK-NEXT: [[VLD2_V_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x float>, <2 x float> } [[VLD2_V]], 1
+// CHECK-NEXT: store <2 x float> [[VLD2_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <2 x float> [[VLD2_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
// CHECK-NEXT: ret void
//
float32x2x2_t test_vld2_f32(float32_t const * a) {
@@ -6326,10 +5856,12 @@ float32x2x2_t test_vld2_f32(float32_t const * a) {
// CHECK-LABEL: define void @test_vld2_p8(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY8X8X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY8X8X2_T]], align 8
// CHECK-NEXT: [[VLD2_V:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2.v8i8.p0(ptr [[A]], i32 1)
-// CHECK-NEXT: store { <8 x i8>, <8 x i8> } [[VLD2_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 16, i1 false)
+// CHECK-NEXT: [[VLD2_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLD2_V]], 0
+// CHECK-NEXT: [[VLD2_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLD2_V]], 1
+// CHECK-NEXT: store <8 x i8> [[VLD2_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <8 x i8> [[VLD2_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
// CHECK-NEXT: ret void
//
poly8x8x2_t test_vld2_p8(poly8_t const * a) {
@@ -6339,10 +5871,12 @@ poly8x8x2_t test_vld2_p8(poly8_t const * a) {
// CHECK-LABEL: define void @test_vld2_p16(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY16X4X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY16X4X2_T]], align 8
// CHECK-NEXT: [[VLD2_V:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2.v4i16.p0(ptr [[A]], i32 2)
-// CHECK-NEXT: store { <4 x i16>, <4 x i16> } [[VLD2_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 16, i1 false)
+// CHECK-NEXT: [[VLD2_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[VLD2_V]], 0
+// CHECK-NEXT: [[VLD2_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[VLD2_V]], 1
+// CHECK-NEXT: store <4 x i16> [[VLD2_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <4 x i16> [[VLD2_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
// CHECK-NEXT: ret void
//
poly16x4x2_t test_vld2_p16(poly16_t const * a) {
@@ -6352,25 +5886,22 @@ poly16x4x2_t test_vld2_p16(poly16_t const * a) {
// CHECK-LABEL: define void @test_vld2q_lane_u16(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT16X8X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT16X8X2_T]], align 16
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT16X8X2_T]], align 16
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT16X8X2_T]], align 16
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X8X2_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X8X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X8X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x i16>
-// CHECK-NEXT: [[VLD2Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2lane.v8i16.p0(ptr [[A]], <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], i32 7, i32 2)
-// CHECK-NEXT: store { <8 x i16>, <8 x i16> } [[VLD2Q_LANE_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 32, i1 false)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[VLD2Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2lane.v8i16.p0(ptr [[A]], <8 x i16> [[TMP0]], <8 x i16> [[TMP1]], i32 7, i32 2)
+// CHECK-NEXT: [[VLD2Q_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[VLD2Q_LANE_V]], 0
+// CHECK-NEXT: [[VLD2Q_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[VLD2Q_LANE_V]], 1
+// CHECK-NEXT: store <8 x i16> [[VLD2Q_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <8 x i16> [[VLD2Q_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16
// CHECK-NEXT: ret void
//
uint16x8x2_t test_vld2q_lane_u16(uint16_t const * a, uint16x8x2_t b) {
@@ -6380,25 +5911,22 @@ uint16x8x2_t test_vld2q_lane_u16(uint16_t const * a, uint16x8x2_t b) {
// CHECK-LABEL: define void @test_vld2q_lane_u32(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT32X4X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT32X4X2_T]], align 16
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT32X4X2_T]], align 16
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT32X4X2_T]], align 16
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X4X2_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X4X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X4X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x i32>
-// CHECK-NEXT: [[VLD2Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32.p0(ptr [[A]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], i32 3, i32 4)
-// CHECK-NEXT: store { <4 x i32>, <4 x i32> } [[VLD2Q_LANE_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 32, i1 false)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: [[VLD2Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32.p0(ptr [[A]], <4 x i32> [[TMP0]], <4 x i32> [[TMP1]], i32 3, i32 4)
+// CHECK-NEXT: [[VLD2Q_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLD2Q_LANE_V]], 0
+// CHECK-NEXT: [[VLD2Q_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLD2Q_LANE_V]], 1
+// CHECK-NEXT: store <4 x i32> [[VLD2Q_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <4 x i32> [[VLD2Q_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16
// CHECK-NEXT: ret void
//
uint32x4x2_t test_vld2q_lane_u32(uint32_t const * a, uint32x4x2_t b) {
@@ -6408,25 +5936,22 @@ uint32x4x2_t test_vld2q_lane_u32(uint32_t const * a, uint32x4x2_t b) {
// CHECK-LABEL: define void @test_vld2q_lane_s16(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT16X8X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT16X8X2_T]], align 16
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT16X8X2_T]], align 16
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT16X8X2_T]], align 16
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X8X2_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X8X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X8X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x i16>
-// CHECK-NEXT: [[VLD2Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2lane.v8i16.p0(ptr [[A]], <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], i32 7, i32 2)
-// CHECK-NEXT: store { <8 x i16>, <8 x i16> } [[VLD2Q_LANE_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 32, i1 false)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[VLD2Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2lane.v8i16.p0(ptr [[A]], <8 x i16> [[TMP0]], <8 x i16> [[TMP1]], i32 7, i32 2)
+// CHECK-NEXT: [[VLD2Q_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[VLD2Q_LANE_V]], 0
+// CHECK-NEXT: [[VLD2Q_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[VLD2Q_LANE_V]], 1
+// CHECK-NEXT: store <8 x i16> [[VLD2Q_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <8 x i16> [[VLD2Q_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16
// CHECK-NEXT: ret void
//
int16x8x2_t test_vld2q_lane_s16(int16_t const * a, int16x8x2_t b) {
@@ -6436,25 +5961,22 @@ int16x8x2_t test_vld2q_lane_s16(int16_t const * a, int16x8x2_t b) {
// CHECK-LABEL: define void @test_vld2q_lane_s32(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT32X4X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT32X4X2_T]], align 16
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT32X4X2_T]], align 16
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT32X4X2_T]], align 16
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X4X2_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X4X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X4X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x i32>
-// CHECK-NEXT: [[VLD2Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32.p0(ptr [[A]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], i32 3, i32 4)
-// CHECK-NEXT: store { <4 x i32>, <4 x i32> } [[VLD2Q_LANE_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 32, i1 false)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: [[VLD2Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32.p0(ptr [[A]], <4 x i32> [[TMP0]], <4 x i32> [[TMP1]], i32 3, i32 4)
+// CHECK-NEXT: [[VLD2Q_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLD2Q_LANE_V]], 0
+// CHECK-NEXT: [[VLD2Q_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLD2Q_LANE_V]], 1
+// CHECK-NEXT: store <4 x i32> [[VLD2Q_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <4 x i32> [[VLD2Q_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16
// CHECK-NEXT: ret void
//
int32x4x2_t test_vld2q_lane_s32(int32_t const * a, int32x4x2_t b) {
@@ -6464,25 +5986,22 @@ int32x4x2_t test_vld2q_lane_s32(int32_t const * a, int32x4x2_t b) {
// CHECK-LABEL: define void @test_vld2q_lane_f16(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT16X8X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT16X8X2_T]], align 16
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT16X8X2_T]], align 16
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X8X2_T]], align 16
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X8X2_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X8X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[TMP0]] to <16 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X8X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x half> [[TMP2]] to <16 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half>
-// CHECK-NEXT: [[VLD2Q_LANE_V:%.*]] = call { <8 x half>, <8 x half> } @llvm.arm.neon.vld2lane.v8f16.p0(ptr [[A]], <8 x half> [[TMP4]], <8 x half> [[TMP5]], i32 7, i32 2)
-// CHECK-NEXT: store { <8 x half>, <8 x half> } [[VLD2Q_LANE_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 32, i1 false)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <8 x half>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <8 x half>
+// CHECK-NEXT: [[VLD2Q_LANE_V:%.*]] = call { <8 x half>, <8 x half> } @llvm.arm.neon.vld2lane.v8f16.p0(ptr [[A]], <8 x half> [[TMP0]], <8 x half> [[TMP1]], i32 7, i32 2)
+// CHECK-NEXT: [[VLD2Q_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x half>, <8 x half> } [[VLD2Q_LANE_V]], 0
+// CHECK-NEXT: [[VLD2Q_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x half>, <8 x half> } [[VLD2Q_LANE_V]], 1
+// CHECK-NEXT: store <8 x half> [[VLD2Q_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <8 x half> [[VLD2Q_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16
// CHECK-NEXT: ret void
//
float16x8x2_t test_vld2q_lane_f16(float16_t const * a, float16x8x2_t b) {
@@ -6492,25 +6011,22 @@ float16x8x2_t test_vld2q_lane_f16(float16_t const * a, float16x8x2_t b) {
// CHECK-LABEL: define void @test_vld2q_lane_f32(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT32X4X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT32X4X2_T]], align 16
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT32X4X2_T]], align 16
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X4X2_T]], align 16
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X4X2_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X4X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[TMP0]] to <16 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X4X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[TMP2]] to <16 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float>
-// CHECK-NEXT: [[VLD2Q_LANE_V:%.*]] = call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2lane.v4f32.p0(ptr [[A]], <4 x float> [[TMP4]], <4 x float> [[TMP5]], i32 3, i32 4)
-// CHECK-NEXT: store { <4 x float>, <4 x float> } [[VLD2Q_LANE_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 32, i1 false)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <4 x float>
+// CHECK-NEXT: [[VLD2Q_LANE_V:%.*]] = call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2lane.v4f32.p0(ptr [[A]], <4 x float> [[TMP0]], <4 x float> [[TMP1]], i32 3, i32 4)
+// CHECK-NEXT: [[VLD2Q_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x float>, <4 x float> } [[VLD2Q_LANE_V]], 0
+// CHECK-NEXT: [[VLD2Q_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x float>, <4 x float> } [[VLD2Q_LANE_V]], 1
+// CHECK-NEXT: store <4 x float> [[VLD2Q_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <4 x float> [[VLD2Q_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16
// CHECK-NEXT: ret void
//
float32x4x2_t test_vld2q_lane_f32(float32_t const * a, float32x4x2_t b) {
@@ -6520,25 +6036,22 @@ float32x4x2_t test_vld2q_lane_f32(float32_t const * a, float32x4x2_t b) {
// CHECK-LABEL: define void @test_vld2q_lane_p16(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY16X8X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY16X8X2_T]], align 16
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY16X8X2_T]], align 16
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY16X8X2_T]], align 16
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X8X2_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X8X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X8X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x i16>
-// CHECK-NEXT: [[VLD2Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2lane.v8i16.p0(ptr [[A]], <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], i32 7, i32 2)
-// CHECK-NEXT: store { <8 x i16>, <8 x i16> } [[VLD2Q_LANE_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 32, i1 false)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[VLD2Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2lane.v8i16.p0(ptr [[A]], <8 x i16> [[TMP0]], <8 x i16> [[TMP1]], i32 7, i32 2)
+// CHECK-NEXT: [[VLD2Q_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[VLD2Q_LANE_V]], 0
+// CHECK-NEXT: [[VLD2Q_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[VLD2Q_LANE_V]], 1
+// CHECK-NEXT: store <8 x i16> [[VLD2Q_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <8 x i16> [[VLD2Q_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16
// CHECK-NEXT: ret void
//
poly16x8x2_t test_vld2q_lane_p16(poly16_t const * a, poly16x8x2_t b) {
@@ -6548,21 +6061,16 @@ poly16x8x2_t test_vld2q_lane_p16(poly16_t const * a, poly16x8x2_t b) {
// CHECK-LABEL: define void @test_vld2_lane_u8(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT8X8X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT8X8X2_T]], align 8
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT8X8X2_T]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT8X8X2_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X2_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [2 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
// CHECK-NEXT: [[VLD2_LANE_V:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8.p0(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], i32 7, i32 1)
-// CHECK-NEXT: store { <8 x i8>, <8 x i8> } [[VLD2_LANE_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 16, i1 false)
+// CHECK-NEXT: [[VLD2_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLD2_LANE_V]], 0
+// CHECK-NEXT: [[VLD2_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLD2_LANE_V]], 1
+// CHECK-NEXT: store <8 x i8> [[VLD2_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <8 x i8> [[VLD2_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
// CHECK-NEXT: ret void
//
uint8x8x2_t test_vld2_lane_u8(uint8_t const * a, uint8x8x2_t b) {
@@ -6572,25 +6080,16 @@ uint8x8x2_t test_vld2_lane_u8(uint8_t const * a, uint8x8x2_t b) {
// CHECK-LABEL: define void @test_vld2_lane_u16(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT16X4X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT16X4X2_T]], align 8
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT16X4X2_T]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT16X4X2_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X4X2_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [2 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X4X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X4X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
-// CHECK-NEXT: [[VLD2_LANE_V:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16.p0(ptr [[A]], <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], i32 3, i32 2)
-// CHECK-NEXT: store { <4 x i16>, <4 x i16> } [[VLD2_LANE_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 16, i1 false)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[VLD2_LANE_V:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16.p0(ptr [[A]], <4 x i16> [[TMP0]], <4 x i16> [[TMP1]], i32 3, i32 2)
+// CHECK-NEXT: [[VLD2_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[VLD2_LANE_V]], 0
+// CHECK-NEXT: [[VLD2_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[VLD2_LANE_V]], 1
+// CHECK-NEXT: store <4 x i16> [[VLD2_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <4 x i16> [[VLD2_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
// CHECK-NEXT: ret void
//
uint16x4x2_t test_vld2_lane_u16(uint16_t const * a, uint16x4x2_t b) {
@@ -6600,25 +6099,16 @@ uint16x4x2_t test_vld2_lane_u16(uint16_t const * a, uint16x4x2_t b) {
// CHECK-LABEL: define void @test_vld2_lane_u32(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT32X2X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT32X2X2_T]], align 8
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT32X2X2_T]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT32X2X2_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X2X2_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [2 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X2X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X2X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
-// CHECK-NEXT: [[VLD2_LANE_V:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2lane.v2i32.p0(ptr [[A]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], i32 1, i32 4)
-// CHECK-NEXT: store { <2 x i32>, <2 x i32> } [[VLD2_LANE_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 16, i1 false)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[VLD2_LANE_V:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2lane.v2i32.p0(ptr [[A]], <2 x i32> [[TMP0]], <2 x i32> [[TMP1]], i32 1, i32 4)
+// CHECK-NEXT: [[VLD2_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32> } [[VLD2_LANE_V]], 0
+// CHECK-NEXT: [[VLD2_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32> } [[VLD2_LANE_V]], 1
+// CHECK-NEXT: store <2 x i32> [[VLD2_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <2 x i32> [[VLD2_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
// CHECK-NEXT: ret void
//
uint32x2x2_t test_vld2_lane_u32(uint32_t const * a, uint32x2x2_t b) {
@@ -6628,21 +6118,16 @@ uint32x2x2_t test_vld2_lane_u32(uint32_t const * a, uint32x2x2_t b) {
// CHECK-LABEL: define void @test_vld2_lane_s8(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT8X8X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT8X8X2_T]], align 8
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT8X8X2_T]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT8X8X2_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X2_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [2 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
// CHECK-NEXT: [[VLD2_LANE_V:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8.p0(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], i32 7, i32 1)
-// CHECK-NEXT: store { <8 x i8>, <8 x i8> } [[VLD2_LANE_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 16, i1 false)
+// CHECK-NEXT: [[VLD2_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLD2_LANE_V]], 0
+// CHECK-NEXT: [[VLD2_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLD2_LANE_V]], 1
+// CHECK-NEXT: store <8 x i8> [[VLD2_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <8 x i8> [[VLD2_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
// CHECK-NEXT: ret void
//
int8x8x2_t test_vld2_lane_s8(int8_t const * a, int8x8x2_t b) {
@@ -6652,25 +6137,16 @@ int8x8x2_t test_vld2_lane_s8(int8_t const * a, int8x8x2_t b) {
// CHECK-LABEL: define void @test_vld2_lane_s16(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT16X4X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT16X4X2_T]], align 8
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT16X4X2_T]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT16X4X2_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X4X2_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [2 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X4X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X4X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
-// CHECK-NEXT: [[VLD2_LANE_V:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16.p0(ptr [[A]], <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], i32 3, i32 2)
-// CHECK-NEXT: store { <4 x i16>, <4 x i16> } [[VLD2_LANE_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 16, i1 false)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[VLD2_LANE_V:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16.p0(ptr [[A]], <4 x i16> [[TMP0]], <4 x i16> [[TMP1]], i32 3, i32 2)
+// CHECK-NEXT: [[VLD2_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[VLD2_LANE_V]], 0
+// CHECK-NEXT: [[VLD2_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[VLD2_LANE_V]], 1
+// CHECK-NEXT: store <4 x i16> [[VLD2_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <4 x i16> [[VLD2_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
// CHECK-NEXT: ret void
//
int16x4x2_t test_vld2_lane_s16(int16_t const * a, int16x4x2_t b) {
@@ -6680,25 +6156,16 @@ int16x4x2_t test_vld2_lane_s16(int16_t const * a, int16x4x2_t b) {
// CHECK-LABEL: define void @test_vld2_lane_s32(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT32X2X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT32X2X2_T]], align 8
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT32X2X2_T]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT32X2X2_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X2X2_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [2 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X2X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X2X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
-// CHECK-NEXT: [[VLD2_LANE_V:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2lane.v2i32.p0(ptr [[A]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], i32 1, i32 4)
-// CHECK-NEXT: store { <2 x i32>, <2 x i32> } [[VLD2_LANE_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 16, i1 false)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[VLD2_LANE_V:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2lane.v2i32.p0(ptr [[A]], <2 x i32> [[TMP0]], <2 x i32> [[TMP1]], i32 1, i32 4)
+// CHECK-NEXT: [[VLD2_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32> } [[VLD2_LANE_V]], 0
+// CHECK-NEXT: [[VLD2_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32> } [[VLD2_LANE_V]], 1
+// CHECK-NEXT: store <2 x i32> [[VLD2_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <2 x i32> [[VLD2_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
// CHECK-NEXT: ret void
//
int32x2x2_t test_vld2_lane_s32(int32_t const * a, int32x2x2_t b) {
@@ -6708,25 +6175,16 @@ int32x2x2_t test_vld2_lane_s32(int32_t const * a, int32x2x2_t b) {
// CHECK-LABEL: define void @test_vld2_lane_f16(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT16X4X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT16X4X2_T]], align 8
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT16X4X2_T]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X4X2_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X4X2_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [2 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X4X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[TMP0]] to <8 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X4X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x half> [[TMP2]] to <8 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half>
-// CHECK-NEXT: [[VLD2_LANE_V:%.*]] = call { <4 x half>, <4 x half> } @llvm.arm.neon.vld2lane.v4f16.p0(ptr [[A]], <4 x half> [[TMP4]], <4 x half> [[TMP5]], i32 3, i32 2)
-// CHECK-NEXT: store { <4 x half>, <4 x half> } [[VLD2_LANE_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 16, i1 false)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x half>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x half>
+// CHECK-NEXT: [[VLD2_LANE_V:%.*]] = call { <4 x half>, <4 x half> } @llvm.arm.neon.vld2lane.v4f16.p0(ptr [[A]], <4 x half> [[TMP0]], <4 x half> [[TMP1]], i32 3, i32 2)
+// CHECK-NEXT: [[VLD2_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x half>, <4 x half> } [[VLD2_LANE_V]], 0
+// CHECK-NEXT: [[VLD2_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x half>, <4 x half> } [[VLD2_LANE_V]], 1
+// CHECK-NEXT: store <4 x half> [[VLD2_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <4 x half> [[VLD2_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
// CHECK-NEXT: ret void
//
float16x4x2_t test_vld2_lane_f16(float16_t const * a, float16x4x2_t b) {
@@ -6736,25 +6194,16 @@ float16x4x2_t test_vld2_lane_f16(float16_t const * a, float16x4x2_t b) {
// CHECK-LABEL: define void @test_vld2_lane_f32(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT32X2X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT32X2X2_T]], align 8
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT32X2X2_T]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X2X2_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X2X2_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [2 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X2X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[TMP0]] to <8 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X2X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x float>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[TMP2]] to <8 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float>
-// CHECK-NEXT: [[VLD2_LANE_V:%.*]] = call { <2 x float>, <2 x float> } @llvm.arm.neon.vld2lane.v2f32.p0(ptr [[A]], <2 x float> [[TMP4]], <2 x float> [[TMP5]], i32 1, i32 4)
-// CHECK-NEXT: store { <2 x float>, <2 x float> } [[VLD2_LANE_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 16, i1 false)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <2 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <2 x float>
+// CHECK-NEXT: [[VLD2_LANE_V:%.*]] = call { <2 x float>, <2 x float> } @llvm.arm.neon.vld2lane.v2f32.p0(ptr [[A]], <2 x float> [[TMP0]], <2 x float> [[TMP1]], i32 1, i32 4)
+// CHECK-NEXT: [[VLD2_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x float>, <2 x float> } [[VLD2_LANE_V]], 0
+// CHECK-NEXT: [[VLD2_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x float>, <2 x float> } [[VLD2_LANE_V]], 1
+// CHECK-NEXT: store <2 x float> [[VLD2_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <2 x float> [[VLD2_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
// CHECK-NEXT: ret void
//
float32x2x2_t test_vld2_lane_f32(float32_t const * a, float32x2x2_t b) {
@@ -6764,21 +6213,16 @@ float32x2x2_t test_vld2_lane_f32(float32_t const * a, float32x2x2_t b) {
// CHECK-LABEL: define void @test_vld2_lane_p8(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY8X8X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY8X8X2_T]], align 8
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY8X8X2_T]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY8X8X2_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X2_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [2 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
// CHECK-NEXT: [[VLD2_LANE_V:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8.p0(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], i32 7, i32 1)
-// CHECK-NEXT: store { <8 x i8>, <8 x i8> } [[VLD2_LANE_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 16, i1 false)
+// CHECK-NEXT: [[VLD2_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLD2_LANE_V]], 0
+// CHECK-NEXT: [[VLD2_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLD2_LANE_V]], 1
+// CHECK-NEXT: store <8 x i8> [[VLD2_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <8 x i8> [[VLD2_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
// CHECK-NEXT: ret void
//
poly8x8x2_t test_vld2_lane_p8(poly8_t const * a, poly8x8x2_t b) {
@@ -6788,25 +6232,16 @@ poly8x8x2_t test_vld2_lane_p8(poly8_t const * a, poly8x8x2_t b) {
// CHECK-LABEL: define void @test_vld2_lane_p16(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY16X4X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY16X4X2_T]], align 8
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY16X4X2_T]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY16X4X2_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X4X2_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [2 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X4X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X4X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
-// CHECK-NEXT: [[VLD2_LANE_V:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16.p0(ptr [[A]], <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], i32 3, i32 2)
-// CHECK-NEXT: store { <4 x i16>, <4 x i16> } [[VLD2_LANE_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 16, i1 false)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[VLD2_LANE_V:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16.p0(ptr [[A]], <4 x i16> [[TMP0]], <4 x i16> [[TMP1]], i32 3, i32 2)
+// CHECK-NEXT: [[VLD2_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[VLD2_LANE_V]], 0
+// CHECK-NEXT: [[VLD2_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[VLD2_LANE_V]], 1
+// CHECK-NEXT: store <4 x i16> [[VLD2_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <4 x i16> [[VLD2_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
// CHECK-NEXT: ret void
//
poly16x4x2_t test_vld2_lane_p16(poly16_t const * a, poly16x4x2_t b) {
@@ -6816,10 +6251,15 @@ poly16x4x2_t test_vld2_lane_p16(poly16_t const * a, poly16x4x2_t b) {
// CHECK-LABEL: define void @test_vld3q_u8(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT8X16X3_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT8X16X3_T]], align 16
// CHECK-NEXT: [[VLD3Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8.p0(ptr [[A]], i32 1)
-// CHECK-NEXT: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3Q_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 48, i1 false)
+// CHECK-NEXT: [[VLD3Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3Q_V]], 0
+// CHECK-NEXT: [[VLD3Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3Q_V]], 1
+// CHECK-NEXT: [[VLD3Q_V_FCA_2_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3Q_V]], 2
+// CHECK-NEXT: store <16 x i8> [[VLD3Q_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <16 x i8> [[VLD3Q_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 32
+// CHECK-NEXT: store <16 x i8> [[VLD3Q_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16
// CHECK-NEXT: ret void
//
uint8x16x3_t test_vld3q_u8(uint8_t const * a) {
@@ -6829,10 +6269,15 @@ uint8x16x3_t test_vld3q_u8(uint8_t const * a) {
// CHECK-LABEL: define void @test_vld3q_u16(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT16X8X3_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT16X8X3_T]], align 16
// CHECK-NEXT: [[VLD3Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3.v8i16.p0(ptr [[A]], i32 2)
-// CHECK-NEXT: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 48, i1 false)
+// CHECK-NEXT: [[VLD3Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_V]], 0
+// CHECK-NEXT: [[VLD3Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_V]], 1
+// CHECK-NEXT: [[VLD3Q_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_V]], 2
+// CHECK-NEXT: store <8 x i16> [[VLD3Q_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <8 x i16> [[VLD3Q_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 32
+// CHECK-NEXT: store <8 x i16> [[VLD3Q_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16
// CHECK-NEXT: ret void
//
uint16x8x3_t test_vld3q_u16(uint16_t const * a) {
@@ -6842,10 +6287,15 @@ uint16x8x3_t test_vld3q_u16(uint16_t const * a) {
// CHECK-LABEL: define void @test_vld3q_u32(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT32X4X3_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT32X4X3_T]], align 16
// CHECK-NEXT: [[VLD3Q_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3.v4i32.p0(ptr [[A]], i32 4)
-// CHECK-NEXT: store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3Q_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 48, i1 false)
+// CHECK-NEXT: [[VLD3Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3Q_V]], 0
+// CHECK-NEXT: [[VLD3Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3Q_V]], 1
+// CHECK-NEXT: [[VLD3Q_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3Q_V]], 2
+// CHECK-NEXT: store <4 x i32> [[VLD3Q_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <4 x i32> [[VLD3Q_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 32
+// CHECK-NEXT: store <4 x i32> [[VLD3Q_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16
// CHECK-NEXT: ret void
//
uint32x4x3_t test_vld3q_u32(uint32_t const * a) {
@@ -6855,10 +6305,15 @@ uint32x4x3_t test_vld3q_u32(uint32_t const * a) {
// CHECK-LABEL: define void @test_vld3q_s8(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT8X16X3_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT8X16X3_T]], align 16
// CHECK-NEXT: [[VLD3Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8.p0(ptr [[A]], i32 1)
-// CHECK-NEXT: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3Q_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 48, i1 false)
+// CHECK-NEXT: [[VLD3Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3Q_V]], 0
+// CHECK-NEXT: [[VLD3Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3Q_V]], 1
+// CHECK-NEXT: [[VLD3Q_V_FCA_2_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3Q_V]], 2
+// CHECK-NEXT: store <16 x i8> [[VLD3Q_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <16 x i8> [[VLD3Q_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 32
+// CHECK-NEXT: store <16 x i8> [[VLD3Q_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16
// CHECK-NEXT: ret void
//
int8x16x3_t test_vld3q_s8(int8_t const * a) {
@@ -6868,10 +6323,15 @@ int8x16x3_t test_vld3q_s8(int8_t const * a) {
// CHECK-LABEL: define void @test_vld3q_s16(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT16X8X3_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT16X8X3_T]], align 16
// CHECK-NEXT: [[VLD3Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3.v8i16.p0(ptr [[A]], i32 2)
-// CHECK-NEXT: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 48, i1 false)
+// CHECK-NEXT: [[VLD3Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_V]], 0
+// CHECK-NEXT: [[VLD3Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_V]], 1
+// CHECK-NEXT: [[VLD3Q_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_V]], 2
+// CHECK-NEXT: store <8 x i16> [[VLD3Q_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <8 x i16> [[VLD3Q_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 32
+// CHECK-NEXT: store <8 x i16> [[VLD3Q_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16
// CHECK-NEXT: ret void
//
int16x8x3_t test_vld3q_s16(int16_t const * a) {
@@ -6881,10 +6341,15 @@ int16x8x3_t test_vld3q_s16(int16_t const * a) {
// CHECK-LABEL: define void @test_vld3q_s32(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT32X4X3_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT32X4X3_T]], align 16
// CHECK-NEXT: [[VLD3Q_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3.v4i32.p0(ptr [[A]], i32 4)
-// CHECK-NEXT: store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3Q_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 48, i1 false)
+// CHECK-NEXT: [[VLD3Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3Q_V]], 0
+// CHECK-NEXT: [[VLD3Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3Q_V]], 1
+// CHECK-NEXT: [[VLD3Q_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3Q_V]], 2
+// CHECK-NEXT: store <4 x i32> [[VLD3Q_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <4 x i32> [[VLD3Q_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 32
+// CHECK-NEXT: store <4 x i32> [[VLD3Q_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16
// CHECK-NEXT: ret void
//
int32x4x3_t test_vld3q_s32(int32_t const * a) {
@@ -6894,10 +6359,15 @@ int32x4x3_t test_vld3q_s32(int32_t const * a) {
// CHECK-LABEL: define void @test_vld3q_f16(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT16X8X3_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT16X8X3_T]], align 16
// CHECK-NEXT: [[VLD3Q_V:%.*]] = call { <8 x half>, <8 x half>, <8 x half> } @llvm.arm.neon.vld3.v8f16.p0(ptr [[A]], i32 2)
-// CHECK-NEXT: store { <8 x half>, <8 x half>, <8 x half> } [[VLD3Q_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 48, i1 false)
+// CHECK-NEXT: [[VLD3Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x half>, <8 x half>, <8 x half> } [[VLD3Q_V]], 0
+// CHECK-NEXT: [[VLD3Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x half>, <8 x half>, <8 x half> } [[VLD3Q_V]], 1
+// CHECK-NEXT: [[VLD3Q_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x half>, <8 x half>, <8 x half> } [[VLD3Q_V]], 2
+// CHECK-NEXT: store <8 x half> [[VLD3Q_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <8 x half> [[VLD3Q_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 32
+// CHECK-NEXT: store <8 x half> [[VLD3Q_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16
// CHECK-NEXT: ret void
//
float16x8x3_t test_vld3q_f16(float16_t const * a) {
@@ -6907,10 +6377,15 @@ float16x8x3_t test_vld3q_f16(float16_t const * a) {
// CHECK-LABEL: define void @test_vld3q_f32(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT32X4X3_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT32X4X3_T]], align 16
// CHECK-NEXT: [[VLD3Q_V:%.*]] = call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3.v4f32.p0(ptr [[A]], i32 4)
-// CHECK-NEXT: store { <4 x float>, <4 x float>, <4 x float> } [[VLD3Q_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 48, i1 false)
+// CHECK-NEXT: [[VLD3Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float> } [[VLD3Q_V]], 0
+// CHECK-NEXT: [[VLD3Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float> } [[VLD3Q_V]], 1
+// CHECK-NEXT: [[VLD3Q_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float> } [[VLD3Q_V]], 2
+// CHECK-NEXT: store <4 x float> [[VLD3Q_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <4 x float> [[VLD3Q_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 32
+// CHECK-NEXT: store <4 x float> [[VLD3Q_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16
// CHECK-NEXT: ret void
//
float32x4x3_t test_vld3q_f32(float32_t const * a) {
@@ -6920,10 +6395,15 @@ float32x4x3_t test_vld3q_f32(float32_t const * a) {
// CHECK-LABEL: define void @test_vld3q_p8(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY8X16X3_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY8X16X3_T]], align 16
// CHECK-NEXT: [[VLD3Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8.p0(ptr [[A]], i32 1)
-// CHECK-NEXT: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3Q_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 48, i1 false)
+// CHECK-NEXT: [[VLD3Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3Q_V]], 0
+// CHECK-NEXT: [[VLD3Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3Q_V]], 1
+// CHECK-NEXT: [[VLD3Q_V_FCA_2_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3Q_V]], 2
+// CHECK-NEXT: store <16 x i8> [[VLD3Q_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <16 x i8> [[VLD3Q_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 32
+// CHECK-NEXT: store <16 x i8> [[VLD3Q_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16
// CHECK-NEXT: ret void
//
poly8x16x3_t test_vld3q_p8(poly8_t const * a) {
@@ -6933,10 +6413,15 @@ poly8x16x3_t test_vld3q_p8(poly8_t const * a) {
// CHECK-LABEL: define void @test_vld3q_p16(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY16X8X3_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY16X8X3_T]], align 16
// CHECK-NEXT: [[VLD3Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3.v8i16.p0(ptr [[A]], i32 2)
-// CHECK-NEXT: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 48, i1 false)
+// CHECK-NEXT: [[VLD3Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_V]], 0
+// CHECK-NEXT: [[VLD3Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_V]], 1
+// CHECK-NEXT: [[VLD3Q_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_V]], 2
+// CHECK-NEXT: store <8 x i16> [[VLD3Q_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <8 x i16> [[VLD3Q_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 32
+// CHECK-NEXT: store <8 x i16> [[VLD3Q_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16
// CHECK-NEXT: ret void
//
poly16x8x3_t test_vld3q_p16(poly16_t const * a) {
@@ -6946,10 +6431,15 @@ poly16x8x3_t test_vld3q_p16(poly16_t const * a) {
// CHECK-LABEL: define void @test_vld3_u8(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT8X8X3_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT8X8X3_T]], align 8
// CHECK-NEXT: [[VLD3_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3.v8i8.p0(ptr [[A]], i32 1)
-// CHECK-NEXT: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 24, i1 false)
+// CHECK-NEXT: [[VLD3_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_V]], 0
+// CHECK-NEXT: [[VLD3_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_V]], 1
+// CHECK-NEXT: [[VLD3_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_V]], 2
+// CHECK-NEXT: store <8 x i8> [[VLD3_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <8 x i8> [[VLD3_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <8 x i8> [[VLD3_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8
// CHECK-NEXT: ret void
//
uint8x8x3_t test_vld3_u8(uint8_t const * a) {
@@ -6959,10 +6449,15 @@ uint8x8x3_t test_vld3_u8(uint8_t const * a) {
// CHECK-LABEL: define void @test_vld3_u16(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT16X4X3_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT16X4X3_T]], align 8
// CHECK-NEXT: [[VLD3_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3.v4i16.p0(ptr [[A]], i32 2)
-// CHECK-NEXT: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 24, i1 false)
+// CHECK-NEXT: [[VLD3_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_V]], 0
+// CHECK-NEXT: [[VLD3_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_V]], 1
+// CHECK-NEXT: [[VLD3_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_V]], 2
+// CHECK-NEXT: store <4 x i16> [[VLD3_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <4 x i16> [[VLD3_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <4 x i16> [[VLD3_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8
// CHECK-NEXT: ret void
//
uint16x4x3_t test_vld3_u16(uint16_t const * a) {
@@ -6972,10 +6467,15 @@ uint16x4x3_t test_vld3_u16(uint16_t const * a) {
// CHECK-LABEL: define void @test_vld3_u32(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT32X2X3_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT32X2X3_T]], align 8
// CHECK-NEXT: [[VLD3_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3.v2i32.p0(ptr [[A]], i32 4)
-// CHECK-NEXT: store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 24, i1 false)
+// CHECK-NEXT: [[VLD3_V_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_V]], 0
+// CHECK-NEXT: [[VLD3_V_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_V]], 1
+// CHECK-NEXT: [[VLD3_V_FCA_2_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_V]], 2
+// CHECK-NEXT: store <2 x i32> [[VLD3_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <2 x i32> [[VLD3_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <2 x i32> [[VLD3_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8
// CHECK-NEXT: ret void
//
uint32x2x3_t test_vld3_u32(uint32_t const * a) {
@@ -6985,10 +6485,15 @@ uint32x2x3_t test_vld3_u32(uint32_t const * a) {
// CHECK-LABEL: define void @test_vld3_u64(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT64X1X3_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT64X1X3_T]], align 8
// CHECK-NEXT: [[VLD3_V:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3.v1i64.p0(ptr [[A]], i32 4)
-// CHECK-NEXT: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 24, i1 false)
+// CHECK-NEXT: [[VLD3_V_FCA_0_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_V]], 0
+// CHECK-NEXT: [[VLD3_V_FCA_1_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_V]], 1
+// CHECK-NEXT: [[VLD3_V_FCA_2_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_V]], 2
+// CHECK-NEXT: store <1 x i64> [[VLD3_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <1 x i64> [[VLD3_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <1 x i64> [[VLD3_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8
// CHECK-NEXT: ret void
//
uint64x1x3_t test_vld3_u64(uint64_t const * a) {
@@ -6998,10 +6503,15 @@ uint64x1x3_t test_vld3_u64(uint64_t const * a) {
// CHECK-LABEL: define void @test_vld3_s8(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT8X8X3_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT8X8X3_T]], align 8
// CHECK-NEXT: [[VLD3_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3.v8i8.p0(ptr [[A]], i32 1)
-// CHECK-NEXT: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 24, i1 false)
+// CHECK-NEXT: [[VLD3_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_V]], 0
+// CHECK-NEXT: [[VLD3_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_V]], 1
+// CHECK-NEXT: [[VLD3_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_V]], 2
+// CHECK-NEXT: store <8 x i8> [[VLD3_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <8 x i8> [[VLD3_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <8 x i8> [[VLD3_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8
// CHECK-NEXT: ret void
//
int8x8x3_t test_vld3_s8(int8_t const * a) {
@@ -7011,10 +6521,15 @@ int8x8x3_t test_vld3_s8(int8_t const * a) {
// CHECK-LABEL: define void @test_vld3_s16(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT16X4X3_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT16X4X3_T]], align 8
// CHECK-NEXT: [[VLD3_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3.v4i16.p0(ptr [[A]], i32 2)
-// CHECK-NEXT: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 24, i1 false)
+// CHECK-NEXT: [[VLD3_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_V]], 0
+// CHECK-NEXT: [[VLD3_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_V]], 1
+// CHECK-NEXT: [[VLD3_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_V]], 2
+// CHECK-NEXT: store <4 x i16> [[VLD3_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <4 x i16> [[VLD3_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <4 x i16> [[VLD3_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8
// CHECK-NEXT: ret void
//
int16x4x3_t test_vld3_s16(int16_t const * a) {
@@ -7024,10 +6539,15 @@ int16x4x3_t test_vld3_s16(int16_t const * a) {
// CHECK-LABEL: define void @test_vld3_s32(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT32X2X3_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT32X2X3_T]], align 8
// CHECK-NEXT: [[VLD3_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3.v2i32.p0(ptr [[A]], i32 4)
-// CHECK-NEXT: store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 24, i1 false)
+// CHECK-NEXT: [[VLD3_V_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_V]], 0
+// CHECK-NEXT: [[VLD3_V_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_V]], 1
+// CHECK-NEXT: [[VLD3_V_FCA_2_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_V]], 2
+// CHECK-NEXT: store <2 x i32> [[VLD3_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <2 x i32> [[VLD3_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <2 x i32> [[VLD3_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8
// CHECK-NEXT: ret void
//
int32x2x3_t test_vld3_s32(int32_t const * a) {
@@ -7037,10 +6557,15 @@ int32x2x3_t test_vld3_s32(int32_t const * a) {
// CHECK-LABEL: define void @test_vld3_s64(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT64X1X3_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT64X1X3_T]], align 8
// CHECK-NEXT: [[VLD3_V:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3.v1i64.p0(ptr [[A]], i32 4)
-// CHECK-NEXT: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 24, i1 false)
+// CHECK-NEXT: [[VLD3_V_FCA_0_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_V]], 0
+// CHECK-NEXT: [[VLD3_V_FCA_1_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_V]], 1
+// CHECK-NEXT: [[VLD3_V_FCA_2_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_V]], 2
+// CHECK-NEXT: store <1 x i64> [[VLD3_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <1 x i64> [[VLD3_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <1 x i64> [[VLD3_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8
// CHECK-NEXT: ret void
//
int64x1x3_t test_vld3_s64(int64_t const * a) {
@@ -7050,10 +6575,15 @@ int64x1x3_t test_vld3_s64(int64_t const * a) {
// CHECK-LABEL: define void @test_vld3_f16(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT16X4X3_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT16X4X3_T]], align 8
// CHECK-NEXT: [[VLD3_V:%.*]] = call { <4 x half>, <4 x half>, <4 x half> } @llvm.arm.neon.vld3.v4f16.p0(ptr [[A]], i32 2)
-// CHECK-NEXT: store { <4 x half>, <4 x half>, <4 x half> } [[VLD3_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 24, i1 false)
+// CHECK-NEXT: [[VLD3_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x half>, <4 x half>, <4 x half> } [[VLD3_V]], 0
+// CHECK-NEXT: [[VLD3_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x half>, <4 x half>, <4 x half> } [[VLD3_V]], 1
+// CHECK-NEXT: [[VLD3_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x half>, <4 x half>, <4 x half> } [[VLD3_V]], 2
+// CHECK-NEXT: store <4 x half> [[VLD3_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <4 x half> [[VLD3_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <4 x half> [[VLD3_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8
// CHECK-NEXT: ret void
//
float16x4x3_t test_vld3_f16(float16_t const * a) {
@@ -7063,10 +6593,15 @@ float16x4x3_t test_vld3_f16(float16_t const * a) {
// CHECK-LABEL: define void @test_vld3_f32(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT32X2X3_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT32X2X3_T]], align 8
// CHECK-NEXT: [[VLD3_V:%.*]] = call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3.v2f32.p0(ptr [[A]], i32 4)
-// CHECK-NEXT: store { <2 x float>, <2 x float>, <2 x float> } [[VLD3_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 24, i1 false)
+// CHECK-NEXT: [[VLD3_V_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float> } [[VLD3_V]], 0
+// CHECK-NEXT: [[VLD3_V_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float> } [[VLD3_V]], 1
+// CHECK-NEXT: [[VLD3_V_FCA_2_EXTRACT:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float> } [[VLD3_V]], 2
+// CHECK-NEXT: store <2 x float> [[VLD3_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <2 x float> [[VLD3_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <2 x float> [[VLD3_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8
// CHECK-NEXT: ret void
//
float32x2x3_t test_vld3_f32(float32_t const * a) {
@@ -7076,10 +6611,15 @@ float32x2x3_t test_vld3_f32(float32_t const * a) {
// CHECK-LABEL: define void @test_vld3_p8(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY8X8X3_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY8X8X3_T]], align 8
// CHECK-NEXT: [[VLD3_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3.v8i8.p0(ptr [[A]], i32 1)
-// CHECK-NEXT: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 24, i1 false)
+// CHECK-NEXT: [[VLD3_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_V]], 0
+// CHECK-NEXT: [[VLD3_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_V]], 1
+// CHECK-NEXT: [[VLD3_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_V]], 2
+// CHECK-NEXT: store <8 x i8> [[VLD3_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <8 x i8> [[VLD3_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <8 x i8> [[VLD3_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8
// CHECK-NEXT: ret void
//
poly8x8x3_t test_vld3_p8(poly8_t const * a) {
@@ -7089,10 +6629,15 @@ poly8x8x3_t test_vld3_p8(poly8_t const * a) {
// CHECK-LABEL: define void @test_vld3_p16(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY16X4X3_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY16X4X3_T]], align 8
// CHECK-NEXT: [[VLD3_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3.v4i16.p0(ptr [[A]], i32 2)
-// CHECK-NEXT: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 24, i1 false)
+// CHECK-NEXT: [[VLD3_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_V]], 0
+// CHECK-NEXT: [[VLD3_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_V]], 1
+// CHECK-NEXT: [[VLD3_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_V]], 2
+// CHECK-NEXT: store <4 x i16> [[VLD3_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <4 x i16> [[VLD3_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <4 x i16> [[VLD3_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8
// CHECK-NEXT: ret void
//
poly16x4x3_t test_vld3_p16(poly16_t const * a) {
@@ -7102,30 +6647,30 @@ poly16x4x3_t test_vld3_p16(poly16_t const * a) {
// CHECK-LABEL: define void @test_vld3q_lane_u16(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT16X8X3_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT16X8X3_T]], align 16
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT16X8X3_T]], align 16
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT16X8X3_T]], align 16
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X8X3_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [6 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X8X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X8X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8>
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X8X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
-// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x i16>
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK-NEXT: [[VLD3Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16.p0(ptr [[A]], <8 x i16> [[TMP6]], <8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i32 7, i32 2)
-// CHECK-NEXT: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_LANE_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 48, i1 false)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[VLD3Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16.p0(ptr [[A]], <8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], i32 7, i32 2)
+// CHECK-NEXT: [[VLD3Q_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_LANE_V]], 0
+// CHECK-NEXT: [[VLD3Q_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_LANE_V]], 1
+// CHECK-NEXT: [[VLD3Q_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_LANE_V]], 2
+// CHECK-NEXT: store <8 x i16> [[VLD3Q_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <8 x i16> [[VLD3Q_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 32
+// CHECK-NEXT: store <8 x i16> [[VLD3Q_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16
// CHECK-NEXT: ret void
//
uint16x8x3_t test_vld3q_lane_u16(uint16_t const * a, uint16x8x3_t b) {
@@ -7135,30 +6680,30 @@ uint16x8x3_t test_vld3q_lane_u16(uint16_t const * a, uint16x8x3_t b) {
// CHECK-LABEL: define void @test_vld3q_lane_u32(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT32X4X3_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT32X4X3_T]], align 16
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT32X4X3_T]], align 16
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT32X4X3_T]], align 16
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X4X3_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [6 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X4X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X4X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X4X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
-// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x i32>
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
-// CHECK-NEXT: [[VLD3Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32.p0(ptr [[A]], <4 x i32> [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i32 3, i32 4)
-// CHECK-NEXT: store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3Q_LANE_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 48, i1 false)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: [[VLD3Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32.p0(ptr [[A]], <4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], i32 3, i32 4)
+// CHECK-NEXT: [[VLD3Q_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3Q_LANE_V]], 0
+// CHECK-NEXT: [[VLD3Q_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3Q_LANE_V]], 1
+// CHECK-NEXT: [[VLD3Q_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3Q_LANE_V]], 2
+// CHECK-NEXT: store <4 x i32> [[VLD3Q_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <4 x i32> [[VLD3Q_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 32
+// CHECK-NEXT: store <4 x i32> [[VLD3Q_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16
// CHECK-NEXT: ret void
//
uint32x4x3_t test_vld3q_lane_u32(uint32_t const * a, uint32x4x3_t b) {
@@ -7168,30 +6713,30 @@ uint32x4x3_t test_vld3q_lane_u32(uint32_t const * a, uint32x4x3_t b) {
// CHECK-LABEL: define void @test_vld3q_lane_s16(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT16X8X3_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT16X8X3_T]], align 16
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT16X8X3_T]], align 16
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT16X8X3_T]], align 16
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X8X3_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [6 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X8X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X8X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8>
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X8X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
-// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x i16>
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK-NEXT: [[VLD3Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16.p0(ptr [[A]], <8 x i16> [[TMP6]], <8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i32 7, i32 2)
-// CHECK-NEXT: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_LANE_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 48, i1 false)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[VLD3Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16.p0(ptr [[A]], <8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], i32 7, i32 2)
+// CHECK-NEXT: [[VLD3Q_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_LANE_V]], 0
+// CHECK-NEXT: [[VLD3Q_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_LANE_V]], 1
+// CHECK-NEXT: [[VLD3Q_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_LANE_V]], 2
+// CHECK-NEXT: store <8 x i16> [[VLD3Q_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <8 x i16> [[VLD3Q_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 32
+// CHECK-NEXT: store <8 x i16> [[VLD3Q_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16
// CHECK-NEXT: ret void
//
int16x8x3_t test_vld3q_lane_s16(int16_t const * a, int16x8x3_t b) {
@@ -7201,30 +6746,30 @@ int16x8x3_t test_vld3q_lane_s16(int16_t const * a, int16x8x3_t b) {
// CHECK-LABEL: define void @test_vld3q_lane_s32(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT32X4X3_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT32X4X3_T]], align 16
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT32X4X3_T]], align 16
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT32X4X3_T]], align 16
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X4X3_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [6 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X4X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X4X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X4X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
-// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x i32>
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
-// CHECK-NEXT: [[VLD3Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32.p0(ptr [[A]], <4 x i32> [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i32 3, i32 4)
-// CHECK-NEXT: store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3Q_LANE_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 48, i1 false)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: [[VLD3Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32.p0(ptr [[A]], <4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], i32 3, i32 4)
+// CHECK-NEXT: [[VLD3Q_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3Q_LANE_V]], 0
+// CHECK-NEXT: [[VLD3Q_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3Q_LANE_V]], 1
+// CHECK-NEXT: [[VLD3Q_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3Q_LANE_V]], 2
+// CHECK-NEXT: store <4 x i32> [[VLD3Q_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <4 x i32> [[VLD3Q_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 32
+// CHECK-NEXT: store <4 x i32> [[VLD3Q_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16
// CHECK-NEXT: ret void
//
int32x4x3_t test_vld3q_lane_s32(int32_t const * a, int32x4x3_t b) {
@@ -7234,30 +6779,30 @@ int32x4x3_t test_vld3q_lane_s32(int32_t const * a, int32x4x3_t b) {
// CHECK-LABEL: define void @test_vld3q_lane_f16(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT16X8X3_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT16X8X3_T]], align 16
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT16X8X3_T]], align 16
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X8X3_T]], align 16
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X8X3_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [6 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X8X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[TMP0]] to <16 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X8X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x half> [[TMP2]] to <16 x i8>
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X8X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP4:%.*]] = load <8 x half>, ptr [[ARRAYIDX4]], align 16
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
-// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half>
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
-// CHECK-NEXT: [[VLD3Q_LANE_V:%.*]] = call { <8 x half>, <8 x half>, <8 x half> } @llvm.arm.neon.vld3lane.v8f16.p0(ptr [[A]], <8 x half> [[TMP6]], <8 x half> [[TMP7]], <8 x half> [[TMP8]], i32 7, i32 2)
-// CHECK-NEXT: store { <8 x half>, <8 x half>, <8 x half> } [[VLD3Q_LANE_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 48, i1 false)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <8 x half>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <8 x half>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <8 x half>
+// CHECK-NEXT: [[VLD3Q_LANE_V:%.*]] = call { <8 x half>, <8 x half>, <8 x half> } @llvm.arm.neon.vld3lane.v8f16.p0(ptr [[A]], <8 x half> [[TMP0]], <8 x half> [[TMP1]], <8 x half> [[TMP2]], i32 7, i32 2)
+// CHECK-NEXT: [[VLD3Q_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x half>, <8 x half>, <8 x half> } [[VLD3Q_LANE_V]], 0
+// CHECK-NEXT: [[VLD3Q_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x half>, <8 x half>, <8 x half> } [[VLD3Q_LANE_V]], 1
+// CHECK-NEXT: [[VLD3Q_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x half>, <8 x half>, <8 x half> } [[VLD3Q_LANE_V]], 2
+// CHECK-NEXT: store <8 x half> [[VLD3Q_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <8 x half> [[VLD3Q_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 32
+// CHECK-NEXT: store <8 x half> [[VLD3Q_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16
// CHECK-NEXT: ret void
//
float16x8x3_t test_vld3q_lane_f16(float16_t const * a, float16x8x3_t b) {
@@ -7267,30 +6812,30 @@ float16x8x3_t test_vld3q_lane_f16(float16_t const * a, float16x8x3_t b) {
// CHECK-LABEL: define void @test_vld3q_lane_f32(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT32X4X3_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT32X4X3_T]], align 16
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT32X4X3_T]], align 16
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X4X3_T]], align 16
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X4X3_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [6 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X4X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[TMP0]] to <16 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X4X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[TMP2]] to <16 x i8>
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X4X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP4:%.*]] = load <4 x float>, ptr [[ARRAYIDX4]], align 16
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
-// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float>
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
-// CHECK-NEXT: [[VLD3Q_LANE_V:%.*]] = call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3lane.v4f32.p0(ptr [[A]], <4 x float> [[TMP6]], <4 x float> [[TMP7]], <4 x float> [[TMP8]], i32 3, i32 4)
-// CHECK-NEXT: store { <4 x float>, <4 x float>, <4 x float> } [[VLD3Q_LANE_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 48, i1 false)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <4 x float>
+// CHECK-NEXT: [[VLD3Q_LANE_V:%.*]] = call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3lane.v4f32.p0(ptr [[A]], <4 x float> [[TMP0]], <4 x float> [[TMP1]], <4 x float> [[TMP2]], i32 3, i32 4)
+// CHECK-NEXT: [[VLD3Q_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float> } [[VLD3Q_LANE_V]], 0
+// CHECK-NEXT: [[VLD3Q_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float> } [[VLD3Q_LANE_V]], 1
+// CHECK-NEXT: [[VLD3Q_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float> } [[VLD3Q_LANE_V]], 2
+// CHECK-NEXT: store <4 x float> [[VLD3Q_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <4 x float> [[VLD3Q_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 32
+// CHECK-NEXT: store <4 x float> [[VLD3Q_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16
// CHECK-NEXT: ret void
//
float32x4x3_t test_vld3q_lane_f32(float32_t const * a, float32x4x3_t b) {
@@ -7300,30 +6845,30 @@ float32x4x3_t test_vld3q_lane_f32(float32_t const * a, float32x4x3_t b) {
// CHECK-LABEL: define void @test_vld3q_lane_p16(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY16X8X3_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY16X8X3_T]], align 16
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY16X8X3_T]], align 16
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY16X8X3_T]], align 16
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X8X3_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [6 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X8X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X8X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8>
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X8X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
-// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x i16>
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK-NEXT: [[VLD3Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16.p0(ptr [[A]], <8 x i16> [[TMP6]], <8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i32 7, i32 2)
-// CHECK-NEXT: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_LANE_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 48, i1 false)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[VLD3Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16.p0(ptr [[A]], <8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], i32 7, i32 2)
+// CHECK-NEXT: [[VLD3Q_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_LANE_V]], 0
+// CHECK-NEXT: [[VLD3Q_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_LANE_V]], 1
+// CHECK-NEXT: [[VLD3Q_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_LANE_V]], 2
+// CHECK-NEXT: store <8 x i16> [[VLD3Q_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <8 x i16> [[VLD3Q_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 32
+// CHECK-NEXT: store <8 x i16> [[VLD3Q_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16
// CHECK-NEXT: ret void
//
poly16x8x3_t test_vld3q_lane_p16(poly16_t const * a, poly16x8x3_t b) {
@@ -7333,24 +6878,21 @@ poly16x8x3_t test_vld3q_lane_p16(poly16_t const * a, poly16x8x3_t b) {
// CHECK-LABEL: define void @test_vld3_lane_u8(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT8X8X3_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT8X8X3_T]], align 8
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT8X8X3_T]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT8X8X3_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X3_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [3 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8>
// CHECK-NEXT: [[VLD3_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8.p0(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], i32 7, i32 1)
-// CHECK-NEXT: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 24, i1 false)
+// CHECK-NEXT: [[VLD3_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE_V]], 0
+// CHECK-NEXT: [[VLD3_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE_V]], 1
+// CHECK-NEXT: [[VLD3_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE_V]], 2
+// CHECK-NEXT: store <8 x i8> [[VLD3_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <8 x i8> [[VLD3_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <8 x i8> [[VLD3_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8
// CHECK-NEXT: ret void
//
uint8x8x3_t test_vld3_lane_u8(uint8_t const * a, uint8x8x3_t b) {
@@ -7360,30 +6902,21 @@ uint8x8x3_t test_vld3_lane_u8(uint8_t const * a, uint8x8x3_t b) {
// CHECK-LABEL: define void @test_vld3_lane_u16(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT16X4X3_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT16X4X3_T]], align 8
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT16X4X3_T]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT16X4X3_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X4X3_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [3 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X4X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X4X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X4X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
-// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
-// CHECK-NEXT: [[VLD3_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16.p0(ptr [[A]], <4 x i16> [[TMP6]], <4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i32 3, i32 2)
-// CHECK-NEXT: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 24, i1 false)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[VLD3_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16.p0(ptr [[A]], <4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]], i32 3, i32 2)
+// CHECK-NEXT: [[VLD3_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE_V]], 0
+// CHECK-NEXT: [[VLD3_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE_V]], 1
+// CHECK-NEXT: [[VLD3_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE_V]], 2
+// CHECK-NEXT: store <4 x i16> [[VLD3_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <4 x i16> [[VLD3_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <4 x i16> [[VLD3_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8
// CHECK-NEXT: ret void
//
uint16x4x3_t test_vld3_lane_u16(uint16_t const * a, uint16x4x3_t b) {
@@ -7393,30 +6926,21 @@ uint16x4x3_t test_vld3_lane_u16(uint16_t const * a, uint16x4x3_t b) {
// CHECK-LABEL: define void @test_vld3_lane_u32(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT32X2X3_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT32X2X3_T]], align 8
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT32X2X3_T]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT32X2X3_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X2X3_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [3 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X2X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X2X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X2X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
-// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
-// CHECK-NEXT: [[VLD3_LANE_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3lane.v2i32.p0(ptr [[A]], <2 x i32> [[TMP6]], <2 x i32> [[TMP7]], <2 x i32> [[TMP8]], i32 1, i32 4)
-// CHECK-NEXT: store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_LANE_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 24, i1 false)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[VLD3_LANE_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3lane.v2i32.p0(ptr [[A]], <2 x i32> [[TMP0]], <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], i32 1, i32 4)
+// CHECK-NEXT: [[VLD3_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_LANE_V]], 0
+// CHECK-NEXT: [[VLD3_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_LANE_V]], 1
+// CHECK-NEXT: [[VLD3_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_LANE_V]], 2
+// CHECK-NEXT: store <2 x i32> [[VLD3_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <2 x i32> [[VLD3_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <2 x i32> [[VLD3_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8
// CHECK-NEXT: ret void
//
uint32x2x3_t test_vld3_lane_u32(uint32_t const * a, uint32x2x3_t b) {
@@ -7426,24 +6950,21 @@ uint32x2x3_t test_vld3_lane_u32(uint32_t const * a, uint32x2x3_t b) {
// CHECK-LABEL: define void @test_vld3_lane_s8(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT8X8X3_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT8X8X3_T]], align 8
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT8X8X3_T]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT8X8X3_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X3_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [3 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8>
// CHECK-NEXT: [[VLD3_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8.p0(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], i32 7, i32 1)
-// CHECK-NEXT: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 24, i1 false)
+// CHECK-NEXT: [[VLD3_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE_V]], 0
+// CHECK-NEXT: [[VLD3_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE_V]], 1
+// CHECK-NEXT: [[VLD3_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE_V]], 2
+// CHECK-NEXT: store <8 x i8> [[VLD3_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <8 x i8> [[VLD3_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <8 x i8> [[VLD3_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8
// CHECK-NEXT: ret void
//
int8x8x3_t test_vld3_lane_s8(int8_t const * a, int8x8x3_t b) {
@@ -7453,30 +6974,21 @@ int8x8x3_t test_vld3_lane_s8(int8_t const * a, int8x8x3_t b) {
// CHECK-LABEL: define void @test_vld3_lane_s16(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT16X4X3_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT16X4X3_T]], align 8
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT16X4X3_T]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT16X4X3_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X4X3_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [3 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X4X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X4X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X4X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
-// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
-// CHECK-NEXT: [[VLD3_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16.p0(ptr [[A]], <4 x i16> [[TMP6]], <4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i32 3, i32 2)
-// CHECK-NEXT: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 24, i1 false)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[VLD3_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16.p0(ptr [[A]], <4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]], i32 3, i32 2)
+// CHECK-NEXT: [[VLD3_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE_V]], 0
+// CHECK-NEXT: [[VLD3_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE_V]], 1
+// CHECK-NEXT: [[VLD3_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE_V]], 2
+// CHECK-NEXT: store <4 x i16> [[VLD3_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <4 x i16> [[VLD3_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <4 x i16> [[VLD3_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8
// CHECK-NEXT: ret void
//
int16x4x3_t test_vld3_lane_s16(int16_t const * a, int16x4x3_t b) {
@@ -7486,30 +6998,21 @@ int16x4x3_t test_vld3_lane_s16(int16_t const * a, int16x4x3_t b) {
// CHECK-LABEL: define void @test_vld3_lane_s32(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT32X2X3_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT32X2X3_T]], align 8
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT32X2X3_T]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT32X2X3_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X2X3_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [3 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X2X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X2X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X2X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
-// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
-// CHECK-NEXT: [[VLD3_LANE_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3lane.v2i32.p0(ptr [[A]], <2 x i32> [[TMP6]], <2 x i32> [[TMP7]], <2 x i32> [[TMP8]], i32 1, i32 4)
-// CHECK-NEXT: store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_LANE_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 24, i1 false)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[VLD3_LANE_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3lane.v2i32.p0(ptr [[A]], <2 x i32> [[TMP0]], <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], i32 1, i32 4)
+// CHECK-NEXT: [[VLD3_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_LANE_V]], 0
+// CHECK-NEXT: [[VLD3_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_LANE_V]], 1
+// CHECK-NEXT: [[VLD3_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_LANE_V]], 2
+// CHECK-NEXT: store <2 x i32> [[VLD3_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <2 x i32> [[VLD3_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <2 x i32> [[VLD3_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8
// CHECK-NEXT: ret void
//
int32x2x3_t test_vld3_lane_s32(int32_t const * a, int32x2x3_t b) {
@@ -7519,30 +7022,21 @@ int32x2x3_t test_vld3_lane_s32(int32_t const * a, int32x2x3_t b) {
// CHECK-LABEL: define void @test_vld3_lane_f16(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT16X4X3_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT16X4X3_T]], align 8
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT16X4X3_T]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X4X3_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X4X3_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [3 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X4X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[TMP0]] to <8 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X4X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x half> [[TMP2]] to <8 x i8>
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X4X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP4:%.*]] = load <4 x half>, ptr [[ARRAYIDX4]], align 8
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
-// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half>
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
-// CHECK-NEXT: [[VLD3_LANE_V:%.*]] = call { <4 x half>, <4 x half>, <4 x half> } @llvm.arm.neon.vld3lane.v4f16.p0(ptr [[A]], <4 x half> [[TMP6]], <4 x half> [[TMP7]], <4 x half> [[TMP8]], i32 3, i32 2)
-// CHECK-NEXT: store { <4 x half>, <4 x half>, <4 x half> } [[VLD3_LANE_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 24, i1 false)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x half>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x half>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <4 x half>
+// CHECK-NEXT: [[VLD3_LANE_V:%.*]] = call { <4 x half>, <4 x half>, <4 x half> } @llvm.arm.neon.vld3lane.v4f16.p0(ptr [[A]], <4 x half> [[TMP0]], <4 x half> [[TMP1]], <4 x half> [[TMP2]], i32 3, i32 2)
+// CHECK-NEXT: [[VLD3_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x half>, <4 x half>, <4 x half> } [[VLD3_LANE_V]], 0
+// CHECK-NEXT: [[VLD3_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x half>, <4 x half>, <4 x half> } [[VLD3_LANE_V]], 1
+// CHECK-NEXT: [[VLD3_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x half>, <4 x half>, <4 x half> } [[VLD3_LANE_V]], 2
+// CHECK-NEXT: store <4 x half> [[VLD3_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <4 x half> [[VLD3_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <4 x half> [[VLD3_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8
// CHECK-NEXT: ret void
//
float16x4x3_t test_vld3_lane_f16(float16_t const * a, float16x4x3_t b) {
@@ -7552,30 +7046,21 @@ float16x4x3_t test_vld3_lane_f16(float16_t const * a, float16x4x3_t b) {
// CHECK-LABEL: define void @test_vld3_lane_f32(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT32X2X3_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT32X2X3_T]], align 8
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT32X2X3_T]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X2X3_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X2X3_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [3 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X2X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[TMP0]] to <8 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X2X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[TMP2]] to <8 x i8>
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X2X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 8
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
-// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float>
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
-// CHECK-NEXT: [[VLD3_LANE_V:%.*]] = call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3lane.v2f32.p0(ptr [[A]], <2 x float> [[TMP6]], <2 x float> [[TMP7]], <2 x float> [[TMP8]], i32 1, i32 4)
-// CHECK-NEXT: store { <2 x float>, <2 x float>, <2 x float> } [[VLD3_LANE_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 24, i1 false)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <2 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <2 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <2 x float>
+// CHECK-NEXT: [[VLD3_LANE_V:%.*]] = call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3lane.v2f32.p0(ptr [[A]], <2 x float> [[TMP0]], <2 x float> [[TMP1]], <2 x float> [[TMP2]], i32 1, i32 4)
+// CHECK-NEXT: [[VLD3_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float> } [[VLD3_LANE_V]], 0
+// CHECK-NEXT: [[VLD3_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float> } [[VLD3_LANE_V]], 1
+// CHECK-NEXT: [[VLD3_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float> } [[VLD3_LANE_V]], 2
+// CHECK-NEXT: store <2 x float> [[VLD3_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <2 x float> [[VLD3_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <2 x float> [[VLD3_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8
// CHECK-NEXT: ret void
//
float32x2x3_t test_vld3_lane_f32(float32_t const * a, float32x2x3_t b) {
@@ -7585,24 +7070,21 @@ float32x2x3_t test_vld3_lane_f32(float32_t const * a, float32x2x3_t b) {
// CHECK-LABEL: define void @test_vld3_lane_p8(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY8X8X3_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY8X8X3_T]], align 8
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY8X8X3_T]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY8X8X3_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X3_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [3 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8>
// CHECK-NEXT: [[VLD3_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8.p0(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], i32 7, i32 1)
-// CHECK-NEXT: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 24, i1 false)
+// CHECK-NEXT: [[VLD3_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE_V]], 0
+// CHECK-NEXT: [[VLD3_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE_V]], 1
+// CHECK-NEXT: [[VLD3_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE_V]], 2
+// CHECK-NEXT: store <8 x i8> [[VLD3_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <8 x i8> [[VLD3_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <8 x i8> [[VLD3_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8
// CHECK-NEXT: ret void
//
poly8x8x3_t test_vld3_lane_p8(poly8_t const * a, poly8x8x3_t b) {
@@ -7612,30 +7094,21 @@ poly8x8x3_t test_vld3_lane_p8(poly8_t const * a, poly8x8x3_t b) {
// CHECK-LABEL: define void @test_vld3_lane_p16(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY16X4X3_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY16X4X3_T]], align 8
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY16X4X3_T]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY16X4X3_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X4X3_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [3 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X4X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X4X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X4X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
-// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
-// CHECK-NEXT: [[VLD3_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16.p0(ptr [[A]], <4 x i16> [[TMP6]], <4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i32 3, i32 2)
-// CHECK-NEXT: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 24, i1 false)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[VLD3_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16.p0(ptr [[A]], <4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]], i32 3, i32 2)
+// CHECK-NEXT: [[VLD3_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE_V]], 0
+// CHECK-NEXT: [[VLD3_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE_V]], 1
+// CHECK-NEXT: [[VLD3_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE_V]], 2
+// CHECK-NEXT: store <4 x i16> [[VLD3_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <4 x i16> [[VLD3_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <4 x i16> [[VLD3_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8
// CHECK-NEXT: ret void
//
poly16x4x3_t test_vld3_lane_p16(poly16_t const * a, poly16x4x3_t b) {
@@ -7645,10 +7118,18 @@ poly16x4x3_t test_vld3_lane_p16(poly16_t const * a, poly16x4x3_t b) {
// CHECK-LABEL: define void @test_vld4q_u8(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT8X16X4_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT8X16X4_T]], align 16
// CHECK-NEXT: [[VLD4Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4.v16i8.p0(ptr [[A]], i32 1)
-// CHECK-NEXT: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4Q_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 64, i1 false)
+// CHECK-NEXT: [[VLD4Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4Q_V]], 0
+// CHECK-NEXT: [[VLD4Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4Q_V]], 1
+// CHECK-NEXT: [[VLD4Q_V_FCA_2_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4Q_V]], 2
+// CHECK-NEXT: [[VLD4Q_V_FCA_3_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4Q_V]], 3
+// CHECK-NEXT: store <16 x i8> [[VLD4Q_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <16 x i8> [[VLD4Q_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 32
+// CHECK-NEXT: store <16 x i8> [[VLD4Q_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 48
+// CHECK-NEXT: store <16 x i8> [[VLD4Q_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 16
// CHECK-NEXT: ret void
//
uint8x16x4_t test_vld4q_u8(uint8_t const * a) {
@@ -7658,10 +7139,18 @@ uint8x16x4_t test_vld4q_u8(uint8_t const * a) {
// CHECK-LABEL: define void @test_vld4q_u16(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT16X8X4_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT16X8X4_T]], align 16
// CHECK-NEXT: [[VLD4Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4.v8i16.p0(ptr [[A]], i32 2)
-// CHECK-NEXT: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 64, i1 false)
+// CHECK-NEXT: [[VLD4Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_V]], 0
+// CHECK-NEXT: [[VLD4Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_V]], 1
+// CHECK-NEXT: [[VLD4Q_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_V]], 2
+// CHECK-NEXT: [[VLD4Q_V_FCA_3_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_V]], 3
+// CHECK-NEXT: store <8 x i16> [[VLD4Q_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <8 x i16> [[VLD4Q_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 32
+// CHECK-NEXT: store <8 x i16> [[VLD4Q_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 48
+// CHECK-NEXT: store <8 x i16> [[VLD4Q_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 16
// CHECK-NEXT: ret void
//
uint16x8x4_t test_vld4q_u16(uint16_t const * a) {
@@ -7671,10 +7160,18 @@ uint16x8x4_t test_vld4q_u16(uint16_t const * a) {
// CHECK-LABEL: define void @test_vld4q_u32(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT32X4X4_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT32X4X4_T]], align 16
// CHECK-NEXT: [[VLD4Q_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4.v4i32.p0(ptr [[A]], i32 4)
-// CHECK-NEXT: store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4Q_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 64, i1 false)
+// CHECK-NEXT: [[VLD4Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4Q_V]], 0
+// CHECK-NEXT: [[VLD4Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4Q_V]], 1
+// CHECK-NEXT: [[VLD4Q_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4Q_V]], 2
+// CHECK-NEXT: [[VLD4Q_V_FCA_3_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4Q_V]], 3
+// CHECK-NEXT: store <4 x i32> [[VLD4Q_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <4 x i32> [[VLD4Q_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 32
+// CHECK-NEXT: store <4 x i32> [[VLD4Q_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 48
+// CHECK-NEXT: store <4 x i32> [[VLD4Q_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 16
// CHECK-NEXT: ret void
//
uint32x4x4_t test_vld4q_u32(uint32_t const * a) {
@@ -7684,10 +7181,18 @@ uint32x4x4_t test_vld4q_u32(uint32_t const * a) {
// CHECK-LABEL: define void @test_vld4q_s8(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT8X16X4_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT8X16X4_T]], align 16
// CHECK-NEXT: [[VLD4Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4.v16i8.p0(ptr [[A]], i32 1)
-// CHECK-NEXT: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4Q_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 64, i1 false)
+// CHECK-NEXT: [[VLD4Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4Q_V]], 0
+// CHECK-NEXT: [[VLD4Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4Q_V]], 1
+// CHECK-NEXT: [[VLD4Q_V_FCA_2_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4Q_V]], 2
+// CHECK-NEXT: [[VLD4Q_V_FCA_3_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4Q_V]], 3
+// CHECK-NEXT: store <16 x i8> [[VLD4Q_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <16 x i8> [[VLD4Q_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 32
+// CHECK-NEXT: store <16 x i8> [[VLD4Q_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 48
+// CHECK-NEXT: store <16 x i8> [[VLD4Q_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 16
// CHECK-NEXT: ret void
//
int8x16x4_t test_vld4q_s8(int8_t const * a) {
@@ -7697,10 +7202,18 @@ int8x16x4_t test_vld4q_s8(int8_t const * a) {
// CHECK-LABEL: define void @test_vld4q_s16(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT16X8X4_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT16X8X4_T]], align 16
// CHECK-NEXT: [[VLD4Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4.v8i16.p0(ptr [[A]], i32 2)
-// CHECK-NEXT: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 64, i1 false)
+// CHECK-NEXT: [[VLD4Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_V]], 0
+// CHECK-NEXT: [[VLD4Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_V]], 1
+// CHECK-NEXT: [[VLD4Q_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_V]], 2
+// CHECK-NEXT: [[VLD4Q_V_FCA_3_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_V]], 3
+// CHECK-NEXT: store <8 x i16> [[VLD4Q_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <8 x i16> [[VLD4Q_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 32
+// CHECK-NEXT: store <8 x i16> [[VLD4Q_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 48
+// CHECK-NEXT: store <8 x i16> [[VLD4Q_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 16
// CHECK-NEXT: ret void
//
int16x8x4_t test_vld4q_s16(int16_t const * a) {
@@ -7710,10 +7223,18 @@ int16x8x4_t test_vld4q_s16(int16_t const * a) {
// CHECK-LABEL: define void @test_vld4q_s32(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT32X4X4_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT32X4X4_T]], align 16
// CHECK-NEXT: [[VLD4Q_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4.v4i32.p0(ptr [[A]], i32 4)
-// CHECK-NEXT: store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4Q_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 64, i1 false)
+// CHECK-NEXT: [[VLD4Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4Q_V]], 0
+// CHECK-NEXT: [[VLD4Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4Q_V]], 1
+// CHECK-NEXT: [[VLD4Q_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4Q_V]], 2
+// CHECK-NEXT: [[VLD4Q_V_FCA_3_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4Q_V]], 3
+// CHECK-NEXT: store <4 x i32> [[VLD4Q_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <4 x i32> [[VLD4Q_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 32
+// CHECK-NEXT: store <4 x i32> [[VLD4Q_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 48
+// CHECK-NEXT: store <4 x i32> [[VLD4Q_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 16
// CHECK-NEXT: ret void
//
int32x4x4_t test_vld4q_s32(int32_t const * a) {
@@ -7723,10 +7244,18 @@ int32x4x4_t test_vld4q_s32(int32_t const * a) {
// CHECK-LABEL: define void @test_vld4q_f16(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT16X8X4_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT16X8X4_T]], align 16
// CHECK-NEXT: [[VLD4Q_V:%.*]] = call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.arm.neon.vld4.v8f16.p0(ptr [[A]], i32 2)
-// CHECK-NEXT: store { <8 x half>, <8 x half>, <8 x half>, <8 x half> } [[VLD4Q_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 64, i1 false)
+// CHECK-NEXT: [[VLD4Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x half>, <8 x half>, <8 x half>, <8 x half> } [[VLD4Q_V]], 0
+// CHECK-NEXT: [[VLD4Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x half>, <8 x half>, <8 x half>, <8 x half> } [[VLD4Q_V]], 1
+// CHECK-NEXT: [[VLD4Q_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x half>, <8 x half>, <8 x half>, <8 x half> } [[VLD4Q_V]], 2
+// CHECK-NEXT: [[VLD4Q_V_FCA_3_EXTRACT:%.*]] = extractvalue { <8 x half>, <8 x half>, <8 x half>, <8 x half> } [[VLD4Q_V]], 3
+// CHECK-NEXT: store <8 x half> [[VLD4Q_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <8 x half> [[VLD4Q_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 32
+// CHECK-NEXT: store <8 x half> [[VLD4Q_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 48
+// CHECK-NEXT: store <8 x half> [[VLD4Q_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 16
// CHECK-NEXT: ret void
//
float16x8x4_t test_vld4q_f16(float16_t const * a) {
@@ -7736,10 +7265,18 @@ float16x8x4_t test_vld4q_f16(float16_t const * a) {
// CHECK-LABEL: define void @test_vld4q_f32(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT32X4X4_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT32X4X4_T]], align 16
// CHECK-NEXT: [[VLD4Q_V:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4.v4f32.p0(ptr [[A]], i32 4)
-// CHECK-NEXT: store { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4Q_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 64, i1 false)
+// CHECK-NEXT: [[VLD4Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4Q_V]], 0
+// CHECK-NEXT: [[VLD4Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4Q_V]], 1
+// CHECK-NEXT: [[VLD4Q_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4Q_V]], 2
+// CHECK-NEXT: [[VLD4Q_V_FCA_3_EXTRACT:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4Q_V]], 3
+// CHECK-NEXT: store <4 x float> [[VLD4Q_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <4 x float> [[VLD4Q_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 32
+// CHECK-NEXT: store <4 x float> [[VLD4Q_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 48
+// CHECK-NEXT: store <4 x float> [[VLD4Q_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 16
// CHECK-NEXT: ret void
//
float32x4x4_t test_vld4q_f32(float32_t const * a) {
@@ -7749,10 +7286,18 @@ float32x4x4_t test_vld4q_f32(float32_t const * a) {
// CHECK-LABEL: define void @test_vld4q_p8(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY8X16X4_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY8X16X4_T]], align 16
// CHECK-NEXT: [[VLD4Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4.v16i8.p0(ptr [[A]], i32 1)
-// CHECK-NEXT: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4Q_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 64, i1 false)
+// CHECK-NEXT: [[VLD4Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4Q_V]], 0
+// CHECK-NEXT: [[VLD4Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4Q_V]], 1
+// CHECK-NEXT: [[VLD4Q_V_FCA_2_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4Q_V]], 2
+// CHECK-NEXT: [[VLD4Q_V_FCA_3_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4Q_V]], 3
+// CHECK-NEXT: store <16 x i8> [[VLD4Q_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <16 x i8> [[VLD4Q_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 32
+// CHECK-NEXT: store <16 x i8> [[VLD4Q_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 48
+// CHECK-NEXT: store <16 x i8> [[VLD4Q_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 16
// CHECK-NEXT: ret void
//
poly8x16x4_t test_vld4q_p8(poly8_t const * a) {
@@ -7762,10 +7307,18 @@ poly8x16x4_t test_vld4q_p8(poly8_t const * a) {
// CHECK-LABEL: define void @test_vld4q_p16(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY16X8X4_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY16X8X4_T]], align 16
// CHECK-NEXT: [[VLD4Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4.v8i16.p0(ptr [[A]], i32 2)
-// CHECK-NEXT: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 64, i1 false)
+// CHECK-NEXT: [[VLD4Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_V]], 0
+// CHECK-NEXT: [[VLD4Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_V]], 1
+// CHECK-NEXT: [[VLD4Q_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_V]], 2
+// CHECK-NEXT: [[VLD4Q_V_FCA_3_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_V]], 3
+// CHECK-NEXT: store <8 x i16> [[VLD4Q_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <8 x i16> [[VLD4Q_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 32
+// CHECK-NEXT: store <8 x i16> [[VLD4Q_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 48
+// CHECK-NEXT: store <8 x i16> [[VLD4Q_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 16
// CHECK-NEXT: ret void
//
poly16x8x4_t test_vld4q_p16(poly16_t const * a) {
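Throughout these q-register hunks the updated checks verify the same store layout: instead of spilling the literal struct to a local and memcpy'ing it out, each extracted vector is stored straight into the sret slot at byte offsets 0, 16, 32 and 48. The d-register forms that follow use the same pattern with 8-byte fields. A hedged C model of that layout (helper and type names are invented for illustration):

#include <stdint.h>
#include <string.h>

typedef struct { uint8_t b[16]; } q_vec;   /* stand-in for one 128-bit vector */

/* Mirrors the four stores the checks expect: field j lands at offset 16*j
   of the sret buffer, with no intermediate __ret alloca. */
void write_sret_q(void *agg_result, const q_vec v[4]) {
  for (int j = 0; j < 4; ++j)
    memcpy((uint8_t *)agg_result + 16 * j, &v[j], sizeof(q_vec));
}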
@@ -7775,10 +7328,18 @@ poly16x8x4_t test_vld4q_p16(poly16_t const * a) {
// CHECK-LABEL: define void @test_vld4_u8(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT8X8X4_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT8X8X4_T]], align 8
// CHECK-NEXT: [[VLD4_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4.v8i8.p0(ptr [[A]], i32 1)
-// CHECK-NEXT: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 32, i1 false)
+// CHECK-NEXT: [[VLD4_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_V]], 0
+// CHECK-NEXT: [[VLD4_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_V]], 1
+// CHECK-NEXT: [[VLD4_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_V]], 2
+// CHECK-NEXT: [[VLD4_V_FCA_3_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_V]], 3
+// CHECK-NEXT: store <8 x i8> [[VLD4_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <8 x i8> [[VLD4_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <8 x i8> [[VLD4_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 24
+// CHECK-NEXT: store <8 x i8> [[VLD4_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 8
// CHECK-NEXT: ret void
//
uint8x8x4_t test_vld4_u8(uint8_t const * a) {
@@ -7788,10 +7349,18 @@ uint8x8x4_t test_vld4_u8(uint8_t const * a) {
// CHECK-LABEL: define void @test_vld4_u16(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT16X4X4_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT16X4X4_T]], align 8
// CHECK-NEXT: [[VLD4_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4.v4i16.p0(ptr [[A]], i32 2)
-// CHECK-NEXT: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 32, i1 false)
+// CHECK-NEXT: [[VLD4_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_V]], 0
+// CHECK-NEXT: [[VLD4_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_V]], 1
+// CHECK-NEXT: [[VLD4_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_V]], 2
+// CHECK-NEXT: [[VLD4_V_FCA_3_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_V]], 3
+// CHECK-NEXT: store <4 x i16> [[VLD4_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <4 x i16> [[VLD4_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <4 x i16> [[VLD4_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 24
+// CHECK-NEXT: store <4 x i16> [[VLD4_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 8
// CHECK-NEXT: ret void
//
uint16x4x4_t test_vld4_u16(uint16_t const * a) {
@@ -7801,10 +7370,18 @@ uint16x4x4_t test_vld4_u16(uint16_t const * a) {
// CHECK-LABEL: define void @test_vld4_u32(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT32X2X4_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT32X2X4_T]], align 8
// CHECK-NEXT: [[VLD4_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4.v2i32.p0(ptr [[A]], i32 4)
-// CHECK-NEXT: store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 32, i1 false)
+// CHECK-NEXT: [[VLD4_V_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_V]], 0
+// CHECK-NEXT: [[VLD4_V_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_V]], 1
+// CHECK-NEXT: [[VLD4_V_FCA_2_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_V]], 2
+// CHECK-NEXT: [[VLD4_V_FCA_3_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_V]], 3
+// CHECK-NEXT: store <2 x i32> [[VLD4_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <2 x i32> [[VLD4_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <2 x i32> [[VLD4_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 24
+// CHECK-NEXT: store <2 x i32> [[VLD4_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 8
// CHECK-NEXT: ret void
//
uint32x2x4_t test_vld4_u32(uint32_t const * a) {
@@ -7814,10 +7391,18 @@ uint32x2x4_t test_vld4_u32(uint32_t const * a) {
// CHECK-LABEL: define void @test_vld4_u64(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT64X1X4_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT64X1X4_T]], align 8
// CHECK-NEXT: [[VLD4_V:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4.v1i64.p0(ptr [[A]], i32 4)
-// CHECK-NEXT: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 32, i1 false)
+// CHECK-NEXT: [[VLD4_V_FCA_0_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_V]], 0
+// CHECK-NEXT: [[VLD4_V_FCA_1_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_V]], 1
+// CHECK-NEXT: [[VLD4_V_FCA_2_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_V]], 2
+// CHECK-NEXT: [[VLD4_V_FCA_3_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_V]], 3
+// CHECK-NEXT: store <1 x i64> [[VLD4_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <1 x i64> [[VLD4_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <1 x i64> [[VLD4_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 24
+// CHECK-NEXT: store <1 x i64> [[VLD4_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 8
// CHECK-NEXT: ret void
//
uint64x1x4_t test_vld4_u64(uint64_t const * a) {
@@ -7827,10 +7412,18 @@ uint64x1x4_t test_vld4_u64(uint64_t const * a) {
// CHECK-LABEL: define void @test_vld4_s8(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT8X8X4_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT8X8X4_T]], align 8
// CHECK-NEXT: [[VLD4_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4.v8i8.p0(ptr [[A]], i32 1)
-// CHECK-NEXT: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 32, i1 false)
+// CHECK-NEXT: [[VLD4_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_V]], 0
+// CHECK-NEXT: [[VLD4_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_V]], 1
+// CHECK-NEXT: [[VLD4_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_V]], 2
+// CHECK-NEXT: [[VLD4_V_FCA_3_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_V]], 3
+// CHECK-NEXT: store <8 x i8> [[VLD4_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <8 x i8> [[VLD4_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <8 x i8> [[VLD4_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 24
+// CHECK-NEXT: store <8 x i8> [[VLD4_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 8
// CHECK-NEXT: ret void
//
int8x8x4_t test_vld4_s8(int8_t const * a) {
@@ -7840,10 +7433,18 @@ int8x8x4_t test_vld4_s8(int8_t const * a) {
// CHECK-LABEL: define void @test_vld4_s16(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT16X4X4_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT16X4X4_T]], align 8
// CHECK-NEXT: [[VLD4_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4.v4i16.p0(ptr [[A]], i32 2)
-// CHECK-NEXT: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 32, i1 false)
+// CHECK-NEXT: [[VLD4_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_V]], 0
+// CHECK-NEXT: [[VLD4_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_V]], 1
+// CHECK-NEXT: [[VLD4_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_V]], 2
+// CHECK-NEXT: [[VLD4_V_FCA_3_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_V]], 3
+// CHECK-NEXT: store <4 x i16> [[VLD4_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <4 x i16> [[VLD4_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <4 x i16> [[VLD4_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 24
+// CHECK-NEXT: store <4 x i16> [[VLD4_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 8
// CHECK-NEXT: ret void
//
int16x4x4_t test_vld4_s16(int16_t const * a) {
@@ -7853,10 +7454,18 @@ int16x4x4_t test_vld4_s16(int16_t const * a) {
// CHECK-LABEL: define void @test_vld4_s32(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT32X2X4_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT32X2X4_T]], align 8
// CHECK-NEXT: [[VLD4_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4.v2i32.p0(ptr [[A]], i32 4)
-// CHECK-NEXT: store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 32, i1 false)
+// CHECK-NEXT: [[VLD4_V_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_V]], 0
+// CHECK-NEXT: [[VLD4_V_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_V]], 1
+// CHECK-NEXT: [[VLD4_V_FCA_2_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_V]], 2
+// CHECK-NEXT: [[VLD4_V_FCA_3_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_V]], 3
+// CHECK-NEXT: store <2 x i32> [[VLD4_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <2 x i32> [[VLD4_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <2 x i32> [[VLD4_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 24
+// CHECK-NEXT: store <2 x i32> [[VLD4_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 8
// CHECK-NEXT: ret void
//
int32x2x4_t test_vld4_s32(int32_t const * a) {
@@ -7866,10 +7475,18 @@ int32x2x4_t test_vld4_s32(int32_t const * a) {
// CHECK-LABEL: define void @test_vld4_s64(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT64X1X4_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT64X1X4_T]], align 8
// CHECK-NEXT: [[VLD4_V:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4.v1i64.p0(ptr [[A]], i32 4)
-// CHECK-NEXT: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 32, i1 false)
+// CHECK-NEXT: [[VLD4_V_FCA_0_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_V]], 0
+// CHECK-NEXT: [[VLD4_V_FCA_1_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_V]], 1
+// CHECK-NEXT: [[VLD4_V_FCA_2_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_V]], 2
+// CHECK-NEXT: [[VLD4_V_FCA_3_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_V]], 3
+// CHECK-NEXT: store <1 x i64> [[VLD4_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <1 x i64> [[VLD4_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <1 x i64> [[VLD4_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 24
+// CHECK-NEXT: store <1 x i64> [[VLD4_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 8
// CHECK-NEXT: ret void
//
int64x1x4_t test_vld4_s64(int64_t const * a) {
@@ -7879,10 +7496,18 @@ int64x1x4_t test_vld4_s64(int64_t const * a) {
// CHECK-LABEL: define void @test_vld4_f16(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT16X4X4_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT16X4X4_T]], align 8
// CHECK-NEXT: [[VLD4_V:%.*]] = call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.arm.neon.vld4.v4f16.p0(ptr [[A]], i32 2)
-// CHECK-NEXT: store { <4 x half>, <4 x half>, <4 x half>, <4 x half> } [[VLD4_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 32, i1 false)
+// CHECK-NEXT: [[VLD4_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x half>, <4 x half>, <4 x half>, <4 x half> } [[VLD4_V]], 0
+// CHECK-NEXT: [[VLD4_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x half>, <4 x half>, <4 x half>, <4 x half> } [[VLD4_V]], 1
+// CHECK-NEXT: [[VLD4_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x half>, <4 x half>, <4 x half>, <4 x half> } [[VLD4_V]], 2
+// CHECK-NEXT: [[VLD4_V_FCA_3_EXTRACT:%.*]] = extractvalue { <4 x half>, <4 x half>, <4 x half>, <4 x half> } [[VLD4_V]], 3
+// CHECK-NEXT: store <4 x half> [[VLD4_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <4 x half> [[VLD4_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <4 x half> [[VLD4_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 24
+// CHECK-NEXT: store <4 x half> [[VLD4_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 8
// CHECK-NEXT: ret void
//
float16x4x4_t test_vld4_f16(float16_t const * a) {
@@ -7892,10 +7517,18 @@ float16x4x4_t test_vld4_f16(float16_t const * a) {
// CHECK-LABEL: define void @test_vld4_f32(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT32X2X4_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT32X2X4_T]], align 8
// CHECK-NEXT: [[VLD4_V:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4.v2f32.p0(ptr [[A]], i32 4)
-// CHECK-NEXT: store { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 32, i1 false)
+// CHECK-NEXT: [[VLD4_V_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4_V]], 0
+// CHECK-NEXT: [[VLD4_V_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4_V]], 1
+// CHECK-NEXT: [[VLD4_V_FCA_2_EXTRACT:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4_V]], 2
+// CHECK-NEXT: [[VLD4_V_FCA_3_EXTRACT:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4_V]], 3
+// CHECK-NEXT: store <2 x float> [[VLD4_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <2 x float> [[VLD4_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <2 x float> [[VLD4_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 24
+// CHECK-NEXT: store <2 x float> [[VLD4_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 8
// CHECK-NEXT: ret void
//
float32x2x4_t test_vld4_f32(float32_t const * a) {
@@ -7905,10 +7538,18 @@ float32x2x4_t test_vld4_f32(float32_t const * a) {
// CHECK-LABEL: define void @test_vld4_p8(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY8X8X4_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY8X8X4_T]], align 8
// CHECK-NEXT: [[VLD4_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4.v8i8.p0(ptr [[A]], i32 1)
-// CHECK-NEXT: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 32, i1 false)
+// CHECK-NEXT: [[VLD4_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_V]], 0
+// CHECK-NEXT: [[VLD4_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_V]], 1
+// CHECK-NEXT: [[VLD4_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_V]], 2
+// CHECK-NEXT: [[VLD4_V_FCA_3_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_V]], 3
+// CHECK-NEXT: store <8 x i8> [[VLD4_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <8 x i8> [[VLD4_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <8 x i8> [[VLD4_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 24
+// CHECK-NEXT: store <8 x i8> [[VLD4_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 8
// CHECK-NEXT: ret void
//
poly8x8x4_t test_vld4_p8(poly8_t const * a) {
@@ -7918,10 +7559,18 @@ poly8x8x4_t test_vld4_p8(poly8_t const * a) {
// CHECK-LABEL: define void @test_vld4_p16(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY16X4X4_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY16X4X4_T]], align 8
// CHECK-NEXT: [[VLD4_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4.v4i16.p0(ptr [[A]], i32 2)
-// CHECK-NEXT: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 32, i1 false)
+// CHECK-NEXT: [[VLD4_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_V]], 0
+// CHECK-NEXT: [[VLD4_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_V]], 1
+// CHECK-NEXT: [[VLD4_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_V]], 2
+// CHECK-NEXT: [[VLD4_V_FCA_3_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_V]], 3
+// CHECK-NEXT: store <4 x i16> [[VLD4_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <4 x i16> [[VLD4_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <4 x i16> [[VLD4_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 24
+// CHECK-NEXT: store <4 x i16> [[VLD4_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 8
// CHECK-NEXT: ret void
//
poly16x4x4_t test_vld4_p16(poly16_t const * a) {
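The lane-variant hunks below also lose their allocas on the argument side: the uint16x8x4_t (and sibling) operand arrives coerced as [8 x i64], and the new checks follow SROA as it rebuilds each 128-bit vector from a pair of i64 words before bitcasting to the element type. A small sketch of that reassembly (assuming little-endian lane order; names are illustrative only):

#include <stdint.h>
#include <string.h>

typedef struct { uint16_t h[8]; } v8u16;   /* stand-in for <8 x i16> */

/* Words 2j and 2j+1 of the coerced array become vector j, matching the
   insertelement/bitcast sequence in the checks. */
void rebuild_operand(const uint64_t coerce[8], v8u16 out[4]) {
  for (int j = 0; j < 4; ++j) {
    uint64_t pair[2] = { coerce[2 * j], coerce[2 * j + 1] };
    memcpy(&out[j], pair, sizeof pair);
  }
}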
@@ -7931,35 +7580,38 @@ poly16x4x4_t test_vld4_p16(poly16_t const * a) {
// CHECK-LABEL: define void @test_vld4q_lane_u16(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT16X8X4_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT16X8X4_T]], align 16
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT16X8X4_T]], align 16
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT16X8X4_T]], align 16
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X8X4_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [8 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8>
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
-// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i32 0, i32 3
-// CHECK-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x i16>
-// CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
-// CHECK-NEXT: [[VLD4Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4lane.v8i16.p0(ptr [[A]], <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i32 7, i32 2)
-// CHECK-NEXT: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_LANE_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 64, i1 false)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_6_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 6
+// CHECK-NEXT: [[B_SROA_9_48_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_6_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_7_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 7
+// CHECK-NEXT: [[B_SROA_9_56_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_9_48_VEC_INSERT]], i64 [[B_COERCE_FCA_7_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_SROA_9_56_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[VLD4Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4lane.v8i16.p0(ptr [[A]], <8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], i32 7, i32 2)
+// CHECK-NEXT: [[VLD4Q_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_LANE_V]], 0
+// CHECK-NEXT: [[VLD4Q_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_LANE_V]], 1
+// CHECK-NEXT: [[VLD4Q_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_LANE_V]], 2
+// CHECK-NEXT: [[VLD4Q_LANE_V_FCA_3_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_LANE_V]], 3
+// CHECK-NEXT: store <8 x i16> [[VLD4Q_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <8 x i16> [[VLD4Q_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 32
+// CHECK-NEXT: store <8 x i16> [[VLD4Q_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 48
+// CHECK-NEXT: store <8 x i16> [[VLD4Q_LANE_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 16
// CHECK-NEXT: ret void
//
uint16x8x4_t test_vld4q_lane_u16(uint16_t const * a, uint16x8x4_t b) {
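For reference, the source-level call these lane checks correspond to pins the lane index at 7 (the `i32 7` operand of the intrinsic); a hedged usage sketch with an invented wrapper name:

#include <arm_neon.h>

/* Loads four consecutive uint16_t from p and writes one into lane 7 of
   each vector of b, leaving every other lane unchanged. */
uint16x8x4_t load_into_lane7(const uint16_t *p, uint16x8x4_t b) {
  return vld4q_lane_u16(p, b, 7);
}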
@@ -7969,35 +7621,38 @@ uint16x8x4_t test_vld4q_lane_u16(uint16_t const * a, uint16x8x4_t b) {
// CHECK-LABEL: define void @test_vld4q_lane_u32(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT32X4X4_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT32X4X4_T]], align 16
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT32X4X4_T]], align 16
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT32X4X4_T]], align 16
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X4X4_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [8 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
-// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL5]], i32 0, i32 3
-// CHECK-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX6]], align 16
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x i32>
-// CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
-// CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
-// CHECK-NEXT: [[VLD4Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32.p0(ptr [[A]], <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], i32 3, i32 4)
-// CHECK-NEXT: store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4Q_LANE_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 64, i1 false)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_6_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 6
+// CHECK-NEXT: [[B_SROA_9_48_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_6_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_7_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 7
+// CHECK-NEXT: [[B_SROA_9_56_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_9_48_VEC_INSERT]], i64 [[B_COERCE_FCA_7_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_SROA_9_56_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: [[VLD4Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32.p0(ptr [[A]], <4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], i32 3, i32 4)
+// CHECK-NEXT: [[VLD4Q_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4Q_LANE_V]], 0
+// CHECK-NEXT: [[VLD4Q_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4Q_LANE_V]], 1
+// CHECK-NEXT: [[VLD4Q_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4Q_LANE_V]], 2
+// CHECK-NEXT: [[VLD4Q_LANE_V_FCA_3_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4Q_LANE_V]], 3
+// CHECK-NEXT: store <4 x i32> [[VLD4Q_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <4 x i32> [[VLD4Q_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 32
+// CHECK-NEXT: store <4 x i32> [[VLD4Q_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 48
+// CHECK-NEXT: store <4 x i32> [[VLD4Q_LANE_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 16
// CHECK-NEXT: ret void
//
uint32x4x4_t test_vld4q_lane_u32(uint32_t const * a, uint32x4x4_t b) {
@@ -8007,35 +7662,38 @@ uint32x4x4_t test_vld4q_lane_u32(uint32_t const * a, uint32x4x4_t b) {
// CHECK-LABEL: define void @test_vld4q_lane_s16(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT16X8X4_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT16X8X4_T]], align 16
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT16X8X4_T]], align 16
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT16X8X4_T]], align 16
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X8X4_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [8 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8>
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
-// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i32 0, i32 3
-// CHECK-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x i16>
-// CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
-// CHECK-NEXT: [[VLD4Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4lane.v8i16.p0(ptr [[A]], <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i32 7, i32 2)
-// CHECK-NEXT: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_LANE_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 64, i1 false)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_6_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 6
+// CHECK-NEXT: [[B_SROA_9_48_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_6_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_7_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 7
+// CHECK-NEXT: [[B_SROA_9_56_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_9_48_VEC_INSERT]], i64 [[B_COERCE_FCA_7_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_SROA_9_56_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[VLD4Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4lane.v8i16.p0(ptr [[A]], <8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], i32 7, i32 2)
+// CHECK-NEXT: [[VLD4Q_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_LANE_V]], 0
+// CHECK-NEXT: [[VLD4Q_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_LANE_V]], 1
+// CHECK-NEXT: [[VLD4Q_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_LANE_V]], 2
+// CHECK-NEXT: [[VLD4Q_LANE_V_FCA_3_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_LANE_V]], 3
+// CHECK-NEXT: store <8 x i16> [[VLD4Q_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <8 x i16> [[VLD4Q_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 32
+// CHECK-NEXT: store <8 x i16> [[VLD4Q_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 48
+// CHECK-NEXT: store <8 x i16> [[VLD4Q_LANE_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 16
// CHECK-NEXT: ret void
//
int16x8x4_t test_vld4q_lane_s16(int16_t const * a, int16x8x4_t b) {
@@ -8045,35 +7703,38 @@ int16x8x4_t test_vld4q_lane_s16(int16_t const * a, int16x8x4_t b) {
// CHECK-LABEL: define void @test_vld4q_lane_s32(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT32X4X4_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT32X4X4_T]], align 16
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT32X4X4_T]], align 16
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT32X4X4_T]], align 16
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X4X4_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [8 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
-// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL5]], i32 0, i32 3
-// CHECK-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX6]], align 16
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x i32>
-// CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
-// CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
-// CHECK-NEXT: [[VLD4Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32.p0(ptr [[A]], <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], i32 3, i32 4)
-// CHECK-NEXT: store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4Q_LANE_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 64, i1 false)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_6_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 6
+// CHECK-NEXT: [[B_SROA_9_48_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_6_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_7_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 7
+// CHECK-NEXT: [[B_SROA_9_56_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_9_48_VEC_INSERT]], i64 [[B_COERCE_FCA_7_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_SROA_9_56_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: [[VLD4Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32.p0(ptr [[A]], <4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], i32 3, i32 4)
+// CHECK-NEXT: [[VLD4Q_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4Q_LANE_V]], 0
+// CHECK-NEXT: [[VLD4Q_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4Q_LANE_V]], 1
+// CHECK-NEXT: [[VLD4Q_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4Q_LANE_V]], 2
+// CHECK-NEXT: [[VLD4Q_LANE_V_FCA_3_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4Q_LANE_V]], 3
+// CHECK-NEXT: store <4 x i32> [[VLD4Q_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <4 x i32> [[VLD4Q_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 32
+// CHECK-NEXT: store <4 x i32> [[VLD4Q_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 48
+// CHECK-NEXT: store <4 x i32> [[VLD4Q_LANE_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 16
// CHECK-NEXT: ret void
//
int32x4x4_t test_vld4q_lane_s32(int32_t const * a, int32x4x4_t b) {
@@ -8083,35 +7744,38 @@ int32x4x4_t test_vld4q_lane_s32(int32_t const * a, int32x4x4_t b) {
// CHECK-LABEL: define void @test_vld4q_lane_f16(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT16X8X4_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT16X8X4_T]], align 16
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT16X8X4_T]], align 16
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X8X4_T]], align 16
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X8X4_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [8 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[TMP0]] to <16 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x half> [[TMP2]] to <16 x i8>
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP4:%.*]] = load <8 x half>, ptr [[ARRAYIDX4]], align 16
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
-// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL5]], i32 0, i32 3
-// CHECK-NEXT: [[TMP6:%.*]] = load <8 x half>, ptr [[ARRAYIDX6]], align 16
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
-// CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half>
-// CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
-// CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x half>
-// CHECK-NEXT: [[VLD4Q_LANE_V:%.*]] = call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.arm.neon.vld4lane.v8f16.p0(ptr [[A]], <8 x half> [[TMP8]], <8 x half> [[TMP9]], <8 x half> [[TMP10]], <8 x half> [[TMP11]], i32 7, i32 2)
-// CHECK-NEXT: store { <8 x half>, <8 x half>, <8 x half>, <8 x half> } [[VLD4Q_LANE_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 64, i1 false)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_6_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 6
+// CHECK-NEXT: [[B_SROA_9_48_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_6_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_7_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 7
+// CHECK-NEXT: [[B_SROA_9_56_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_9_48_VEC_INSERT]], i64 [[B_COERCE_FCA_7_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <8 x half>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <8 x half>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <8 x half>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_SROA_9_56_VEC_INSERT]] to <8 x half>
+// CHECK-NEXT: [[VLD4Q_LANE_V:%.*]] = call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.arm.neon.vld4lane.v8f16.p0(ptr [[A]], <8 x half> [[TMP0]], <8 x half> [[TMP1]], <8 x half> [[TMP2]], <8 x half> [[TMP3]], i32 7, i32 2)
+// CHECK-NEXT: [[VLD4Q_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x half>, <8 x half>, <8 x half>, <8 x half> } [[VLD4Q_LANE_V]], 0
+// CHECK-NEXT: [[VLD4Q_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x half>, <8 x half>, <8 x half>, <8 x half> } [[VLD4Q_LANE_V]], 1
+// CHECK-NEXT: [[VLD4Q_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x half>, <8 x half>, <8 x half>, <8 x half> } [[VLD4Q_LANE_V]], 2
+// CHECK-NEXT: [[VLD4Q_LANE_V_FCA_3_EXTRACT:%.*]] = extractvalue { <8 x half>, <8 x half>, <8 x half>, <8 x half> } [[VLD4Q_LANE_V]], 3
+// CHECK-NEXT: store <8 x half> [[VLD4Q_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <8 x half> [[VLD4Q_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 32
+// CHECK-NEXT: store <8 x half> [[VLD4Q_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 48
+// CHECK-NEXT: store <8 x half> [[VLD4Q_LANE_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 16
// CHECK-NEXT: ret void
//
float16x8x4_t test_vld4q_lane_f16(float16_t const * a, float16x8x4_t b) {
@@ -8121,35 +7785,38 @@ float16x8x4_t test_vld4q_lane_f16(float16_t const * a, float16x8x4_t b) {
// CHECK-LABEL: define void @test_vld4q_lane_f32(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT32X4X4_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT32X4X4_T]], align 16
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT32X4X4_T]], align 16
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X4X4_T]], align 16
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X4X4_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [8 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[TMP0]] to <16 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[TMP2]] to <16 x i8>
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP4:%.*]] = load <4 x float>, ptr [[ARRAYIDX4]], align 16
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
-// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL5]], i32 0, i32 3
-// CHECK-NEXT: [[TMP6:%.*]] = load <4 x float>, ptr [[ARRAYIDX6]], align 16
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
-// CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float>
-// CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
-// CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
-// CHECK-NEXT: [[VLD4Q_LANE_V:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4lane.v4f32.p0(ptr [[A]], <4 x float> [[TMP8]], <4 x float> [[TMP9]], <4 x float> [[TMP10]], <4 x float> [[TMP11]], i32 3, i32 4)
-// CHECK-NEXT: store { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4Q_LANE_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 64, i1 false)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_6_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 6
+// CHECK-NEXT: [[B_SROA_9_48_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_6_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_7_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 7
+// CHECK-NEXT: [[B_SROA_9_56_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_9_48_VEC_INSERT]], i64 [[B_COERCE_FCA_7_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <4 x float>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_SROA_9_56_VEC_INSERT]] to <4 x float>
+// CHECK-NEXT: [[VLD4Q_LANE_V:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4lane.v4f32.p0(ptr [[A]], <4 x float> [[TMP0]], <4 x float> [[TMP1]], <4 x float> [[TMP2]], <4 x float> [[TMP3]], i32 3, i32 4)
+// CHECK-NEXT: [[VLD4Q_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4Q_LANE_V]], 0
+// CHECK-NEXT: [[VLD4Q_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4Q_LANE_V]], 1
+// CHECK-NEXT: [[VLD4Q_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4Q_LANE_V]], 2
+// CHECK-NEXT: [[VLD4Q_LANE_V_FCA_3_EXTRACT:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4Q_LANE_V]], 3
+// CHECK-NEXT: store <4 x float> [[VLD4Q_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <4 x float> [[VLD4Q_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 32
+// CHECK-NEXT: store <4 x float> [[VLD4Q_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 48
+// CHECK-NEXT: store <4 x float> [[VLD4Q_LANE_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 16
// CHECK-NEXT: ret void
//
float32x4x4_t test_vld4q_lane_f32(float32_t const * a, float32x4x4_t b) {
@@ -8159,35 +7826,38 @@ float32x4x4_t test_vld4q_lane_f32(float32_t const * a, float32x4x4_t b) {
// CHECK-LABEL: define void @test_vld4q_lane_p16(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY16X8X4_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY16X8X4_T]], align 16
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY16X8X4_T]], align 16
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY16X8X4_T]], align 16
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X8X4_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [8 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8>
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
-// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i32 0, i32 3
-// CHECK-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x i16>
-// CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
-// CHECK-NEXT: [[VLD4Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4lane.v8i16.p0(ptr [[A]], <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i32 7, i32 2)
-// CHECK-NEXT: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_LANE_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[AGG_RESULT]], ptr align 16 [[__RET]], i32 64, i1 false)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_6_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 6
+// CHECK-NEXT: [[B_SROA_9_48_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_6_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_7_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 7
+// CHECK-NEXT: [[B_SROA_9_56_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_9_48_VEC_INSERT]], i64 [[B_COERCE_FCA_7_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_SROA_9_56_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[VLD4Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4lane.v8i16.p0(ptr [[A]], <8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], i32 7, i32 2)
+// CHECK-NEXT: [[VLD4Q_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_LANE_V]], 0
+// CHECK-NEXT: [[VLD4Q_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_LANE_V]], 1
+// CHECK-NEXT: [[VLD4Q_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_LANE_V]], 2
+// CHECK-NEXT: [[VLD4Q_LANE_V_FCA_3_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_LANE_V]], 3
+// CHECK-NEXT: store <8 x i16> [[VLD4Q_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <8 x i16> [[VLD4Q_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 32
+// CHECK-NEXT: store <8 x i16> [[VLD4Q_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 48
+// CHECK-NEXT: store <8 x i16> [[VLD4Q_LANE_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 16
// CHECK-NEXT: ret void
//
poly16x8x4_t test_vld4q_lane_p16(poly16_t const * a, poly16x8x4_t b) {
@@ -8197,27 +7867,26 @@ poly16x8x4_t test_vld4q_lane_p16(poly16_t const * a, poly16x8x4_t b) {
// CHECK-LABEL: define void @test_vld4_lane_u8(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT8X8X4_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT8X8X4_T]], align 8
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT8X8X4_T]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT8X8X4_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X4_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i32 0, i32 3
-// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <8 x i8>
// CHECK-NEXT: [[VLD4_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8.p0(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i32 7, i32 1)
-// CHECK-NEXT: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 32, i1 false)
+// CHECK-NEXT: [[VLD4_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE_V]], 0
+// CHECK-NEXT: [[VLD4_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE_V]], 1
+// CHECK-NEXT: [[VLD4_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE_V]], 2
+// CHECK-NEXT: [[VLD4_LANE_V_FCA_3_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE_V]], 3
+// CHECK-NEXT: store <8 x i8> [[VLD4_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <8 x i8> [[VLD4_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <8 x i8> [[VLD4_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 24
+// CHECK-NEXT: store <8 x i8> [[VLD4_LANE_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 8
// CHECK-NEXT: ret void
//
uint8x8x4_t test_vld4_lane_u8(uint8_t const * a, uint8x8x4_t b) {
@@ -8227,35 +7896,26 @@ uint8x8x4_t test_vld4_lane_u8(uint8_t const * a, uint8x8x4_t b) {
// CHECK-LABEL: define void @test_vld4_lane_u16(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT16X4X4_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT16X4X4_T]], align 8
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT16X4X4_T]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT16X4X4_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X4X4_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
-// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i32 0, i32 3
-// CHECK-NEXT: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
-// CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
-// CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
-// CHECK-NEXT: [[VLD4_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16.p0(ptr [[A]], <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i32 3, i32 2)
-// CHECK-NEXT: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 32, i1 false)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[VLD4_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16.p0(ptr [[A]], <4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], i32 3, i32 2)
+// CHECK-NEXT: [[VLD4_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE_V]], 0
+// CHECK-NEXT: [[VLD4_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE_V]], 1
+// CHECK-NEXT: [[VLD4_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE_V]], 2
+// CHECK-NEXT: [[VLD4_LANE_V_FCA_3_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE_V]], 3
+// CHECK-NEXT: store <4 x i16> [[VLD4_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <4 x i16> [[VLD4_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <4 x i16> [[VLD4_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 24
+// CHECK-NEXT: store <4 x i16> [[VLD4_LANE_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 8
// CHECK-NEXT: ret void
//
uint16x4x4_t test_vld4_lane_u16(uint16_t const * a, uint16x4x4_t b) {
@@ -8265,35 +7925,26 @@ uint16x4x4_t test_vld4_lane_u16(uint16_t const * a, uint16x4x4_t b) {
// CHECK-LABEL: define void @test_vld4_lane_u32(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT32X2X4_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT32X2X4_T]], align 8
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT32X2X4_T]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT32X2X4_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X2X4_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
-// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL5]], i32 0, i32 3
-// CHECK-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 8
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
-// CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
-// CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
-// CHECK-NEXT: [[VLD4_LANE_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32.p0(ptr [[A]], <2 x i32> [[TMP8]], <2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], i32 1, i32 4)
-// CHECK-NEXT: store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 32, i1 false)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[VLD4_LANE_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32.p0(ptr [[A]], <2 x i32> [[TMP0]], <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], i32 1, i32 4)
+// CHECK-NEXT: [[VLD4_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE_V]], 0
+// CHECK-NEXT: [[VLD4_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE_V]], 1
+// CHECK-NEXT: [[VLD4_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE_V]], 2
+// CHECK-NEXT: [[VLD4_LANE_V_FCA_3_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE_V]], 3
+// CHECK-NEXT: store <2 x i32> [[VLD4_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <2 x i32> [[VLD4_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <2 x i32> [[VLD4_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 24
+// CHECK-NEXT: store <2 x i32> [[VLD4_LANE_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 8
// CHECK-NEXT: ret void
//
uint32x2x4_t test_vld4_lane_u32(uint32_t const * a, uint32x2x4_t b) {
@@ -8303,27 +7954,26 @@ uint32x2x4_t test_vld4_lane_u32(uint32_t const * a, uint32x2x4_t b) {
// CHECK-LABEL: define void @test_vld4_lane_s8(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT8X8X4_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT8X8X4_T]], align 8
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT8X8X4_T]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT8X8X4_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X4_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i32 0, i32 3
-// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <8 x i8>
// CHECK-NEXT: [[VLD4_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8.p0(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i32 7, i32 1)
-// CHECK-NEXT: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 32, i1 false)
+// CHECK-NEXT: [[VLD4_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE_V]], 0
+// CHECK-NEXT: [[VLD4_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE_V]], 1
+// CHECK-NEXT: [[VLD4_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE_V]], 2
+// CHECK-NEXT: [[VLD4_LANE_V_FCA_3_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE_V]], 3
+// CHECK-NEXT: store <8 x i8> [[VLD4_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <8 x i8> [[VLD4_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <8 x i8> [[VLD4_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 24
+// CHECK-NEXT: store <8 x i8> [[VLD4_LANE_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 8
// CHECK-NEXT: ret void
//
int8x8x4_t test_vld4_lane_s8(int8_t const * a, int8x8x4_t b) {
@@ -8333,35 +7983,26 @@ int8x8x4_t test_vld4_lane_s8(int8_t const * a, int8x8x4_t b) {
// CHECK-LABEL: define void @test_vld4_lane_s16(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT16X4X4_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT16X4X4_T]], align 8
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT16X4X4_T]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT16X4X4_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X4X4_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
-// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i32 0, i32 3
-// CHECK-NEXT: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
-// CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
-// CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
-// CHECK-NEXT: [[VLD4_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16.p0(ptr [[A]], <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i32 3, i32 2)
-// CHECK-NEXT: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 32, i1 false)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[VLD4_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16.p0(ptr [[A]], <4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], i32 3, i32 2)
+// CHECK-NEXT: [[VLD4_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE_V]], 0
+// CHECK-NEXT: [[VLD4_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE_V]], 1
+// CHECK-NEXT: [[VLD4_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE_V]], 2
+// CHECK-NEXT: [[VLD4_LANE_V_FCA_3_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE_V]], 3
+// CHECK-NEXT: store <4 x i16> [[VLD4_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <4 x i16> [[VLD4_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <4 x i16> [[VLD4_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 24
+// CHECK-NEXT: store <4 x i16> [[VLD4_LANE_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 8
// CHECK-NEXT: ret void
//
int16x4x4_t test_vld4_lane_s16(int16_t const * a, int16x4x4_t b) {
@@ -8371,35 +8012,26 @@ int16x4x4_t test_vld4_lane_s16(int16_t const * a, int16x4x4_t b) {
// CHECK-LABEL: define void @test_vld4_lane_s32(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT32X2X4_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT32X2X4_T]], align 8
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT32X2X4_T]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT32X2X4_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X2X4_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
-// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL5]], i32 0, i32 3
-// CHECK-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 8
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
-// CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
-// CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
-// CHECK-NEXT: [[VLD4_LANE_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32.p0(ptr [[A]], <2 x i32> [[TMP8]], <2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], i32 1, i32 4)
-// CHECK-NEXT: store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 32, i1 false)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[VLD4_LANE_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32.p0(ptr [[A]], <2 x i32> [[TMP0]], <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], i32 1, i32 4)
+// CHECK-NEXT: [[VLD4_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE_V]], 0
+// CHECK-NEXT: [[VLD4_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE_V]], 1
+// CHECK-NEXT: [[VLD4_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE_V]], 2
+// CHECK-NEXT: [[VLD4_LANE_V_FCA_3_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE_V]], 3
+// CHECK-NEXT: store <2 x i32> [[VLD4_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <2 x i32> [[VLD4_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <2 x i32> [[VLD4_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 24
+// CHECK-NEXT: store <2 x i32> [[VLD4_LANE_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 8
// CHECK-NEXT: ret void
//
int32x2x4_t test_vld4_lane_s32(int32_t const * a, int32x2x4_t b) {
@@ -8409,35 +8041,26 @@ int32x2x4_t test_vld4_lane_s32(int32_t const * a, int32x2x4_t b) {
// CHECK-LABEL: define void @test_vld4_lane_f16(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT16X4X4_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT16X4X4_T]], align 8
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT16X4X4_T]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X4X4_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X4X4_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[TMP0]] to <8 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x half> [[TMP2]] to <8 x i8>
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP4:%.*]] = load <4 x half>, ptr [[ARRAYIDX4]], align 8
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
-// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL5]], i32 0, i32 3
-// CHECK-NEXT: [[TMP6:%.*]] = load <4 x half>, ptr [[ARRAYIDX6]], align 8
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
-// CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half>
-// CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
-// CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half>
-// CHECK-NEXT: [[VLD4_LANE_V:%.*]] = call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.arm.neon.vld4lane.v4f16.p0(ptr [[A]], <4 x half> [[TMP8]], <4 x half> [[TMP9]], <4 x half> [[TMP10]], <4 x half> [[TMP11]], i32 3, i32 2)
-// CHECK-NEXT: store { <4 x half>, <4 x half>, <4 x half>, <4 x half> } [[VLD4_LANE_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 32, i1 false)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x half>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x half>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <4 x half>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <4 x half>
+// CHECK-NEXT: [[VLD4_LANE_V:%.*]] = call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.arm.neon.vld4lane.v4f16.p0(ptr [[A]], <4 x half> [[TMP0]], <4 x half> [[TMP1]], <4 x half> [[TMP2]], <4 x half> [[TMP3]], i32 3, i32 2)
+// CHECK-NEXT: [[VLD4_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x half>, <4 x half>, <4 x half>, <4 x half> } [[VLD4_LANE_V]], 0
+// CHECK-NEXT: [[VLD4_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x half>, <4 x half>, <4 x half>, <4 x half> } [[VLD4_LANE_V]], 1
+// CHECK-NEXT: [[VLD4_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x half>, <4 x half>, <4 x half>, <4 x half> } [[VLD4_LANE_V]], 2
+// CHECK-NEXT: [[VLD4_LANE_V_FCA_3_EXTRACT:%.*]] = extractvalue { <4 x half>, <4 x half>, <4 x half>, <4 x half> } [[VLD4_LANE_V]], 3
+// CHECK-NEXT: store <4 x half> [[VLD4_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <4 x half> [[VLD4_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <4 x half> [[VLD4_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 24
+// CHECK-NEXT: store <4 x half> [[VLD4_LANE_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 8
// CHECK-NEXT: ret void
//
float16x4x4_t test_vld4_lane_f16(float16_t const * a, float16x4x4_t b) {
@@ -8447,35 +8070,26 @@ float16x4x4_t test_vld4_lane_f16(float16_t const * a, float16x4x4_t b) {
// CHECK-LABEL: define void @test_vld4_lane_f32(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT32X2X4_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT32X2X4_T]], align 8
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT32X2X4_T]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X2X4_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X2X4_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[TMP0]] to <8 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[TMP2]] to <8 x i8>
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 8
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
-// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL5]], i32 0, i32 3
-// CHECK-NEXT: [[TMP6:%.*]] = load <2 x float>, ptr [[ARRAYIDX6]], align 8
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
-// CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float>
-// CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
-// CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
-// CHECK-NEXT: [[VLD4_LANE_V:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4lane.v2f32.p0(ptr [[A]], <2 x float> [[TMP8]], <2 x float> [[TMP9]], <2 x float> [[TMP10]], <2 x float> [[TMP11]], i32 1, i32 4)
-// CHECK-NEXT: store { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4_LANE_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 32, i1 false)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <2 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <2 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <2 x float>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <2 x float>
+// CHECK-NEXT: [[VLD4_LANE_V:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4lane.v2f32.p0(ptr [[A]], <2 x float> [[TMP0]], <2 x float> [[TMP1]], <2 x float> [[TMP2]], <2 x float> [[TMP3]], i32 1, i32 4)
+// CHECK-NEXT: [[VLD4_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4_LANE_V]], 0
+// CHECK-NEXT: [[VLD4_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4_LANE_V]], 1
+// CHECK-NEXT: [[VLD4_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4_LANE_V]], 2
+// CHECK-NEXT: [[VLD4_LANE_V_FCA_3_EXTRACT:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4_LANE_V]], 3
+// CHECK-NEXT: store <2 x float> [[VLD4_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <2 x float> [[VLD4_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <2 x float> [[VLD4_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 24
+// CHECK-NEXT: store <2 x float> [[VLD4_LANE_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 8
// CHECK-NEXT: ret void
//
float32x2x4_t test_vld4_lane_f32(float32_t const * a, float32x2x4_t b) {
@@ -8485,27 +8099,26 @@ float32x2x4_t test_vld4_lane_f32(float32_t const * a, float32x2x4_t b) {
// CHECK-LABEL: define void @test_vld4_lane_p8(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY8X8X4_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY8X8X4_T]], align 8
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY8X8X4_T]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY8X8X4_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X4_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i32 0, i32 3
-// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <8 x i8>
// CHECK-NEXT: [[VLD4_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8.p0(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i32 7, i32 1)
-// CHECK-NEXT: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 32, i1 false)
+// CHECK-NEXT: [[VLD4_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE_V]], 0
+// CHECK-NEXT: [[VLD4_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE_V]], 1
+// CHECK-NEXT: [[VLD4_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE_V]], 2
+// CHECK-NEXT: [[VLD4_LANE_V_FCA_3_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE_V]], 3
+// CHECK-NEXT: store <8 x i8> [[VLD4_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <8 x i8> [[VLD4_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <8 x i8> [[VLD4_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 24
+// CHECK-NEXT: store <8 x i8> [[VLD4_LANE_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 8
// CHECK-NEXT: ret void
//
poly8x8x4_t test_vld4_lane_p8(poly8_t const * a, poly8x8x4_t b) {
@@ -8515,35 +8128,26 @@ poly8x8x4_t test_vld4_lane_p8(poly8_t const * a, poly8x8x4_t b) {
// CHECK-LABEL: define void @test_vld4_lane_p16(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY16X4X4_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY16X4X4_T]], align 8
-// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY16X4X4_T]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY16X4X4_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X4X4_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
-// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i32 0, i32 3
-// CHECK-NEXT: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
-// CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
-// CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
-// CHECK-NEXT: [[VLD4_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16.p0(ptr [[A]], <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i32 3, i32 2)
-// CHECK-NEXT: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE_V]], ptr [[__RET]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT]], ptr align 8 [[__RET]], i32 32, i1 false)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[VLD4_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16.p0(ptr [[A]], <4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], i32 3, i32 2)
+// CHECK-NEXT: [[VLD4_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE_V]], 0
+// CHECK-NEXT: [[VLD4_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE_V]], 1
+// CHECK-NEXT: [[VLD4_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE_V]], 2
+// CHECK-NEXT: [[VLD4_LANE_V_FCA_3_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE_V]], 3
+// CHECK-NEXT: store <4 x i16> [[VLD4_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <4 x i16> [[VLD4_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <4 x i16> [[VLD4_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 24
+// CHECK-NEXT: store <4 x i16> [[VLD4_LANE_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 8
// CHECK-NEXT: ret void
//
poly16x4x4_t test_vld4_lane_p16(poly16_t const * a, poly16x4x4_t b) {
@@ -8563,10 +8167,7 @@ int8x8_t test_vmax_s8(int8x8_t a, int8x8_t b) {
// CHECK-LABEL: define <4 x i16> @test_vmax_s16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
// CHECK-NEXT: [[VMAX_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vmaxs.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
-// CHECK-NEXT: [[VMAX_V3_I:%.*]] = bitcast <4 x i16> [[VMAX_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <4 x i16> [[VMAX_V2_I]]
//
int16x4_t test_vmax_s16(int16x4_t a, int16x4_t b) {
@@ -8576,10 +8177,7 @@ int16x4_t test_vmax_s16(int16x4_t a, int16x4_t b) {
// CHECK-LABEL: define <2 x i32> @test_vmax_s32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
// CHECK-NEXT: [[VMAX_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vmaxs.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
-// CHECK-NEXT: [[VMAX_V3_I:%.*]] = bitcast <2 x i32> [[VMAX_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <2 x i32> [[VMAX_V2_I]]
//
int32x2_t test_vmax_s32(int32x2_t a, int32x2_t b) {
@@ -8599,10 +8197,7 @@ uint8x8_t test_vmax_u8(uint8x8_t a, uint8x8_t b) {
// CHECK-LABEL: define <4 x i16> @test_vmax_u16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
// CHECK-NEXT: [[VMAX_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vmaxu.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
-// CHECK-NEXT: [[VMAX_V3_I:%.*]] = bitcast <4 x i16> [[VMAX_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <4 x i16> [[VMAX_V2_I]]
//
uint16x4_t test_vmax_u16(uint16x4_t a, uint16x4_t b) {
@@ -8612,10 +8207,7 @@ uint16x4_t test_vmax_u16(uint16x4_t a, uint16x4_t b) {
// CHECK-LABEL: define <2 x i32> @test_vmax_u32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
// CHECK-NEXT: [[VMAX_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vmaxu.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
-// CHECK-NEXT: [[VMAX_V3_I:%.*]] = bitcast <2 x i32> [[VMAX_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <2 x i32> [[VMAX_V2_I]]
//
uint32x2_t test_vmax_u32(uint32x2_t a, uint32x2_t b) {
@@ -8625,10 +8217,7 @@ uint32x2_t test_vmax_u32(uint32x2_t a, uint32x2_t b) {
// CHECK-LABEL: define <2 x float> @test_vmax_f32(
// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <8 x i8>
// CHECK-NEXT: [[VMAX_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vmaxs.v2f32(<2 x float> [[A]], <2 x float> [[B]])
-// CHECK-NEXT: [[VMAX_V3_I:%.*]] = bitcast <2 x float> [[VMAX_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <2 x float> [[VMAX_V2_I]]
//
float32x2_t test_vmax_f32(float32x2_t a, float32x2_t b) {
@@ -8648,10 +8237,7 @@ int8x16_t test_vmaxq_s8(int8x16_t a, int8x16_t b) {
// CHECK-LABEL: define <8 x i16> @test_vmaxq_s16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
// CHECK-NEXT: [[VMAXQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmaxs.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
-// CHECK-NEXT: [[VMAXQ_V3_I:%.*]] = bitcast <8 x i16> [[VMAXQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <8 x i16> [[VMAXQ_V2_I]]
//
int16x8_t test_vmaxq_s16(int16x8_t a, int16x8_t b) {
@@ -8661,10 +8247,7 @@ int16x8_t test_vmaxq_s16(int16x8_t a, int16x8_t b) {
// CHECK-LABEL: define <4 x i32> @test_vmaxq_s32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
// CHECK-NEXT: [[VMAXQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
-// CHECK-NEXT: [[VMAXQ_V3_I:%.*]] = bitcast <4 x i32> [[VMAXQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VMAXQ_V2_I]]
//
int32x4_t test_vmaxq_s32(int32x4_t a, int32x4_t b) {
@@ -8684,10 +8267,7 @@ uint8x16_t test_vmaxq_u8(uint8x16_t a, uint8x16_t b) {
// CHECK-LABEL: define <8 x i16> @test_vmaxq_u16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
// CHECK-NEXT: [[VMAXQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmaxu.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
-// CHECK-NEXT: [[VMAXQ_V3_I:%.*]] = bitcast <8 x i16> [[VMAXQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <8 x i16> [[VMAXQ_V2_I]]
//
uint16x8_t test_vmaxq_u16(uint16x8_t a, uint16x8_t b) {
@@ -8697,10 +8277,7 @@ uint16x8_t test_vmaxq_u16(uint16x8_t a, uint16x8_t b) {
// CHECK-LABEL: define <4 x i32> @test_vmaxq_u32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
// CHECK-NEXT: [[VMAXQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
-// CHECK-NEXT: [[VMAXQ_V3_I:%.*]] = bitcast <4 x i32> [[VMAXQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VMAXQ_V2_I]]
//
uint32x4_t test_vmaxq_u32(uint32x4_t a, uint32x4_t b) {
@@ -8710,10 +8287,7 @@ uint32x4_t test_vmaxq_u32(uint32x4_t a, uint32x4_t b) {
// CHECK-LABEL: define <4 x float> @test_vmaxq_f32(
// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B]] to <16 x i8>
// CHECK-NEXT: [[VMAXQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> [[A]], <4 x float> [[B]])
-// CHECK-NEXT: [[VMAXQ_V3_I:%.*]] = bitcast <4 x float> [[VMAXQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x float> [[VMAXQ_V2_I]]
//
float32x4_t test_vmaxq_f32(float32x4_t a, float32x4_t b) {
@@ -8733,10 +8307,7 @@ int8x8_t test_vmin_s8(int8x8_t a, int8x8_t b) {
// CHECK-LABEL: define <4 x i16> @test_vmin_s16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
// CHECK-NEXT: [[VMIN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vmins.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
-// CHECK-NEXT: [[VMIN_V3_I:%.*]] = bitcast <4 x i16> [[VMIN_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <4 x i16> [[VMIN_V2_I]]
//
int16x4_t test_vmin_s16(int16x4_t a, int16x4_t b) {
@@ -8746,10 +8317,7 @@ int16x4_t test_vmin_s16(int16x4_t a, int16x4_t b) {
// CHECK-LABEL: define <2 x i32> @test_vmin_s32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
// CHECK-NEXT: [[VMIN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
-// CHECK-NEXT: [[VMIN_V3_I:%.*]] = bitcast <2 x i32> [[VMIN_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <2 x i32> [[VMIN_V2_I]]
//
int32x2_t test_vmin_s32(int32x2_t a, int32x2_t b) {
@@ -8769,10 +8337,7 @@ uint8x8_t test_vmin_u8(uint8x8_t a, uint8x8_t b) {
// CHECK-LABEL: define <4 x i16> @test_vmin_u16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
// CHECK-NEXT: [[VMIN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vminu.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
-// CHECK-NEXT: [[VMIN_V3_I:%.*]] = bitcast <4 x i16> [[VMIN_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <4 x i16> [[VMIN_V2_I]]
//
uint16x4_t test_vmin_u16(uint16x4_t a, uint16x4_t b) {
@@ -8782,10 +8347,7 @@ uint16x4_t test_vmin_u16(uint16x4_t a, uint16x4_t b) {
// CHECK-LABEL: define <2 x i32> @test_vmin_u32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
// CHECK-NEXT: [[VMIN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
-// CHECK-NEXT: [[VMIN_V3_I:%.*]] = bitcast <2 x i32> [[VMIN_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <2 x i32> [[VMIN_V2_I]]
//
uint32x2_t test_vmin_u32(uint32x2_t a, uint32x2_t b) {
@@ -8795,10 +8357,7 @@ uint32x2_t test_vmin_u32(uint32x2_t a, uint32x2_t b) {
// CHECK-LABEL: define <2 x float> @test_vmin_f32(
// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <8 x i8>
// CHECK-NEXT: [[VMIN_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float> [[A]], <2 x float> [[B]])
-// CHECK-NEXT: [[VMIN_V3_I:%.*]] = bitcast <2 x float> [[VMIN_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <2 x float> [[VMIN_V2_I]]
//
float32x2_t test_vmin_f32(float32x2_t a, float32x2_t b) {
@@ -8818,10 +8377,7 @@ int8x16_t test_vminq_s8(int8x16_t a, int8x16_t b) {
// CHECK-LABEL: define <8 x i16> @test_vminq_s16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
// CHECK-NEXT: [[VMINQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmins.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
-// CHECK-NEXT: [[VMINQ_V3_I:%.*]] = bitcast <8 x i16> [[VMINQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <8 x i16> [[VMINQ_V2_I]]
//
int16x8_t test_vminq_s16(int16x8_t a, int16x8_t b) {
@@ -8831,10 +8387,7 @@ int16x8_t test_vminq_s16(int16x8_t a, int16x8_t b) {
// CHECK-LABEL: define <4 x i32> @test_vminq_s32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
// CHECK-NEXT: [[VMINQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
-// CHECK-NEXT: [[VMINQ_V3_I:%.*]] = bitcast <4 x i32> [[VMINQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VMINQ_V2_I]]
//
int32x4_t test_vminq_s32(int32x4_t a, int32x4_t b) {
@@ -8854,10 +8407,7 @@ uint8x16_t test_vminq_u8(uint8x16_t a, uint8x16_t b) {
// CHECK-LABEL: define <8 x i16> @test_vminq_u16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
// CHECK-NEXT: [[VMINQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vminu.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
-// CHECK-NEXT: [[VMINQ_V3_I:%.*]] = bitcast <8 x i16> [[VMINQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <8 x i16> [[VMINQ_V2_I]]
//
uint16x8_t test_vminq_u16(uint16x8_t a, uint16x8_t b) {
@@ -8867,10 +8417,7 @@ uint16x8_t test_vminq_u16(uint16x8_t a, uint16x8_t b) {
// CHECK-LABEL: define <4 x i32> @test_vminq_u32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
// CHECK-NEXT: [[VMINQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
-// CHECK-NEXT: [[VMINQ_V3_I:%.*]] = bitcast <4 x i32> [[VMINQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VMINQ_V2_I]]
//
uint32x4_t test_vminq_u32(uint32x4_t a, uint32x4_t b) {
@@ -8880,10 +8427,7 @@ uint32x4_t test_vminq_u32(uint32x4_t a, uint32x4_t b) {
// CHECK-LABEL: define <4 x float> @test_vminq_f32(
// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B]] to <16 x i8>
// CHECK-NEXT: [[VMINQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> [[A]], <4 x float> [[B]])
-// CHECK-NEXT: [[VMINQ_V3_I:%.*]] = bitcast <4 x float> [[VMINQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x float> [[VMINQ_V2_I]]
//
float32x4_t test_vminq_f32(float32x4_t a, float32x4_t b) {
@@ -9058,8 +8602,6 @@ int16x8_t test_vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c) {
// CHECK-LABEL: define <4 x i32> @test_vmlal_s16(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[B]], <4 x i16> [[C]])
// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
@@ -9071,8 +8613,6 @@ int32x4_t test_vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
// CHECK-LABEL: define <2 x i64> @test_vmlal_s32(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[B]], <2 x i32> [[C]])
// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[ADD_I]]
@@ -9095,8 +8635,6 @@ uint16x8_t test_vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) {
// CHECK-LABEL: define <4 x i32> @test_vmlal_u16(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[B]], <4 x i16> [[C]])
// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
@@ -9108,8 +8646,6 @@ uint32x4_t test_vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) {
// CHECK-LABEL: define <2 x i64> @test_vmlal_u32(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[B]], <2 x i32> [[C]])
// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[ADD_I]]
@@ -9121,11 +8657,7 @@ uint64x2_t test_vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) {
// CHECK-LABEL: define <4 x i32> @test_vmlal_lane_s16(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[C]], <4 x i16> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
@@ -9137,11 +8669,7 @@ int32x4_t test_vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
// CHECK-LABEL: define <2 x i64> @test_vmlal_lane_s32(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[C]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[ADD]]
@@ -9153,11 +8681,7 @@ int64x2_t test_vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
// CHECK-LABEL: define <4 x i32> @test_vmlal_lane_u16(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[C]], <4 x i16> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
@@ -9169,11 +8693,7 @@ uint32x4_t test_vmlal_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) {
// CHECK-LABEL: define <2 x i64> @test_vmlal_lane_u32(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[C]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[ADD]]
@@ -9185,12 +8705,8 @@ uint64x2_t test_vmlal_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) {
// CHECK-LABEL: define <4 x i32> @test_vmlal_n_s16(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], i16 noundef signext [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]])
// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A]], [[VMULL2_I_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
@@ -9202,10 +8718,8 @@ int32x4_t test_vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
// CHECK-LABEL: define <2 x i64> @test_vmlal_n_s32(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], i32 noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]])
// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A]], [[VMULL2_I_I]]
// CHECK-NEXT: ret <2 x i64> [[ADD_I]]
@@ -9217,12 +8731,8 @@ int64x2_t test_vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
// CHECK-LABEL: define <4 x i32> @test_vmlal_n_u16(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], i16 noundef zeroext [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]])
// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A]], [[VMULL2_I_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
@@ -9234,10 +8744,8 @@ uint32x4_t test_vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) {
// CHECK-LABEL: define <2 x i64> @test_vmlal_n_u32(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], i32 noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]])
// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A]], [[VMULL2_I_I]]
// CHECK-NEXT: ret <2 x i64> [[ADD_I]]
@@ -9249,9 +8757,7 @@ uint64x2_t test_vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) {
// CHECK-LABEL: define <4 x i16> @test_vmla_lane_s16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[C]], <4 x i16> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i16> [[A]], [[MUL]]
// CHECK-NEXT: ret <4 x i16> [[ADD]]
@@ -9263,9 +8769,7 @@ int16x4_t test_vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
// CHECK-LABEL: define <2 x i32> @test_vmla_lane_s32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[C]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i32> [[A]], [[MUL]]
// CHECK-NEXT: ret <2 x i32> [[ADD]]
@@ -9277,9 +8781,7 @@ int32x2_t test_vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
// CHECK-LABEL: define <4 x i16> @test_vmla_lane_u16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[C]], <4 x i16> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i16> [[A]], [[MUL]]
// CHECK-NEXT: ret <4 x i16> [[ADD]]
@@ -9291,9 +8793,7 @@ uint16x4_t test_vmla_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) {
// CHECK-LABEL: define <2 x i32> @test_vmla_lane_u32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[C]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i32> [[A]], [[MUL]]
// CHECK-NEXT: ret <2 x i32> [[ADD]]
@@ -9305,9 +8805,7 @@ uint32x2_t test_vmla_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) {
// CHECK-LABEL: define <2 x float> @test_vmla_lane_f32(
// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]], <2 x float> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[C]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[C]], <2 x float> poison, <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[B]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = fadd <2 x float> [[A]], [[MUL]]
// CHECK-NEXT: ret <2 x float> [[ADD]]
@@ -9319,9 +8817,7 @@ float32x2_t test_vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t c) {
// CHECK-LABEL: define <8 x i16> @test_vmlaq_lane_s16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[C]], <4 x i16> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <8 x i16> [[A]], [[MUL]]
// CHECK-NEXT: ret <8 x i16> [[ADD]]
@@ -9333,9 +8829,7 @@ int16x8_t test_vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t c) {
// CHECK-LABEL: define <4 x i32> @test_vmlaq_lane_s32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[C]], <2 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A]], [[MUL]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
@@ -9347,9 +8841,7 @@ int32x4_t test_vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t c) {
// CHECK-LABEL: define <8 x i16> @test_vmlaq_lane_u16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[C]], <4 x i16> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <8 x i16> [[A]], [[MUL]]
// CHECK-NEXT: ret <8 x i16> [[ADD]]
@@ -9361,9 +8853,7 @@ uint16x8_t test_vmlaq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t c) {
// CHECK-LABEL: define <4 x i32> @test_vmlaq_lane_u32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[C]], <2 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A]], [[MUL]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
@@ -9375,9 +8865,7 @@ uint32x4_t test_vmlaq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t c) {
// CHECK-LABEL: define <4 x float> @test_vmlaq_lane_f32(
// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]], <2 x float> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[C]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[C]], <2 x float> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK-NEXT: [[MUL:%.*]] = fmul <4 x float> [[B]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = fadd <4 x float> [[A]], [[MUL]]
// CHECK-NEXT: ret <4 x float> [[ADD]]
@@ -9389,10 +8877,8 @@ float32x4_t test_vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t c) {
// CHECK-LABEL: define <4 x i16> @test_vmla_n_s16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], i16 noundef signext [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i16> [[B]], [[VECINIT3_I]]
// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i16> [[A]], [[MUL_I]]
// CHECK-NEXT: ret <4 x i16> [[ADD_I]]
@@ -9404,8 +8890,8 @@ int16x4_t test_vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c) {
// CHECK-LABEL: define <2 x i32> @test_vmla_n_s32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], i32 noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[MUL_I:%.*]] = mul <2 x i32> [[B]], [[VECINIT1_I]]
// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i32> [[A]], [[MUL_I]]
// CHECK-NEXT: ret <2 x i32> [[ADD_I]]
@@ -9417,10 +8903,8 @@ int32x2_t test_vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c) {
// CHECK-LABEL: define <4 x i16> @test_vmla_n_u16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], i16 noundef zeroext [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i16> [[B]], [[VECINIT3_I]]
// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i16> [[A]], [[MUL_I]]
// CHECK-NEXT: ret <4 x i16> [[ADD_I]]
@@ -9432,8 +8916,8 @@ uint16x4_t test_vmla_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) {
// CHECK-LABEL: define <2 x i32> @test_vmla_n_u32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], i32 noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[MUL_I:%.*]] = mul <2 x i32> [[B]], [[VECINIT1_I]]
// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i32> [[A]], [[MUL_I]]
// CHECK-NEXT: ret <2 x i32> [[ADD_I]]
@@ -9445,8 +8929,8 @@ uint32x2_t test_vmla_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) {
// CHECK-LABEL: define <2 x float> @test_vmla_n_f32(
// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]], float noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x float> poison, float [[C]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float [[C]], i32 1
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x float> poison, float [[C]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x float> [[VECINIT_I]], <2 x float> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[MUL_I:%.*]] = fmul <2 x float> [[B]], [[VECINIT1_I]]
// CHECK-NEXT: [[ADD_I:%.*]] = fadd <2 x float> [[A]], [[MUL_I]]
// CHECK-NEXT: ret <2 x float> [[ADD_I]]
@@ -9458,14 +8942,8 @@ float32x2_t test_vmla_n_f32(float32x2_t a, float32x2_t b, float32_t c) {
// CHECK-LABEL: define <8 x i16> @test_vmlaq_n_s16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], i16 noundef signext [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[C]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[C]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
-// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[C]], i32 4
-// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[C]], i32 5
-// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[C]], i32 6
-// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[C]], i32 7
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[C]], i64 0
+// CHECK-NEXT: [[VECINIT7_I:%.*]] = shufflevector <8 x i16> [[VECINIT_I]], <8 x i16> poison, <8 x i32> zeroinitializer
// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i16> [[B]], [[VECINIT7_I]]
// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[A]], [[MUL_I]]
// CHECK-NEXT: ret <8 x i16> [[ADD_I]]
@@ -9477,10 +8955,8 @@ int16x8_t test_vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c) {
// CHECK-LABEL: define <4 x i32> @test_vmlaq_n_s32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], i32 noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[C]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[C]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[C]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[C]], i32 3
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[C]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i32> [[VECINIT_I]], <4 x i32> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i32> [[B]], [[VECINIT3_I]]
// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A]], [[MUL_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
@@ -9492,14 +8968,8 @@ int32x4_t test_vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c) {
// CHECK-LABEL: define <8 x i16> @test_vmlaq_n_u16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], i16 noundef zeroext [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[C]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[C]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
-// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[C]], i32 4
-// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[C]], i32 5
-// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[C]], i32 6
-// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[C]], i32 7
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[C]], i64 0
+// CHECK-NEXT: [[VECINIT7_I:%.*]] = shufflevector <8 x i16> [[VECINIT_I]], <8 x i16> poison, <8 x i32> zeroinitializer
// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i16> [[B]], [[VECINIT7_I]]
// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[A]], [[MUL_I]]
// CHECK-NEXT: ret <8 x i16> [[ADD_I]]
@@ -9511,10 +8981,8 @@ uint16x8_t test_vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) {
// CHECK-LABEL: define <4 x i32> @test_vmlaq_n_u32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], i32 noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[C]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[C]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[C]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[C]], i32 3
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[C]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i32> [[VECINIT_I]], <4 x i32> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i32> [[B]], [[VECINIT3_I]]
// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A]], [[MUL_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
@@ -9526,10 +8994,8 @@ uint32x4_t test_vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) {
// CHECK-LABEL: define <4 x float> @test_vmlaq_n_f32(
// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]], float noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float [[C]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float [[C]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float [[C]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float [[C]], i32 3
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float [[C]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x float> [[VECINIT_I]], <4 x float> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL_I:%.*]] = fmul <4 x float> [[B]], [[VECINIT3_I]]
// CHECK-NEXT: [[ADD_I:%.*]] = fadd <4 x float> [[A]], [[MUL_I]]
// CHECK-NEXT: ret <4 x float> [[ADD_I]]
@@ -9706,8 +9172,6 @@ int16x8_t test_vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c) {
// CHECK-LABEL: define <4 x i32> @test_vmlsl_s16(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[B]], <4 x i16> [[C]])
// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A]], [[VMULL2_I_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
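//
// (In the vmlsl checks here and below, the two leading `bitcast`s of the
// input vectors to <8 x i8> are gone: the subsequent check lines never
// referenced [[TMP0]]/[[TMP1]], and the @llvm.arm.neon.vmull* call is
// matched on [[B]] and [[C]] directly.)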
@@ -9719,8 +9183,6 @@ int32x4_t test_vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
// CHECK-LABEL: define <2 x i64> @test_vmlsl_s32(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[B]], <2 x i32> [[C]])
// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[A]], [[VMULL2_I_I]]
// CHECK-NEXT: ret <2 x i64> [[SUB_I]]
@@ -9743,8 +9205,6 @@ uint16x8_t test_vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) {
// CHECK-LABEL: define <4 x i32> @test_vmlsl_u16(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[B]], <4 x i16> [[C]])
// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A]], [[VMULL2_I_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
@@ -9756,8 +9216,6 @@ uint32x4_t test_vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) {
// CHECK-LABEL: define <2 x i64> @test_vmlsl_u32(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[B]], <2 x i32> [[C]])
// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[A]], [[VMULL2_I_I]]
// CHECK-NEXT: ret <2 x i64> [[SUB_I]]
@@ -9769,11 +9227,7 @@ uint64x2_t test_vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) {
// CHECK-LABEL: define <4 x i32> @test_vmlsl_lane_s16(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[C]], <4 x i16> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
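//
// (For the `_lane` variants, the old checks routed the lane operand through
// a <8 x i8> bitcast and back before the broadcast; the new ones expect a
// single `shufflevector` of [[C]] with a poison second operand and a
// constant lane mask, e.g. <i32 3, i32 3, i32 3, i32 3> for lane 3 of a
// <4 x i16> vector.)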
@@ -9785,11 +9239,7 @@ int32x4_t test_vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
// CHECK-LABEL: define <2 x i64> @test_vmlsl_lane_s32(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[C]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[SUB]]
@@ -9801,11 +9251,7 @@ int64x2_t test_vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
// CHECK-LABEL: define <4 x i32> @test_vmlsl_lane_u16(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[C]], <4 x i16> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
@@ -9817,11 +9263,7 @@ uint32x4_t test_vmlsl_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) {
// CHECK-LABEL: define <2 x i64> @test_vmlsl_lane_u32(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[C]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[SUB]]
@@ -9833,12 +9275,8 @@ uint64x2_t test_vmlsl_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) {
// CHECK-LABEL: define <4 x i32> @test_vmlsl_n_s16(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], i16 noundef signext [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]])
// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A]], [[VMULL2_I_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
@@ -9850,10 +9288,8 @@ int32x4_t test_vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
// CHECK-LABEL: define <2 x i64> @test_vmlsl_n_s32(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], i32 noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]])
// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[A]], [[VMULL2_I_I]]
// CHECK-NEXT: ret <2 x i64> [[SUB_I]]
@@ -9865,12 +9301,8 @@ int64x2_t test_vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
// CHECK-LABEL: define <4 x i32> @test_vmlsl_n_u16(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], i16 noundef zeroext [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]])
// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A]], [[VMULL2_I_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
@@ -9882,10 +9314,8 @@ uint32x4_t test_vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) {
// CHECK-LABEL: define <2 x i64> @test_vmlsl_n_u32(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], i32 noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]])
// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[A]], [[VMULL2_I_I]]
// CHECK-NEXT: ret <2 x i64> [[SUB_I]]
@@ -9897,9 +9327,7 @@ uint64x2_t test_vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) {
// CHECK-LABEL: define <4 x i16> @test_vmls_lane_s16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[C]], <4 x i16> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i16> [[A]], [[MUL]]
// CHECK-NEXT: ret <4 x i16> [[SUB]]
@@ -9911,9 +9339,7 @@ int16x4_t test_vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
// CHECK-LABEL: define <2 x i32> @test_vmls_lane_s32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[C]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i32> [[A]], [[MUL]]
// CHECK-NEXT: ret <2 x i32> [[SUB]]
@@ -9925,9 +9351,7 @@ int32x2_t test_vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
// CHECK-LABEL: define <4 x i16> @test_vmls_lane_u16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[C]], <4 x i16> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i16> [[A]], [[MUL]]
// CHECK-NEXT: ret <4 x i16> [[SUB]]
@@ -9939,9 +9363,7 @@ uint16x4_t test_vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) {
// CHECK-LABEL: define <2 x i32> @test_vmls_lane_u32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[C]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i32> [[A]], [[MUL]]
// CHECK-NEXT: ret <2 x i32> [[SUB]]
@@ -9953,9 +9375,7 @@ uint32x2_t test_vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) {
// CHECK-LABEL: define <2 x float> @test_vmls_lane_f32(
// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]], <2 x float> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[C]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[C]], <2 x float> poison, <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[B]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = fsub <2 x float> [[A]], [[MUL]]
// CHECK-NEXT: ret <2 x float> [[SUB]]
@@ -9967,9 +9387,7 @@ float32x2_t test_vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t c) {
// CHECK-LABEL: define <8 x i16> @test_vmlsq_lane_s16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[C]], <4 x i16> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <8 x i16> [[A]], [[MUL]]
// CHECK-NEXT: ret <8 x i16> [[SUB]]
@@ -9981,9 +9399,7 @@ int16x8_t test_vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t c) {
// CHECK-LABEL: define <4 x i32> @test_vmlsq_lane_s32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[C]], <2 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A]], [[MUL]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
@@ -9995,9 +9411,7 @@ int32x4_t test_vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t c) {
// CHECK-LABEL: define <8 x i16> @test_vmlsq_lane_u16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[C]], <4 x i16> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <8 x i16> [[A]], [[MUL]]
// CHECK-NEXT: ret <8 x i16> [[SUB]]
@@ -10009,9 +9423,7 @@ uint16x8_t test_vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t c) {
// CHECK-LABEL: define <4 x i32> @test_vmlsq_lane_u32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[C]], <2 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A]], [[MUL]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
@@ -10023,9 +9435,7 @@ uint32x4_t test_vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t c) {
// CHECK-LABEL: define <4 x float> @test_vmlsq_lane_f32(
// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]], <2 x float> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[C]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[C]], <2 x float> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK-NEXT: [[MUL:%.*]] = fmul <4 x float> [[B]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = fsub <4 x float> [[A]], [[MUL]]
// CHECK-NEXT: ret <4 x float> [[SUB]]
@@ -10037,10 +9447,8 @@ float32x4_t test_vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t c) {
// CHECK-LABEL: define <4 x i16> @test_vmls_n_s16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], i16 noundef signext [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i16> [[B]], [[VECINIT3_I]]
// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i16> [[A]], [[MUL_I]]
// CHECK-NEXT: ret <4 x i16> [[SUB_I]]
@@ -10052,8 +9460,8 @@ int16x4_t test_vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c) {
// CHECK-LABEL: define <2 x i32> @test_vmls_n_s32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], i32 noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[MUL_I:%.*]] = mul <2 x i32> [[B]], [[VECINIT1_I]]
// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i32> [[A]], [[MUL_I]]
// CHECK-NEXT: ret <2 x i32> [[SUB_I]]
@@ -10065,10 +9473,8 @@ int32x2_t test_vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c) {
// CHECK-LABEL: define <4 x i16> @test_vmls_n_u16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], i16 noundef zeroext [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i16> [[B]], [[VECINIT3_I]]
// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i16> [[A]], [[MUL_I]]
// CHECK-NEXT: ret <4 x i16> [[SUB_I]]
@@ -10080,8 +9486,8 @@ uint16x4_t test_vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) {
// CHECK-LABEL: define <2 x i32> @test_vmls_n_u32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], i32 noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[MUL_I:%.*]] = mul <2 x i32> [[B]], [[VECINIT1_I]]
// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i32> [[A]], [[MUL_I]]
// CHECK-NEXT: ret <2 x i32> [[SUB_I]]
@@ -10093,8 +9499,8 @@ uint32x2_t test_vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) {
// CHECK-LABEL: define <2 x float> @test_vmls_n_f32(
// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]], float noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x float> poison, float [[C]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float [[C]], i32 1
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x float> poison, float [[C]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x float> [[VECINIT_I]], <2 x float> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[MUL_I:%.*]] = fmul <2 x float> [[B]], [[VECINIT1_I]]
// CHECK-NEXT: [[SUB_I:%.*]] = fsub <2 x float> [[A]], [[MUL_I]]
// CHECK-NEXT: ret <2 x float> [[SUB_I]]
@@ -10106,14 +9512,8 @@ float32x2_t test_vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c) {
// CHECK-LABEL: define <8 x i16> @test_vmlsq_n_s16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], i16 noundef signext [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[C]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[C]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
-// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[C]], i32 4
-// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[C]], i32 5
-// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[C]], i32 6
-// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[C]], i32 7
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[C]], i64 0
+// CHECK-NEXT: [[VECINIT7_I:%.*]] = shufflevector <8 x i16> [[VECINIT_I]], <8 x i16> poison, <8 x i32> zeroinitializer
// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i16> [[B]], [[VECINIT7_I]]
// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> [[A]], [[MUL_I]]
// CHECK-NEXT: ret <8 x i16> [[SUB_I]]
@@ -10125,10 +9525,8 @@ int16x8_t test_vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c) {
// CHECK-LABEL: define <4 x i32> @test_vmlsq_n_s32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], i32 noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[C]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[C]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[C]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[C]], i32 3
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[C]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i32> [[VECINIT_I]], <4 x i32> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i32> [[B]], [[VECINIT3_I]]
// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A]], [[MUL_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
@@ -10140,14 +9538,8 @@ int32x4_t test_vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c) {
// CHECK-LABEL: define <8 x i16> @test_vmlsq_n_u16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], i16 noundef zeroext [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[C]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[C]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
-// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[C]], i32 4
-// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[C]], i32 5
-// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[C]], i32 6
-// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[C]], i32 7
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[C]], i64 0
+// CHECK-NEXT: [[VECINIT7_I:%.*]] = shufflevector <8 x i16> [[VECINIT_I]], <8 x i16> poison, <8 x i32> zeroinitializer
// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i16> [[B]], [[VECINIT7_I]]
// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> [[A]], [[MUL_I]]
// CHECK-NEXT: ret <8 x i16> [[SUB_I]]
@@ -10159,10 +9551,8 @@ uint16x8_t test_vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) {
// CHECK-LABEL: define <4 x i32> @test_vmlsq_n_u32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], i32 noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[C]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[C]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[C]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[C]], i32 3
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[C]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i32> [[VECINIT_I]], <4 x i32> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i32> [[B]], [[VECINIT3_I]]
// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A]], [[MUL_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
@@ -10174,10 +9564,8 @@ uint32x4_t test_vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) {
// CHECK-LABEL: define <4 x float> @test_vmlsq_n_f32(
// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]], float noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float [[C]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float [[C]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float [[C]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float [[C]], i32 3
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float [[C]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x float> [[VECINIT_I]], <4 x float> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL_I:%.*]] = fmul <4 x float> [[B]], [[VECINIT3_I]]
// CHECK-NEXT: [[SUB_I:%.*]] = fsub <4 x float> [[A]], [[MUL_I]]
// CHECK-NEXT: ret <4 x float> [[SUB_I]]
@@ -10199,7 +9587,6 @@ int16x8_t test_vmovl_s8(int8x8_t a) {
// CHECK-LABEL: define <4 x i32> @test_vmovl_s16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
// CHECK-NEXT: [[VMOVL_I:%.*]] = sext <4 x i16> [[A]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[VMOVL_I]]
//
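// (Likewise for the widening and narrowing moves below: the unused bitcast
// of the source vector to <8 x i8> or <16 x i8> ahead of the
// `sext`/`zext`/`trunc` is no longer checked; the conversion is matched on
// [[A]] directly.)
//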
@@ -10210,7 +9597,6 @@ int32x4_t test_vmovl_s16(int16x4_t a) {
// CHECK-LABEL: define <2 x i64> @test_vmovl_s32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
// CHECK-NEXT: [[VMOVL_I:%.*]] = sext <2 x i32> [[A]] to <2 x i64>
// CHECK-NEXT: ret <2 x i64> [[VMOVL_I]]
//
@@ -10231,7 +9617,6 @@ uint16x8_t test_vmovl_u8(uint8x8_t a) {
// CHECK-LABEL: define <4 x i32> @test_vmovl_u16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
// CHECK-NEXT: [[VMOVL_I:%.*]] = zext <4 x i16> [[A]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[VMOVL_I]]
//
@@ -10242,7 +9627,6 @@ uint32x4_t test_vmovl_u16(uint16x4_t a) {
// CHECK-LABEL: define <2 x i64> @test_vmovl_u32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
// CHECK-NEXT: [[VMOVL_I:%.*]] = zext <2 x i32> [[A]] to <2 x i64>
// CHECK-NEXT: ret <2 x i64> [[VMOVL_I]]
//
@@ -10253,7 +9637,6 @@ uint64x2_t test_vmovl_u32(uint32x2_t a) {
// CHECK-LABEL: define <8 x i8> @test_vmovn_s16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
// CHECK-NEXT: [[VMOVN_I:%.*]] = trunc <8 x i16> [[A]] to <8 x i8>
// CHECK-NEXT: ret <8 x i8> [[VMOVN_I]]
//
@@ -10264,7 +9647,6 @@ int8x8_t test_vmovn_s16(int16x8_t a) {
// CHECK-LABEL: define <4 x i16> @test_vmovn_s32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
// CHECK-NEXT: [[VMOVN_I:%.*]] = trunc <4 x i32> [[A]] to <4 x i16>
// CHECK-NEXT: ret <4 x i16> [[VMOVN_I]]
//
@@ -10275,7 +9657,6 @@ int16x4_t test_vmovn_s32(int32x4_t a) {
// CHECK-LABEL: define <2 x i32> @test_vmovn_s64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
// CHECK-NEXT: [[VMOVN_I:%.*]] = trunc <2 x i64> [[A]] to <2 x i32>
// CHECK-NEXT: ret <2 x i32> [[VMOVN_I]]
//
@@ -10286,7 +9667,6 @@ int32x2_t test_vmovn_s64(int64x2_t a) {
// CHECK-LABEL: define <8 x i8> @test_vmovn_u16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
// CHECK-NEXT: [[VMOVN_I:%.*]] = trunc <8 x i16> [[A]] to <8 x i8>
// CHECK-NEXT: ret <8 x i8> [[VMOVN_I]]
//
@@ -10297,7 +9677,6 @@ uint8x8_t test_vmovn_u16(uint16x8_t a) {
// CHECK-LABEL: define <4 x i16> @test_vmovn_u32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
// CHECK-NEXT: [[VMOVN_I:%.*]] = trunc <4 x i32> [[A]] to <4 x i16>
// CHECK-NEXT: ret <4 x i16> [[VMOVN_I]]
//
@@ -10308,7 +9687,6 @@ uint16x4_t test_vmovn_u32(uint32x4_t a) {
// CHECK-LABEL: define <2 x i32> @test_vmovn_u64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
// CHECK-NEXT: [[VMOVN_I:%.*]] = trunc <2 x i64> [[A]] to <2 x i32>
// CHECK-NEXT: ret <2 x i32> [[VMOVN_I]]
//
@@ -10319,14 +9697,8 @@ uint32x2_t test_vmovn_u64(uint64x2_t a) {
// CHECK-LABEL: define <8 x i8> @test_vmov_n_u8(
// CHECK-SAME: i8 noundef zeroext [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i8> poison, i8 [[A]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 [[A]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 [[A]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 [[A]], i32 3
-// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 [[A]], i32 4
-// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 [[A]], i32 5
-// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 [[A]], i32 6
-// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 [[A]], i32 7
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i8> poison, i8 [[A]], i64 0
+// CHECK-NEXT: [[VECINIT7_I:%.*]] = shufflevector <8 x i8> [[VECINIT_I]], <8 x i8> poison, <8 x i32> zeroinitializer
// CHECK-NEXT: ret <8 x i8> [[VECINIT7_I]]
//
uint8x8_t test_vmov_n_u8(uint8_t a) {
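//
// (The vmov_n_* and vmovq_n_* duplications below follow the same canonical
// splat shape; for the <16 x i8> cases the sixteen per-lane `insertelement`
// checks collapse into the single insert-plus-shuffle pair.)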
@@ -10336,10 +9708,8 @@ uint8x8_t test_vmov_n_u8(uint8_t a) {
// CHECK-LABEL: define <4 x i16> @test_vmov_n_u16(
// CHECK-SAME: i16 noundef zeroext [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[A]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[A]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[A]], i32 3
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: ret <4 x i16> [[VECINIT3_I]]
//
uint16x4_t test_vmov_n_u16(uint16_t a) {
@@ -10349,8 +9719,8 @@ uint16x4_t test_vmov_n_u16(uint16_t a) {
// CHECK-LABEL: define <2 x i32> @test_vmov_n_u32(
// CHECK-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[A]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[A]], i32 1
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[A]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: ret <2 x i32> [[VECINIT1_I]]
//
uint32x2_t test_vmov_n_u32(uint32_t a) {
@@ -10360,14 +9730,8 @@ uint32x2_t test_vmov_n_u32(uint32_t a) {
// CHECK-LABEL: define <8 x i8> @test_vmov_n_s8(
// CHECK-SAME: i8 noundef signext [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i8> poison, i8 [[A]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 [[A]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 [[A]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 [[A]], i32 3
-// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 [[A]], i32 4
-// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 [[A]], i32 5
-// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 [[A]], i32 6
-// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 [[A]], i32 7
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i8> poison, i8 [[A]], i64 0
+// CHECK-NEXT: [[VECINIT7_I:%.*]] = shufflevector <8 x i8> [[VECINIT_I]], <8 x i8> poison, <8 x i32> zeroinitializer
// CHECK-NEXT: ret <8 x i8> [[VECINIT7_I]]
//
int8x8_t test_vmov_n_s8(int8_t a) {
@@ -10377,10 +9741,8 @@ int8x8_t test_vmov_n_s8(int8_t a) {
// CHECK-LABEL: define <4 x i16> @test_vmov_n_s16(
// CHECK-SAME: i16 noundef signext [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[A]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[A]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[A]], i32 3
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: ret <4 x i16> [[VECINIT3_I]]
//
int16x4_t test_vmov_n_s16(int16_t a) {
@@ -10390,8 +9752,8 @@ int16x4_t test_vmov_n_s16(int16_t a) {
// CHECK-LABEL: define <2 x i32> @test_vmov_n_s32(
// CHECK-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[A]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[A]], i32 1
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[A]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: ret <2 x i32> [[VECINIT1_I]]
//
int32x2_t test_vmov_n_s32(int32_t a) {
@@ -10401,14 +9763,8 @@ int32x2_t test_vmov_n_s32(int32_t a) {
// CHECK-LABEL: define <8 x i8> @test_vmov_n_p8(
// CHECK-SAME: i8 noundef signext [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i8> poison, i8 [[A]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 [[A]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 [[A]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 [[A]], i32 3
-// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 [[A]], i32 4
-// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 [[A]], i32 5
-// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 [[A]], i32 6
-// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 [[A]], i32 7
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i8> poison, i8 [[A]], i64 0
+// CHECK-NEXT: [[VECINIT7_I:%.*]] = shufflevector <8 x i8> [[VECINIT_I]], <8 x i8> poison, <8 x i32> zeroinitializer
// CHECK-NEXT: ret <8 x i8> [[VECINIT7_I]]
//
poly8x8_t test_vmov_n_p8(poly8_t a) {
@@ -10418,10 +9774,8 @@ poly8x8_t test_vmov_n_p8(poly8_t a) {
// CHECK-LABEL: define <4 x i16> @test_vmov_n_p16(
// CHECK-SAME: i16 noundef signext [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[A]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[A]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[A]], i32 3
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: ret <4 x i16> [[VECINIT3_I]]
//
poly16x4_t test_vmov_n_p16(poly16_t a) {
@@ -10432,10 +9786,8 @@ poly16x4_t test_vmov_n_p16(poly16_t a) {
// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[TMP0:%.*]] = load half, ptr [[A]], align 2
-// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[TMP0]], i32 0
-// CHECK-NEXT: [[VECINIT1:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP0]], i32 1
-// CHECK-NEXT: [[VECINIT2:%.*]] = insertelement <4 x half> [[VECINIT1]], half [[TMP0]], i32 2
-// CHECK-NEXT: [[VECINIT3:%.*]] = insertelement <4 x half> [[VECINIT2]], half [[TMP0]], i32 3
+// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[TMP0]], i64 0
+// CHECK-NEXT: [[VECINIT3:%.*]] = shufflevector <4 x half> [[VECINIT]], <4 x half> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: ret <4 x half> [[VECINIT3]]
//
float16x4_t test_vmov_n_f16(float16_t *a) {
@@ -10445,8 +9797,8 @@ float16x4_t test_vmov_n_f16(float16_t *a) {
// CHECK-LABEL: define <2 x float> @test_vmov_n_f32(
// CHECK-SAME: float noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x float> poison, float [[A]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float [[A]], i32 1
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x float> poison, float [[A]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x float> [[VECINIT_I]], <2 x float> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: ret <2 x float> [[VECINIT1_I]]
//
float32x2_t test_vmov_n_f32(float32_t a) {
@@ -10456,22 +9808,8 @@ float32x2_t test_vmov_n_f32(float32_t a) {
// CHECK-LABEL: define <16 x i8> @test_vmovq_n_u8(
// CHECK-SAME: i8 noundef zeroext [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i8> poison, i8 [[A]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 [[A]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 [[A]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 [[A]], i32 3
-// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 [[A]], i32 4
-// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 [[A]], i32 5
-// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 [[A]], i32 6
-// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 [[A]], i32 7
-// CHECK-NEXT: [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 [[A]], i32 8
-// CHECK-NEXT: [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 [[A]], i32 9
-// CHECK-NEXT: [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 [[A]], i32 10
-// CHECK-NEXT: [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 [[A]], i32 11
-// CHECK-NEXT: [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 [[A]], i32 12
-// CHECK-NEXT: [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 [[A]], i32 13
-// CHECK-NEXT: [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 [[A]], i32 14
-// CHECK-NEXT: [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 [[A]], i32 15
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i8> poison, i8 [[A]], i64 0
+// CHECK-NEXT: [[VECINIT15_I:%.*]] = shufflevector <16 x i8> [[VECINIT_I]], <16 x i8> poison, <16 x i32> zeroinitializer
// CHECK-NEXT: ret <16 x i8> [[VECINIT15_I]]
//
uint8x16_t test_vmovq_n_u8(uint8_t a) {
@@ -10481,14 +9819,8 @@ uint8x16_t test_vmovq_n_u8(uint8_t a) {
// CHECK-LABEL: define <8 x i16> @test_vmovq_n_u16(
// CHECK-SAME: i16 noundef zeroext [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[A]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[A]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[A]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[A]], i32 3
-// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[A]], i32 4
-// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[A]], i32 5
-// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[A]], i32 6
-// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[A]], i32 7
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[VECINIT7_I:%.*]] = shufflevector <8 x i16> [[VECINIT_I]], <8 x i16> poison, <8 x i32> zeroinitializer
// CHECK-NEXT: ret <8 x i16> [[VECINIT7_I]]
//
uint16x8_t test_vmovq_n_u16(uint16_t a) {
@@ -10498,10 +9830,8 @@ uint16x8_t test_vmovq_n_u16(uint16_t a) {
// CHECK-LABEL: define <4 x i32> @test_vmovq_n_u32(
// CHECK-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[A]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[A]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[A]], i32 3
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i32> [[VECINIT_I]], <4 x i32> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: ret <4 x i32> [[VECINIT3_I]]
//
uint32x4_t test_vmovq_n_u32(uint32_t a) {
@@ -10511,22 +9841,8 @@ uint32x4_t test_vmovq_n_u32(uint32_t a) {
// CHECK-LABEL: define <16 x i8> @test_vmovq_n_s8(
// CHECK-SAME: i8 noundef signext [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i8> poison, i8 [[A]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 [[A]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 [[A]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 [[A]], i32 3
-// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 [[A]], i32 4
-// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 [[A]], i32 5
-// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 [[A]], i32 6
-// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 [[A]], i32 7
-// CHECK-NEXT: [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 [[A]], i32 8
-// CHECK-NEXT: [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 [[A]], i32 9
-// CHECK-NEXT: [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 [[A]], i32 10
-// CHECK-NEXT: [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 [[A]], i32 11
-// CHECK-NEXT: [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 [[A]], i32 12
-// CHECK-NEXT: [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 [[A]], i32 13
-// CHECK-NEXT: [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 [[A]], i32 14
-// CHECK-NEXT: [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 [[A]], i32 15
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i8> poison, i8 [[A]], i64 0
+// CHECK-NEXT: [[VECINIT15_I:%.*]] = shufflevector <16 x i8> [[VECINIT_I]], <16 x i8> poison, <16 x i32> zeroinitializer
// CHECK-NEXT: ret <16 x i8> [[VECINIT15_I]]
//
int8x16_t test_vmovq_n_s8(int8_t a) {
@@ -10536,14 +9852,8 @@ int8x16_t test_vmovq_n_s8(int8_t a) {
// CHECK-LABEL: define <8 x i16> @test_vmovq_n_s16(
// CHECK-SAME: i16 noundef signext [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[A]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[A]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[A]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[A]], i32 3
-// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[A]], i32 4
-// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[A]], i32 5
-// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[A]], i32 6
-// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[A]], i32 7
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[VECINIT7_I:%.*]] = shufflevector <8 x i16> [[VECINIT_I]], <8 x i16> poison, <8 x i32> zeroinitializer
// CHECK-NEXT: ret <8 x i16> [[VECINIT7_I]]
//
int16x8_t test_vmovq_n_s16(int16_t a) {
@@ -10553,10 +9863,8 @@ int16x8_t test_vmovq_n_s16(int16_t a) {
// CHECK-LABEL: define <4 x i32> @test_vmovq_n_s32(
// CHECK-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[A]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[A]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[A]], i32 3
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i32> [[VECINIT_I]], <4 x i32> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: ret <4 x i32> [[VECINIT3_I]]
//
int32x4_t test_vmovq_n_s32(int32_t a) {
@@ -10566,22 +9874,8 @@ int32x4_t test_vmovq_n_s32(int32_t a) {
// CHECK-LABEL: define <16 x i8> @test_vmovq_n_p8(
// CHECK-SAME: i8 noundef signext [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i8> poison, i8 [[A]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 [[A]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 [[A]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 [[A]], i32 3
-// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 [[A]], i32 4
-// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 [[A]], i32 5
-// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 [[A]], i32 6
-// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 [[A]], i32 7
-// CHECK-NEXT: [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 [[A]], i32 8
-// CHECK-NEXT: [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 [[A]], i32 9
-// CHECK-NEXT: [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 [[A]], i32 10
-// CHECK-NEXT: [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 [[A]], i32 11
-// CHECK-NEXT: [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 [[A]], i32 12
-// CHECK-NEXT: [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 [[A]], i32 13
-// CHECK-NEXT: [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 [[A]], i32 14
-// CHECK-NEXT: [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 [[A]], i32 15
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i8> poison, i8 [[A]], i64 0
+// CHECK-NEXT: [[VECINIT15_I:%.*]] = shufflevector <16 x i8> [[VECINIT_I]], <16 x i8> poison, <16 x i32> zeroinitializer
// CHECK-NEXT: ret <16 x i8> [[VECINIT15_I]]
//
poly8x16_t test_vmovq_n_p8(poly8_t a) {
@@ -10591,14 +9885,8 @@ poly8x16_t test_vmovq_n_p8(poly8_t a) {
// CHECK-LABEL: define <8 x i16> @test_vmovq_n_p16(
// CHECK-SAME: i16 noundef signext [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[A]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[A]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[A]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[A]], i32 3
-// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[A]], i32 4
-// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[A]], i32 5
-// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[A]], i32 6
-// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[A]], i32 7
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[VECINIT7_I:%.*]] = shufflevector <8 x i16> [[VECINIT_I]], <8 x i16> poison, <8 x i32> zeroinitializer
// CHECK-NEXT: ret <8 x i16> [[VECINIT7_I]]
//
poly16x8_t test_vmovq_n_p16(poly16_t a) {
@@ -10609,14 +9897,8 @@ poly16x8_t test_vmovq_n_p16(poly16_t a) {
// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[TMP0:%.*]] = load half, ptr [[A]], align 2
-// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[TMP0]], i32 0
-// CHECK-NEXT: [[VECINIT1:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP0]], i32 1
-// CHECK-NEXT: [[VECINIT2:%.*]] = insertelement <8 x half> [[VECINIT1]], half [[TMP0]], i32 2
-// CHECK-NEXT: [[VECINIT3:%.*]] = insertelement <8 x half> [[VECINIT2]], half [[TMP0]], i32 3
-// CHECK-NEXT: [[VECINIT4:%.*]] = insertelement <8 x half> [[VECINIT3]], half [[TMP0]], i32 4
-// CHECK-NEXT: [[VECINIT5:%.*]] = insertelement <8 x half> [[VECINIT4]], half [[TMP0]], i32 5
-// CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <8 x half> [[VECINIT5]], half [[TMP0]], i32 6
-// CHECK-NEXT: [[VECINIT7:%.*]] = insertelement <8 x half> [[VECINIT6]], half [[TMP0]], i32 7
+// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[TMP0]], i64 0
+// CHECK-NEXT: [[VECINIT7:%.*]] = shufflevector <8 x half> [[VECINIT]], <8 x half> poison, <8 x i32> zeroinitializer
// CHECK-NEXT: ret <8 x half> [[VECINIT7]]
//
float16x8_t test_vmovq_n_f16(float16_t *a) {
@@ -10626,10 +9908,8 @@ float16x8_t test_vmovq_n_f16(float16_t *a) {
// CHECK-LABEL: define <4 x float> @test_vmovq_n_f32(
// CHECK-SAME: float noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float [[A]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float [[A]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float [[A]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float [[A]], i32 3
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float [[A]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x float> [[VECINIT_I]], <4 x float> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: ret <4 x float> [[VECINIT3_I]]
//
float32x4_t test_vmovq_n_f32(float32_t a) {
@@ -10639,8 +9919,8 @@ float32x4_t test_vmovq_n_f32(float32_t a) {
// CHECK-LABEL: define <1 x i64> @test_vmov_n_s64(
// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <1 x i64> poison, i64 [[A]], i32 0
-// CHECK-NEXT: [[ADD_I:%.*]] = add <1 x i64> [[VECINIT_I]], [[VECINIT_I]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <1 x i64> poison, i64 [[A]], i64 0
+// CHECK-NEXT: [[ADD_I:%.*]] = shl <1 x i64> [[VECINIT_I]], splat (i64 1)
// CHECK-NEXT: ret <1 x i64> [[ADD_I]]
//
int64x1_t test_vmov_n_s64(int64_t a) {
@@ -10651,8 +9931,8 @@ int64x1_t test_vmov_n_s64(int64_t a) {
// CHECK-LABEL: define <1 x i64> @test_vmov_n_u64(
// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <1 x i64> poison, i64 [[A]], i32 0
-// CHECK-NEXT: [[ADD_I:%.*]] = add <1 x i64> [[VECINIT_I]], [[VECINIT_I]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <1 x i64> poison, i64 [[A]], i64 0
+// CHECK-NEXT: [[ADD_I:%.*]] = shl <1 x i64> [[VECINIT_I]], splat (i64 1)
// CHECK-NEXT: ret <1 x i64> [[ADD_I]]
//
uint64x1_t test_vmov_n_u64(uint64_t a) {
@@ -10663,8 +9943,8 @@ uint64x1_t test_vmov_n_u64(uint64_t a) {
// CHECK-LABEL: define <2 x i64> @test_vmovq_n_s64(
// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i64> poison, i64 [[A]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 [[A]], i32 1
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i64> poison, i64 [[A]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i64> [[VECINIT_I]], <2 x i64> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: ret <2 x i64> [[VECINIT1_I]]
//
int64x2_t test_vmovq_n_s64(int64_t a) {
@@ -10674,8 +9954,8 @@ int64x2_t test_vmovq_n_s64(int64_t a) {
// CHECK-LABEL: define <2 x i64> @test_vmovq_n_u64(
// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i64> poison, i64 [[A]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 [[A]], i32 1
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i64> poison, i64 [[A]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i64> [[VECINIT_I]], <2 x i64> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: ret <2 x i64> [[VECINIT1_I]]
//
uint64x2_t test_vmovq_n_u64(uint64_t a) {
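// NOTE: In the 64-bit vmov_n/vmovq_n hunks above, the self-add of the
// splat (`add x, x` in the old checks) is now printed in its canonical
// doubled form, `shl x, splat (i64 1)`, using the newer `splat (...)`
// syntax for splatted vector constants.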
@@ -10835,8 +10115,6 @@ int16x8_t test_vmull_s8(int8x8_t a, int8x8_t b) {
// CHECK-LABEL: define <4 x i32> @test_vmull_s16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[A]], <4 x i16> [[B]])
// CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
//
@@ -10847,8 +10125,6 @@ int32x4_t test_vmull_s16(int16x4_t a, int16x4_t b) {
// CHECK-LABEL: define <2 x i64> @test_vmull_s32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[A]], <2 x i32> [[B]])
// CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
//
@@ -10869,8 +10145,6 @@ uint16x8_t test_vmull_u8(uint8x8_t a, uint8x8_t b) {
// CHECK-LABEL: define <4 x i32> @test_vmull_u16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[A]], <4 x i16> [[B]])
// CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
//
@@ -10881,8 +10155,6 @@ uint32x4_t test_vmull_u16(uint16x4_t a, uint16x4_t b) {
// CHECK-LABEL: define <2 x i64> @test_vmull_u32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[A]], <2 x i32> [[B]])
// CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
//
@@ -10903,11 +10175,7 @@ poly16x8_t test_vmull_p8(poly8x8_t a, poly8x8_t b) {
// CHECK-LABEL: define <4 x i32> @test_vmull_lane_s16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[B]], <4 x i16> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
// CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
//
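// NOTE: For the lane variants above and below, the regenerated checks splat
// the selected lane directly from the argument, e.g.
//   [[LANE]] = shufflevector <4 x i16> [[B]], <4 x i16> poison, ...
// rather than first round-tripping [[B]] through <8 x i8> bitcasts; the
// shuffle mask (the lane index repeated) is unchanged.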
@@ -10918,11 +10186,7 @@ int32x4_t test_vmull_lane_s16(int16x4_t a, int16x4_t b) {
// CHECK-LABEL: define <2 x i64> @test_vmull_lane_s32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[B]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
// CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
//
@@ -10933,11 +10197,7 @@ int64x2_t test_vmull_lane_s32(int32x2_t a, int32x2_t b) {
// CHECK-LABEL: define <4 x i32> @test_vmull_lane_u16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[B]], <4 x i16> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
// CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
//
@@ -10948,11 +10208,7 @@ uint32x4_t test_vmull_lane_u16(uint16x4_t a, uint16x4_t b) {
// CHECK-LABEL: define <2 x i64> @test_vmull_lane_u32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[B]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
// CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
//
@@ -10963,12 +10219,8 @@ uint64x2_t test_vmull_lane_u32(uint32x2_t a, uint32x2_t b) {
// CHECK-LABEL: define <4 x i32> @test_vmull_n_s16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]], i16 noundef signext [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[A]], <4 x i16> [[VECINIT3_I]])
// CHECK-NEXT: ret <4 x i32> [[VMULL2_I_I]]
//
@@ -10979,10 +10231,8 @@ int32x4_t test_vmull_n_s16(int16x4_t a, int16_t b) {
// CHECK-LABEL: define <2 x i64> @test_vmull_n_s32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[A]], <2 x i32> [[VECINIT1_I]])
// CHECK-NEXT: ret <2 x i64> [[VMULL2_I_I]]
//
@@ -10993,12 +10243,8 @@ int64x2_t test_vmull_n_s32(int32x2_t a, int32_t b) {
// CHECK-LABEL: define <4 x i32> @test_vmull_n_u16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]], i16 noundef zeroext [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[A]], <4 x i16> [[VECINIT3_I]])
// CHECK-NEXT: ret <4 x i32> [[VMULL2_I_I]]
//
@@ -11009,10 +10255,8 @@ uint32x4_t test_vmull_n_u16(uint16x4_t a, uint16_t b) {
// CHECK-LABEL: define <2 x i64> @test_vmull_n_u32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[A]], <2 x i32> [[VECINIT1_I]])
// CHECK-NEXT: ret <2 x i64> [[VMULL2_I_I]]
//
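// NOTE: The vmull_n_* hunks above combine both updates: the scalar operand
// is splatted with the insert+shuffle idiom, and the <8 x i8> bitcasts that
// previously preceded the @llvm.arm.neon.vmull* call, whose results were
// never used, are no longer emitted.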
@@ -11043,9 +10287,7 @@ poly8x16_t test_vmulq_p8(poly8x16_t a, poly8x16_t b) {
// CHECK-LABEL: define <4 x i16> @test_vmul_lane_s16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[B]], <4 x i16> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[A]], [[LANE]]
// CHECK-NEXT: ret <4 x i16> [[MUL]]
//
@@ -11056,9 +10298,7 @@ int16x4_t test_vmul_lane_s16(int16x4_t a, int16x4_t b) {
// CHECK-LABEL: define <2 x i32> @test_vmul_lane_s32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[B]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[A]], [[LANE]]
// CHECK-NEXT: ret <2 x i32> [[MUL]]
//
@@ -11069,9 +10309,7 @@ int32x2_t test_vmul_lane_s32(int32x2_t a, int32x2_t b) {
// CHECK-LABEL: define <2 x float> @test_vmul_lane_f32(
// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[B]], <2 x float> poison, <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[A]], [[LANE]]
// CHECK-NEXT: ret <2 x float> [[MUL]]
//
@@ -11082,9 +10320,7 @@ float32x2_t test_vmul_lane_f32(float32x2_t a, float32x2_t b) {
// CHECK-LABEL: define <4 x i16> @test_vmul_lane_u16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[B]], <4 x i16> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[A]], [[LANE]]
// CHECK-NEXT: ret <4 x i16> [[MUL]]
//
@@ -11095,9 +10331,7 @@ uint16x4_t test_vmul_lane_u16(uint16x4_t a, uint16x4_t b) {
// CHECK-LABEL: define <2 x i32> @test_vmul_lane_u32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[B]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[A]], [[LANE]]
// CHECK-NEXT: ret <2 x i32> [[MUL]]
//
@@ -11108,9 +10342,7 @@ uint32x2_t test_vmul_lane_u32(uint32x2_t a, uint32x2_t b) {
// CHECK-LABEL: define <8 x i16> @test_vmulq_lane_s16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[B]], <4 x i16> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[A]], [[LANE]]
// CHECK-NEXT: ret <8 x i16> [[MUL]]
//
@@ -11121,9 +10353,7 @@ int16x8_t test_vmulq_lane_s16(int16x8_t a, int16x4_t b) {
// CHECK-LABEL: define <4 x i32> @test_vmulq_lane_s32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[B]], <2 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[A]], [[LANE]]
// CHECK-NEXT: ret <4 x i32> [[MUL]]
//
@@ -11134,9 +10364,7 @@ int32x4_t test_vmulq_lane_s32(int32x4_t a, int32x2_t b) {
// CHECK-LABEL: define <4 x float> @test_vmulq_lane_f32(
// CHECK-SAME: <4 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[B]], <2 x float> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK-NEXT: [[MUL:%.*]] = fmul <4 x float> [[A]], [[LANE]]
// CHECK-NEXT: ret <4 x float> [[MUL]]
//
@@ -11147,9 +10375,7 @@ float32x4_t test_vmulq_lane_f32(float32x4_t a, float32x2_t b) {
// CHECK-LABEL: define <8 x i16> @test_vmulq_lane_u16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[B]], <4 x i16> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[A]], [[LANE]]
// CHECK-NEXT: ret <8 x i16> [[MUL]]
//
@@ -11160,9 +10386,7 @@ uint16x8_t test_vmulq_lane_u16(uint16x8_t a, uint16x4_t b) {
// CHECK-LABEL: define <4 x i32> @test_vmulq_lane_u32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[B]], <2 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[A]], [[LANE]]
// CHECK-NEXT: ret <4 x i32> [[MUL]]
//
@@ -11173,10 +10397,8 @@ uint32x4_t test_vmulq_lane_u32(uint32x4_t a, uint32x2_t b) {
// CHECK-LABEL: define <4 x i16> @test_vmul_n_s16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]], i16 noundef signext [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i16> [[A]], [[VECINIT3_I]]
// CHECK-NEXT: ret <4 x i16> [[MUL_I]]
//
@@ -11187,8 +10409,8 @@ int16x4_t test_vmul_n_s16(int16x4_t a, int16_t b) {
// CHECK-LABEL: define <2 x i32> @test_vmul_n_s32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[MUL_I:%.*]] = mul <2 x i32> [[A]], [[VECINIT1_I]]
// CHECK-NEXT: ret <2 x i32> [[MUL_I]]
//
@@ -11199,8 +10421,8 @@ int32x2_t test_vmul_n_s32(int32x2_t a, int32_t b) {
// CHECK-LABEL: define <2 x float> @test_vmul_n_f32(
// CHECK-SAME: <2 x float> noundef [[A:%.*]], float noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x float> poison, float [[B]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float [[B]], i32 1
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x float> poison, float [[B]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x float> [[VECINIT_I]], <2 x float> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[MUL_I:%.*]] = fmul <2 x float> [[A]], [[VECINIT1_I]]
// CHECK-NEXT: ret <2 x float> [[MUL_I]]
//
@@ -11211,10 +10433,8 @@ float32x2_t test_vmul_n_f32(float32x2_t a, float32_t b) {
// CHECK-LABEL: define <4 x i16> @test_vmul_n_u16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]], i16 noundef zeroext [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i16> [[A]], [[VECINIT3_I]]
// CHECK-NEXT: ret <4 x i16> [[MUL_I]]
//
@@ -11225,8 +10445,8 @@ uint16x4_t test_vmul_n_u16(uint16x4_t a, uint16_t b) {
// CHECK-LABEL: define <2 x i32> @test_vmul_n_u32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[MUL_I:%.*]] = mul <2 x i32> [[A]], [[VECINIT1_I]]
// CHECK-NEXT: ret <2 x i32> [[MUL_I]]
//
@@ -11237,14 +10457,8 @@ uint32x2_t test_vmul_n_u32(uint32x2_t a, uint32_t b) {
// CHECK-LABEL: define <8 x i16> @test_vmulq_n_s16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]], i16 noundef signext [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[B]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[B]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
-// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[B]], i32 4
-// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[B]], i32 5
-// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[B]], i32 6
-// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[B]], i32 7
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[B]], i64 0
+// CHECK-NEXT: [[VECINIT7_I:%.*]] = shufflevector <8 x i16> [[VECINIT_I]], <8 x i16> poison, <8 x i32> zeroinitializer
// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i16> [[A]], [[VECINIT7_I]]
// CHECK-NEXT: ret <8 x i16> [[MUL_I]]
//
@@ -11255,10 +10469,8 @@ int16x8_t test_vmulq_n_s16(int16x8_t a, int16_t b) {
// CHECK-LABEL: define <4 x i32> @test_vmulq_n_s32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[B]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[B]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[B]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[B]], i32 3
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[B]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i32> [[VECINIT_I]], <4 x i32> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i32> [[A]], [[VECINIT3_I]]
// CHECK-NEXT: ret <4 x i32> [[MUL_I]]
//
@@ -11269,10 +10481,8 @@ int32x4_t test_vmulq_n_s32(int32x4_t a, int32_t b) {
// CHECK-LABEL: define <4 x float> @test_vmulq_n_f32(
// CHECK-SAME: <4 x float> noundef [[A:%.*]], float noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float [[B]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float [[B]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float [[B]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float [[B]], i32 3
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float [[B]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x float> [[VECINIT_I]], <4 x float> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL_I:%.*]] = fmul <4 x float> [[A]], [[VECINIT3_I]]
// CHECK-NEXT: ret <4 x float> [[MUL_I]]
//
@@ -11283,14 +10493,8 @@ float32x4_t test_vmulq_n_f32(float32x4_t a, float32_t b) {
// CHECK-LABEL: define <8 x i16> @test_vmulq_n_u16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]], i16 noundef zeroext [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[B]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[B]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
-// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[B]], i32 4
-// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[B]], i32 5
-// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[B]], i32 6
-// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[B]], i32 7
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[B]], i64 0
+// CHECK-NEXT: [[VECINIT7_I:%.*]] = shufflevector <8 x i16> [[VECINIT_I]], <8 x i16> poison, <8 x i32> zeroinitializer
// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i16> [[A]], [[VECINIT7_I]]
// CHECK-NEXT: ret <8 x i16> [[MUL_I]]
//
@@ -11301,10 +10505,8 @@ uint16x8_t test_vmulq_n_u16(uint16x8_t a, uint16_t b) {
// CHECK-LABEL: define <4 x i32> @test_vmulq_n_u32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[B]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[B]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[B]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[B]], i32 3
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[B]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i32> [[VECINIT_I]], <4 x i32> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i32> [[A]], [[VECINIT3_I]]
// CHECK-NEXT: ret <4 x i32> [[MUL_I]]
//
@@ -11871,7 +11073,6 @@ uint64x2_t test_vorrq_u64(uint64x2_t a, uint64x2_t b) {
// CHECK-LABEL: define <4 x i16> @test_vpadal_s8(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
// CHECK-NEXT: [[VPADAL_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpadals.v4i16.v8i8(<4 x i16> [[A]], <8 x i8> [[B]])
// CHECK-NEXT: ret <4 x i16> [[VPADAL_V1_I]]
//
@@ -11882,8 +11083,6 @@ int16x4_t test_vpadal_s8(int16x4_t a, int8x8_t b) {
// CHECK-LABEL: define <2 x i32> @test_vpadal_s16(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
// CHECK-NEXT: [[VPADAL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpadals.v2i32.v4i16(<2 x i32> [[A]], <4 x i16> [[B]])
// CHECK-NEXT: ret <2 x i32> [[VPADAL_V2_I]]
//
@@ -11894,8 +11093,6 @@ int32x2_t test_vpadal_s16(int32x2_t a, int16x4_t b) {
// CHECK-LABEL: define <1 x i64> @test_vpadal_s32(
// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
// CHECK-NEXT: [[VPADAL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vpadals.v1i64.v2i32(<1 x i64> [[A]], <2 x i32> [[B]])
// CHECK-NEXT: ret <1 x i64> [[VPADAL_V2_I]]
//
@@ -11906,7 +11103,6 @@ int64x1_t test_vpadal_s32(int64x1_t a, int32x2_t b) {
// CHECK-LABEL: define <4 x i16> @test_vpadal_u8(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
// CHECK-NEXT: [[VPADAL_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpadalu.v4i16.v8i8(<4 x i16> [[A]], <8 x i8> [[B]])
// CHECK-NEXT: ret <4 x i16> [[VPADAL_V1_I]]
//
@@ -11917,8 +11113,6 @@ uint16x4_t test_vpadal_u8(uint16x4_t a, uint8x8_t b) {
// CHECK-LABEL: define <2 x i32> @test_vpadal_u16(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
// CHECK-NEXT: [[VPADAL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpadalu.v2i32.v4i16(<2 x i32> [[A]], <4 x i16> [[B]])
// CHECK-NEXT: ret <2 x i32> [[VPADAL_V2_I]]
//
@@ -11929,8 +11123,6 @@ uint32x2_t test_vpadal_u16(uint32x2_t a, uint16x4_t b) {
// CHECK-LABEL: define <1 x i64> @test_vpadal_u32(
// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
// CHECK-NEXT: [[VPADAL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vpadalu.v1i64.v2i32(<1 x i64> [[A]], <2 x i32> [[B]])
// CHECK-NEXT: ret <1 x i64> [[VPADAL_V2_I]]
//
@@ -11941,7 +11133,6 @@ uint64x1_t test_vpadal_u32(uint64x1_t a, uint32x2_t b) {
// CHECK-LABEL: define <8 x i16> @test_vpadalq_s8(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
// CHECK-NEXT: [[VPADALQ_V1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vpadals.v8i16.v16i8(<8 x i16> [[A]], <16 x i8> [[B]])
// CHECK-NEXT: ret <8 x i16> [[VPADALQ_V1_I]]
//
@@ -11952,8 +11143,6 @@ int16x8_t test_vpadalq_s8(int16x8_t a, int8x16_t b) {
// CHECK-LABEL: define <4 x i32> @test_vpadalq_s16(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
// CHECK-NEXT: [[VPADALQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vpadals.v4i32.v8i16(<4 x i32> [[A]], <8 x i16> [[B]])
// CHECK-NEXT: ret <4 x i32> [[VPADALQ_V2_I]]
//
@@ -11964,8 +11153,6 @@ int32x4_t test_vpadalq_s16(int32x4_t a, int16x8_t b) {
// CHECK-LABEL: define <2 x i64> @test_vpadalq_s32(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
// CHECK-NEXT: [[VPADALQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vpadals.v2i64.v4i32(<2 x i64> [[A]], <4 x i32> [[B]])
// CHECK-NEXT: ret <2 x i64> [[VPADALQ_V2_I]]
//
@@ -11976,7 +11163,6 @@ int64x2_t test_vpadalq_s32(int64x2_t a, int32x4_t b) {
// CHECK-LABEL: define <8 x i16> @test_vpadalq_u8(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
// CHECK-NEXT: [[VPADALQ_V1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vpadalu.v8i16.v16i8(<8 x i16> [[A]], <16 x i8> [[B]])
// CHECK-NEXT: ret <8 x i16> [[VPADALQ_V1_I]]
//
@@ -11987,8 +11173,6 @@ uint16x8_t test_vpadalq_u8(uint16x8_t a, uint8x16_t b) {
// CHECK-LABEL: define <4 x i32> @test_vpadalq_u16(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
// CHECK-NEXT: [[VPADALQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vpadalu.v4i32.v8i16(<4 x i32> [[A]], <8 x i16> [[B]])
// CHECK-NEXT: ret <4 x i32> [[VPADALQ_V2_I]]
//
@@ -11999,8 +11183,6 @@ uint32x4_t test_vpadalq_u16(uint32x4_t a, uint16x8_t b) {
// CHECK-LABEL: define <2 x i64> @test_vpadalq_u32(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
// CHECK-NEXT: [[VPADALQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vpadalu.v2i64.v4i32(<2 x i64> [[A]], <4 x i32> [[B]])
// CHECK-NEXT: ret <2 x i64> [[VPADALQ_V2_I]]
//
@@ -12021,10 +11203,7 @@ int8x8_t test_vpadd_s8(int8x8_t a, int8x8_t b) {
// CHECK-LABEL: define <4 x i16> @test_vpadd_s16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
// CHECK-NEXT: [[VPADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpadd.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
-// CHECK-NEXT: [[VPADD_V3_I:%.*]] = bitcast <4 x i16> [[VPADD_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <4 x i16> [[VPADD_V2_I]]
//
int16x4_t test_vpadd_s16(int16x4_t a, int16x4_t b) {
@@ -12034,10 +11213,7 @@ int16x4_t test_vpadd_s16(int16x4_t a, int16x4_t b) {
// CHECK-LABEL: define <2 x i32> @test_vpadd_s32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
// CHECK-NEXT: [[VPADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpadd.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
-// CHECK-NEXT: [[VPADD_V3_I:%.*]] = bitcast <2 x i32> [[VPADD_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <2 x i32> [[VPADD_V2_I]]
//
int32x2_t test_vpadd_s32(int32x2_t a, int32x2_t b) {
@@ -12057,10 +11233,7 @@ uint8x8_t test_vpadd_u8(uint8x8_t a, uint8x8_t b) {
// CHECK-LABEL: define <4 x i16> @test_vpadd_u16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
// CHECK-NEXT: [[VPADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpadd.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
-// CHECK-NEXT: [[VPADD_V3_I:%.*]] = bitcast <4 x i16> [[VPADD_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <4 x i16> [[VPADD_V2_I]]
//
uint16x4_t test_vpadd_u16(uint16x4_t a, uint16x4_t b) {
@@ -12070,10 +11243,7 @@ uint16x4_t test_vpadd_u16(uint16x4_t a, uint16x4_t b) {
// CHECK-LABEL: define <2 x i32> @test_vpadd_u32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
// CHECK-NEXT: [[VPADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpadd.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
-// CHECK-NEXT: [[VPADD_V3_I:%.*]] = bitcast <2 x i32> [[VPADD_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <2 x i32> [[VPADD_V2_I]]
//
uint32x2_t test_vpadd_u32(uint32x2_t a, uint32x2_t b) {
@@ -12083,10 +11253,7 @@ uint32x2_t test_vpadd_u32(uint32x2_t a, uint32x2_t b) {
// CHECK-LABEL: define <2 x float> @test_vpadd_f32(
// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <8 x i8>
// CHECK-NEXT: [[VPADD_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vpadd.v2f32(<2 x float> [[A]], <2 x float> [[B]])
-// CHECK-NEXT: [[VPADD_V3_I:%.*]] = bitcast <2 x float> [[VPADD_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <2 x float> [[VPADD_V2_I]]
//
float32x2_t test_vpadd_f32(float32x2_t a, float32x2_t b) {
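// NOTE: In the vpadd/vpmax/vpmin hunks here (and the vqabs/vqadd checks
// further down), the trailing bitcast of the intrinsic result to <8 x i8>
// (the old *_V3_I values) was dead: the function returns the call result
// directly. The regenerated checks drop it, along with the unused argument
// bitcasts.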
@@ -12106,7 +11273,6 @@ int16x4_t test_vpaddl_s8(int8x8_t a) {
// CHECK-LABEL: define <2 x i32> @test_vpaddl_s16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
// CHECK-NEXT: [[VPADDL1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpaddls.v2i32.v4i16(<4 x i16> [[A]])
// CHECK-NEXT: ret <2 x i32> [[VPADDL1_I]]
//
@@ -12117,7 +11283,6 @@ int32x2_t test_vpaddl_s16(int16x4_t a) {
// CHECK-LABEL: define <1 x i64> @test_vpaddl_s32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
// CHECK-NEXT: [[VPADDL1_I:%.*]] = call <1 x i64> @llvm.arm.neon.vpaddls.v1i64.v2i32(<2 x i32> [[A]])
// CHECK-NEXT: ret <1 x i64> [[VPADDL1_I]]
//
@@ -12138,7 +11303,6 @@ uint16x4_t test_vpaddl_u8(uint8x8_t a) {
// CHECK-LABEL: define <2 x i32> @test_vpaddl_u16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
// CHECK-NEXT: [[VPADDL1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16> [[A]])
// CHECK-NEXT: ret <2 x i32> [[VPADDL1_I]]
//
@@ -12149,7 +11313,6 @@ uint32x2_t test_vpaddl_u16(uint16x4_t a) {
// CHECK-LABEL: define <1 x i64> @test_vpaddl_u32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
// CHECK-NEXT: [[VPADDL1_I:%.*]] = call <1 x i64> @llvm.arm.neon.vpaddlu.v1i64.v2i32(<2 x i32> [[A]])
// CHECK-NEXT: ret <1 x i64> [[VPADDL1_I]]
//
@@ -12170,7 +11333,6 @@ int16x8_t test_vpaddlq_s8(int8x16_t a) {
// CHECK-LABEL: define <4 x i32> @test_vpaddlq_s16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
// CHECK-NEXT: [[VPADDL1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vpaddls.v4i32.v8i16(<8 x i16> [[A]])
// CHECK-NEXT: ret <4 x i32> [[VPADDL1_I]]
//
@@ -12181,7 +11343,6 @@ int32x4_t test_vpaddlq_s16(int16x8_t a) {
// CHECK-LABEL: define <2 x i64> @test_vpaddlq_s32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
// CHECK-NEXT: [[VPADDL1_I:%.*]] = call <2 x i64> @llvm.arm.neon.vpaddls.v2i64.v4i32(<4 x i32> [[A]])
// CHECK-NEXT: ret <2 x i64> [[VPADDL1_I]]
//
@@ -12202,7 +11363,6 @@ uint16x8_t test_vpaddlq_u8(uint8x16_t a) {
// CHECK-LABEL: define <4 x i32> @test_vpaddlq_u16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
// CHECK-NEXT: [[VPADDL1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> [[A]])
// CHECK-NEXT: ret <4 x i32> [[VPADDL1_I]]
//
@@ -12213,7 +11373,6 @@ uint32x4_t test_vpaddlq_u16(uint16x8_t a) {
// CHECK-LABEL: define <2 x i64> @test_vpaddlq_u32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
// CHECK-NEXT: [[VPADDL1_I:%.*]] = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> [[A]])
// CHECK-NEXT: ret <2 x i64> [[VPADDL1_I]]
//
@@ -12234,10 +11393,7 @@ int8x8_t test_vpmax_s8(int8x8_t a, int8x8_t b) {
// CHECK-LABEL: define <4 x i16> @test_vpmax_s16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
// CHECK-NEXT: [[VPMAX_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpmaxs.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
-// CHECK-NEXT: [[VPMAX_V3_I:%.*]] = bitcast <4 x i16> [[VPMAX_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <4 x i16> [[VPMAX_V2_I]]
//
int16x4_t test_vpmax_s16(int16x4_t a, int16x4_t b) {
@@ -12247,10 +11403,7 @@ int16x4_t test_vpmax_s16(int16x4_t a, int16x4_t b) {
// CHECK-LABEL: define <2 x i32> @test_vpmax_s32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
// CHECK-NEXT: [[VPMAX_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpmaxs.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
-// CHECK-NEXT: [[VPMAX_V3_I:%.*]] = bitcast <2 x i32> [[VPMAX_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <2 x i32> [[VPMAX_V2_I]]
//
int32x2_t test_vpmax_s32(int32x2_t a, int32x2_t b) {
@@ -12270,10 +11423,7 @@ uint8x8_t test_vpmax_u8(uint8x8_t a, uint8x8_t b) {
// CHECK-LABEL: define <4 x i16> @test_vpmax_u16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
// CHECK-NEXT: [[VPMAX_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpmaxu.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
-// CHECK-NEXT: [[VPMAX_V3_I:%.*]] = bitcast <4 x i16> [[VPMAX_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <4 x i16> [[VPMAX_V2_I]]
//
uint16x4_t test_vpmax_u16(uint16x4_t a, uint16x4_t b) {
@@ -12283,10 +11433,7 @@ uint16x4_t test_vpmax_u16(uint16x4_t a, uint16x4_t b) {
// CHECK-LABEL: define <2 x i32> @test_vpmax_u32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
// CHECK-NEXT: [[VPMAX_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpmaxu.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
-// CHECK-NEXT: [[VPMAX_V3_I:%.*]] = bitcast <2 x i32> [[VPMAX_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <2 x i32> [[VPMAX_V2_I]]
//
uint32x2_t test_vpmax_u32(uint32x2_t a, uint32x2_t b) {
@@ -12296,10 +11443,7 @@ uint32x2_t test_vpmax_u32(uint32x2_t a, uint32x2_t b) {
// CHECK-LABEL: define <2 x float> @test_vpmax_f32(
// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <8 x i8>
// CHECK-NEXT: [[VPMAX_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vpmaxs.v2f32(<2 x float> [[A]], <2 x float> [[B]])
-// CHECK-NEXT: [[VPMAX_V3_I:%.*]] = bitcast <2 x float> [[VPMAX_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <2 x float> [[VPMAX_V2_I]]
//
float32x2_t test_vpmax_f32(float32x2_t a, float32x2_t b) {
@@ -12319,10 +11463,7 @@ int8x8_t test_vpmin_s8(int8x8_t a, int8x8_t b) {
// CHECK-LABEL: define <4 x i16> @test_vpmin_s16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
// CHECK-NEXT: [[VPMIN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpmins.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
-// CHECK-NEXT: [[VPMIN_V3_I:%.*]] = bitcast <4 x i16> [[VPMIN_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <4 x i16> [[VPMIN_V2_I]]
//
int16x4_t test_vpmin_s16(int16x4_t a, int16x4_t b) {
@@ -12332,10 +11473,7 @@ int16x4_t test_vpmin_s16(int16x4_t a, int16x4_t b) {
// CHECK-LABEL: define <2 x i32> @test_vpmin_s32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
// CHECK-NEXT: [[VPMIN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpmins.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
-// CHECK-NEXT: [[VPMIN_V3_I:%.*]] = bitcast <2 x i32> [[VPMIN_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <2 x i32> [[VPMIN_V2_I]]
//
int32x2_t test_vpmin_s32(int32x2_t a, int32x2_t b) {
@@ -12355,10 +11493,7 @@ uint8x8_t test_vpmin_u8(uint8x8_t a, uint8x8_t b) {
// CHECK-LABEL: define <4 x i16> @test_vpmin_u16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
// CHECK-NEXT: [[VPMIN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpminu.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
-// CHECK-NEXT: [[VPMIN_V3_I:%.*]] = bitcast <4 x i16> [[VPMIN_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <4 x i16> [[VPMIN_V2_I]]
//
uint16x4_t test_vpmin_u16(uint16x4_t a, uint16x4_t b) {
@@ -12368,10 +11503,7 @@ uint16x4_t test_vpmin_u16(uint16x4_t a, uint16x4_t b) {
// CHECK-LABEL: define <2 x i32> @test_vpmin_u32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
// CHECK-NEXT: [[VPMIN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpminu.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
-// CHECK-NEXT: [[VPMIN_V3_I:%.*]] = bitcast <2 x i32> [[VPMIN_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <2 x i32> [[VPMIN_V2_I]]
//
uint32x2_t test_vpmin_u32(uint32x2_t a, uint32x2_t b) {
@@ -12381,10 +11513,7 @@ uint32x2_t test_vpmin_u32(uint32x2_t a, uint32x2_t b) {
// CHECK-LABEL: define <2 x float> @test_vpmin_f32(
// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <8 x i8>
// CHECK-NEXT: [[VPMIN_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vpmins.v2f32(<2 x float> [[A]], <2 x float> [[B]])
-// CHECK-NEXT: [[VPMIN_V3_I:%.*]] = bitcast <2 x float> [[VPMIN_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <2 x float> [[VPMIN_V2_I]]
//
float32x2_t test_vpmin_f32(float32x2_t a, float32x2_t b) {
@@ -12404,9 +11533,7 @@ int8x8_t test_vqabs_s8(int8x8_t a) {
// CHECK-LABEL: define <4 x i16> @test_vqabs_s16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
// CHECK-NEXT: [[VQABS_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqabs.v4i16(<4 x i16> [[A]])
-// CHECK-NEXT: [[VQABS_V2_I:%.*]] = bitcast <4 x i16> [[VQABS_V1_I]] to <8 x i8>
// CHECK-NEXT: ret <4 x i16> [[VQABS_V1_I]]
//
int16x4_t test_vqabs_s16(int16x4_t a) {
@@ -12416,9 +11543,7 @@ int16x4_t test_vqabs_s16(int16x4_t a) {
// CHECK-LABEL: define <2 x i32> @test_vqabs_s32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
// CHECK-NEXT: [[VQABS_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqabs.v2i32(<2 x i32> [[A]])
-// CHECK-NEXT: [[VQABS_V2_I:%.*]] = bitcast <2 x i32> [[VQABS_V1_I]] to <8 x i8>
// CHECK-NEXT: ret <2 x i32> [[VQABS_V1_I]]
//
int32x2_t test_vqabs_s32(int32x2_t a) {
@@ -12438,9 +11563,7 @@ int8x16_t test_vqabsq_s8(int8x16_t a) {
// CHECK-LABEL: define <8 x i16> @test_vqabsq_s16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
// CHECK-NEXT: [[VQABSQ_V1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqabs.v8i16(<8 x i16> [[A]])
-// CHECK-NEXT: [[VQABSQ_V2_I:%.*]] = bitcast <8 x i16> [[VQABSQ_V1_I]] to <16 x i8>
// CHECK-NEXT: ret <8 x i16> [[VQABSQ_V1_I]]
//
int16x8_t test_vqabsq_s16(int16x8_t a) {
@@ -12450,9 +11573,7 @@ int16x8_t test_vqabsq_s16(int16x8_t a) {
// CHECK-LABEL: define <4 x i32> @test_vqabsq_s32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
// CHECK-NEXT: [[VQABSQ_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqabs.v4i32(<4 x i32> [[A]])
-// CHECK-NEXT: [[VQABSQ_V2_I:%.*]] = bitcast <4 x i32> [[VQABSQ_V1_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VQABSQ_V1_I]]
//
int32x4_t test_vqabsq_s32(int32x4_t a) {
@@ -12472,10 +11593,7 @@ int8x8_t test_vqadd_s8(int8x8_t a, int8x8_t b) {
// CHECK-LABEL: define <4 x i16> @test_vqadd_s16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
// CHECK-NEXT: [[VQADD_V2_I:%.*]] = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
-// CHECK-NEXT: [[VQADD_V3_I:%.*]] = bitcast <4 x i16> [[VQADD_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <4 x i16> [[VQADD_V2_I]]
//
int16x4_t test_vqadd_s16(int16x4_t a, int16x4_t b) {
@@ -12485,10 +11603,7 @@ int16x4_t test_vqadd_s16(int16x4_t a, int16x4_t b) {
// CHECK-LABEL: define <2 x i32> @test_vqadd_s32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
// CHECK-NEXT: [[VQADD_V2_I:%.*]] = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
-// CHECK-NEXT: [[VQADD_V3_I:%.*]] = bitcast <2 x i32> [[VQADD_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <2 x i32> [[VQADD_V2_I]]
//
int32x2_t test_vqadd_s32(int32x2_t a, int32x2_t b) {
@@ -12498,10 +11613,7 @@ int32x2_t test_vqadd_s32(int32x2_t a, int32x2_t b) {
// CHECK-LABEL: define <1 x i64> @test_vqadd_s64(
// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
// CHECK-NEXT: [[VQADD_V2_I:%.*]] = call <1 x i64> @llvm.sadd.sat.v1i64(<1 x i64> [[A]], <1 x i64> [[B]])
-// CHECK-NEXT: [[VQADD_V3_I:%.*]] = bitcast <1 x i64> [[VQADD_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <1 x i64> [[VQADD_V2_I]]
//
int64x1_t test_vqadd_s64(int64x1_t a, int64x1_t b) {
@@ -12521,10 +11633,7 @@ uint8x8_t test_vqadd_u8(uint8x8_t a, uint8x8_t b) {
// CHECK-LABEL: define <4 x i16> @test_vqadd_u16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
// CHECK-NEXT: [[VQADD_V2_I:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
-// CHECK-NEXT: [[VQADD_V3_I:%.*]] = bitcast <4 x i16> [[VQADD_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <4 x i16> [[VQADD_V2_I]]
//
uint16x4_t test_vqadd_u16(uint16x4_t a, uint16x4_t b) {
@@ -12534,10 +11643,7 @@ uint16x4_t test_vqadd_u16(uint16x4_t a, uint16x4_t b) {
// CHECK-LABEL: define <2 x i32> @test_vqadd_u32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
// CHECK-NEXT: [[VQADD_V2_I:%.*]] = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
-// CHECK-NEXT: [[VQADD_V3_I:%.*]] = bitcast <2 x i32> [[VQADD_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <2 x i32> [[VQADD_V2_I]]
//
uint32x2_t test_vqadd_u32(uint32x2_t a, uint32x2_t b) {
@@ -12547,10 +11653,7 @@ uint32x2_t test_vqadd_u32(uint32x2_t a, uint32x2_t b) {
// CHECK-LABEL: define <1 x i64> @test_vqadd_u64(
// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
// CHECK-NEXT: [[VQADD_V2_I:%.*]] = call <1 x i64> @llvm.uadd.sat.v1i64(<1 x i64> [[A]], <1 x i64> [[B]])
-// CHECK-NEXT: [[VQADD_V3_I:%.*]] = bitcast <1 x i64> [[VQADD_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <1 x i64> [[VQADD_V2_I]]
//
uint64x1_t test_vqadd_u64(uint64x1_t a, uint64x1_t b) {
@@ -12570,10 +11673,7 @@ int8x16_t test_vqaddq_s8(int8x16_t a, int8x16_t b) {
// CHECK-LABEL: define <8 x i16> @test_vqaddq_s16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
// CHECK-NEXT: [[VQADDQ_V2_I:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
-// CHECK-NEXT: [[VQADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VQADDQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <8 x i16> [[VQADDQ_V2_I]]
//
int16x8_t test_vqaddq_s16(int16x8_t a, int16x8_t b) {
@@ -12583,10 +11683,7 @@ int16x8_t test_vqaddq_s16(int16x8_t a, int16x8_t b) {
// CHECK-LABEL: define <4 x i32> @test_vqaddq_s32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
// CHECK-NEXT: [[VQADDQ_V2_I:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
-// CHECK-NEXT: [[VQADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VQADDQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VQADDQ_V2_I]]
//
int32x4_t test_vqaddq_s32(int32x4_t a, int32x4_t b) {
@@ -12596,10 +11693,7 @@ int32x4_t test_vqaddq_s32(int32x4_t a, int32x4_t b) {
// CHECK-LABEL: define <2 x i64> @test_vqaddq_s64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
// CHECK-NEXT: [[VQADDQ_V2_I:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[A]], <2 x i64> [[B]])
-// CHECK-NEXT: [[VQADDQ_V3_I:%.*]] = bitcast <2 x i64> [[VQADDQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <2 x i64> [[VQADDQ_V2_I]]
//
int64x2_t test_vqaddq_s64(int64x2_t a, int64x2_t b) {
@@ -12619,10 +11713,7 @@ uint8x16_t test_vqaddq_u8(uint8x16_t a, uint8x16_t b) {
// CHECK-LABEL: define <8 x i16> @test_vqaddq_u16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
// CHECK-NEXT: [[VQADDQ_V2_I:%.*]] = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
-// CHECK-NEXT: [[VQADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VQADDQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <8 x i16> [[VQADDQ_V2_I]]
//
uint16x8_t test_vqaddq_u16(uint16x8_t a, uint16x8_t b) {
@@ -12632,10 +11723,7 @@ uint16x8_t test_vqaddq_u16(uint16x8_t a, uint16x8_t b) {
// CHECK-LABEL: define <4 x i32> @test_vqaddq_u32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
// CHECK-NEXT: [[VQADDQ_V2_I:%.*]] = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
-// CHECK-NEXT: [[VQADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VQADDQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VQADDQ_V2_I]]
//
uint32x4_t test_vqaddq_u32(uint32x4_t a, uint32x4_t b) {
@@ -12645,10 +11733,7 @@ uint32x4_t test_vqaddq_u32(uint32x4_t a, uint32x4_t b) {
// CHECK-LABEL: define <2 x i64> @test_vqaddq_u64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
// CHECK-NEXT: [[VQADDQ_V2_I:%.*]] = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> [[A]], <2 x i64> [[B]])
-// CHECK-NEXT: [[VQADDQ_V3_I:%.*]] = bitcast <2 x i64> [[VQADDQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <2 x i64> [[VQADDQ_V2_I]]
//
uint64x2_t test_vqaddq_u64(uint64x2_t a, uint64x2_t b) {
@@ -12658,9 +11743,6 @@ uint64x2_t test_vqaddq_u64(uint64x2_t a, uint64x2_t b) {
// CHECK-LABEL: define <4 x i32> @test_vqdmlal_s16(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8>
// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[C]])
// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]]
@@ -12672,9 +11754,6 @@ int32x4_t test_vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
// CHECK-LABEL: define <2 x i64> @test_vqdmlal_s32(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8>
// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[C]])
// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]]
@@ -12686,12 +11765,7 @@ int64x2_t test_vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
// CHECK-LABEL: define <4 x i32> @test_vqdmlal_lane_s16(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[C]], <4 x i16> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]]
@@ -12703,12 +11777,7 @@ int32x4_t test_vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
// CHECK-LABEL: define <2 x i64> @test_vqdmlal_lane_s32(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[C]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]]
@@ -12720,13 +11789,8 @@ int64x2_t test_vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
// CHECK-LABEL: define <4 x i32> @test_vqdmlal_n_s16(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], i16 noundef signext [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]])
// CHECK-NEXT: [[VQDMLAL_V3_I_I:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I_I]])
// CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I_I]]
@@ -12738,11 +11802,8 @@ int32x4_t test_vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
// CHECK-LABEL: define <2 x i64> @test_vqdmlal_n_s32(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], i32 noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]])
// CHECK-NEXT: [[VQDMLAL_V3_I_I:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I_I]])
// CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I_I]]
@@ -12754,9 +11815,6 @@ int64x2_t test_vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_s16(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8>
// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[C]])
// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]]
@@ -12768,9 +11826,6 @@ int32x4_t test_vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_s32(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8>
// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[C]])
// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]]
@@ -12782,12 +11837,7 @@ int64x2_t test_vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_lane_s16(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[C]], <4 x i16> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]]
@@ -12799,12 +11849,7 @@ int32x4_t test_vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_lane_s32(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[C]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]]
@@ -12816,13 +11861,8 @@ int64x2_t test_vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_n_s16(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], i16 noundef signext [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]])
// CHECK-NEXT: [[VQDMLSL_V3_I_I:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I_I]])
// CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I_I]]
@@ -12834,11 +11874,8 @@ int32x4_t test_vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_n_s32(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], i32 noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]])
// CHECK-NEXT: [[VQDMLSL_V3_I_I:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I_I]])
// CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I_I]]
@@ -12850,10 +11887,7 @@ int64x2_t test_vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
// CHECK-LABEL: define <4 x i16> @test_vqdmulh_s16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
// CHECK-NEXT: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
-// CHECK-NEXT: [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <4 x i16> [[VQDMULH_V2_I]]
//
int16x4_t test_vqdmulh_s16(int16x4_t a, int16x4_t b) {
@@ -12863,10 +11897,7 @@ int16x4_t test_vqdmulh_s16(int16x4_t a, int16x4_t b) {
// CHECK-LABEL: define <2 x i32> @test_vqdmulh_s32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
// CHECK-NEXT: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
-// CHECK-NEXT: [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <2 x i32> [[VQDMULH_V2_I]]
//
int32x2_t test_vqdmulh_s32(int32x2_t a, int32x2_t b) {
@@ -12876,10 +11907,7 @@ int32x2_t test_vqdmulh_s32(int32x2_t a, int32x2_t b) {
// CHECK-LABEL: define <8 x i16> @test_vqdmulhq_s16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
// CHECK-NEXT: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
-// CHECK-NEXT: [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <8 x i16> [[VQDMULHQ_V2_I]]
//
int16x8_t test_vqdmulhq_s16(int16x8_t a, int16x8_t b) {
@@ -12889,10 +11917,7 @@ int16x8_t test_vqdmulhq_s16(int16x8_t a, int16x8_t b) {
// CHECK-LABEL: define <4 x i32> @test_vqdmulhq_s32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
// CHECK-NEXT: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
-// CHECK-NEXT: [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VQDMULHQ_V2_I]]
//
int32x4_t test_vqdmulhq_s32(int32x4_t a, int32x4_t b) {
@@ -12902,13 +11927,8 @@ int32x4_t test_vqdmulhq_s32(int32x4_t a, int32x4_t b) {
// CHECK-LABEL: define <4 x i16> @test_vqdmulh_lane_s16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[B]], <4 x i16> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[LANE]])
-// CHECK-NEXT: [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <4 x i16> [[VQDMULH_V2_I]]
//
int16x4_t test_vqdmulh_lane_s16(int16x4_t a, int16x4_t b) {
@@ -12918,13 +11938,8 @@ int16x4_t test_vqdmulh_lane_s16(int16x4_t a, int16x4_t b) {
// CHECK-LABEL: define <2 x i32> @test_vqdmulh_lane_s32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[B]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[LANE]])
-// CHECK-NEXT: [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <2 x i32> [[VQDMULH_V2_I]]
//
int32x2_t test_vqdmulh_lane_s32(int32x2_t a, int32x2_t b) {
@@ -12934,13 +11949,8 @@ int32x2_t test_vqdmulh_lane_s32(int32x2_t a, int32x2_t b) {
// CHECK-LABEL: define <8 x i16> @test_vqdmulhq_lane_s16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[LANE]] to <16 x i8>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[B]], <4 x i16> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[LANE]])
-// CHECK-NEXT: [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <8 x i16> [[VQDMULHQ_V2_I]]
//
int16x8_t test_vqdmulhq_lane_s16(int16x8_t a, int16x4_t b) {
@@ -12950,13 +11960,8 @@ int16x8_t test_vqdmulhq_lane_s16(int16x8_t a, int16x4_t b) {
// CHECK-LABEL: define <4 x i32> @test_vqdmulhq_lane_s32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[B]], <2 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK-NEXT: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[LANE]])
-// CHECK-NEXT: [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VQDMULHQ_V2_I]]
//
int32x4_t test_vqdmulhq_lane_s32(int32x4_t a, int32x2_t b) {
@@ -12966,14 +11971,9 @@ int32x4_t test_vqdmulhq_lane_s32(int32x4_t a, int32x2_t b) {
// CHECK-LABEL: define <4 x i16> @test_vqdmulh_n_s16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]], i16 noundef signext [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[VQDMULH_V2_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[VECINIT3_I]])
-// CHECK-NEXT: [[VQDMULH_V3_I_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I_I]] to <8 x i8>
// CHECK-NEXT: ret <4 x i16> [[VQDMULH_V2_I_I]]
//
int16x4_t test_vqdmulh_n_s16(int16x4_t a, int16_t b) {
@@ -12983,12 +11983,9 @@ int16x4_t test_vqdmulh_n_s16(int16x4_t a, int16_t b) {
// CHECK-LABEL: define <2 x i32> @test_vqdmulh_n_s32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[VQDMULH_V2_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[VECINIT1_I]])
-// CHECK-NEXT: [[VQDMULH_V3_I_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I_I]] to <8 x i8>
// CHECK-NEXT: ret <2 x i32> [[VQDMULH_V2_I_I]]
//
int32x2_t test_vqdmulh_n_s32(int32x2_t a, int32_t b) {
@@ -12998,18 +11995,9 @@ int32x2_t test_vqdmulh_n_s32(int32x2_t a, int32_t b) {
// CHECK-LABEL: define <8 x i16> @test_vqdmulhq_n_s16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]], i16 noundef signext [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[B]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[B]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
-// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[B]], i32 4
-// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[B]], i32 5
-// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[B]], i32 6
-// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[B]], i32 7
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to <16 x i8>
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[B]], i64 0
+// CHECK-NEXT: [[VECINIT7_I:%.*]] = shufflevector <8 x i16> [[VECINIT_I]], <8 x i16> poison, <8 x i32> zeroinitializer
// CHECK-NEXT: [[VQDMULHQ_V2_I_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[VECINIT7_I]])
-// CHECK-NEXT: [[VQDMULHQ_V3_I_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I_I]] to <16 x i8>
// CHECK-NEXT: ret <8 x i16> [[VQDMULHQ_V2_I_I]]
//
int16x8_t test_vqdmulhq_n_s16(int16x8_t a, int16_t b) {
@@ -13019,14 +12007,9 @@ int16x8_t test_vqdmulhq_n_s16(int16x8_t a, int16_t b) {
// CHECK-LABEL: define <4 x i32> @test_vqdmulhq_n_s32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[B]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[B]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[B]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[B]], i32 3
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT3_I]] to <16 x i8>
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[B]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i32> [[VECINIT_I]], <4 x i32> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[VQDMULHQ_V2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[VECINIT3_I]])
-// CHECK-NEXT: [[VQDMULHQ_V3_I_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VQDMULHQ_V2_I_I]]
//
int32x4_t test_vqdmulhq_n_s32(int32x4_t a, int32_t b) {
@@ -13036,10 +12019,7 @@ int32x4_t test_vqdmulhq_n_s32(int32x4_t a, int32_t b) {
// CHECK-LABEL: define <4 x i32> @test_vqdmull_s16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[B]])
-// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]]
//
int32x4_t test_vqdmull_s16(int16x4_t a, int16x4_t b) {
@@ -13049,10 +12029,7 @@ int32x4_t test_vqdmull_s16(int16x4_t a, int16x4_t b) {
// CHECK-LABEL: define <2 x i64> @test_vqdmull_s32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[B]])
-// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]]
//
int64x2_t test_vqdmull_s32(int32x2_t a, int32x2_t b) {
@@ -13062,13 +12039,8 @@ int64x2_t test_vqdmull_s32(int32x2_t a, int32x2_t b) {
// CHECK-LABEL: define <4 x i32> @test_vqdmull_lane_s16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[B]], <4 x i16> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
-// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]]
//
int32x4_t test_vqdmull_lane_s16(int16x4_t a, int16x4_t b) {
@@ -13078,13 +12050,8 @@ int32x4_t test_vqdmull_lane_s16(int16x4_t a, int16x4_t b) {
// CHECK-LABEL: define <2 x i64> @test_vqdmull_lane_s32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[B]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
-// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]]
//
int64x2_t test_vqdmull_lane_s32(int32x2_t a, int32x2_t b) {
@@ -13094,14 +12061,9 @@ int64x2_t test_vqdmull_lane_s32(int32x2_t a, int32x2_t b) {
// CHECK-LABEL: define <4 x i32> @test_vqdmull_n_s16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]], i16 noundef signext [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[VQDMULL_V2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[VECINIT3_I]])
-// CHECK-NEXT: [[VQDMULL_V3_I_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I_I]]
//
int32x4_t test_vqdmull_n_s16(int16x4_t a, int16_t b) {
@@ -13111,12 +12073,9 @@ int32x4_t test_vqdmull_n_s16(int16x4_t a, int16_t b) {
// CHECK-LABEL: define <2 x i64> @test_vqdmull_n_s32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[VQDMULL_V2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[VECINIT1_I]])
-// CHECK-NEXT: [[VQDMULL_V3_I_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I_I]] to <16 x i8>
// CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I_I]]
//
int64x2_t test_vqdmull_n_s32(int32x2_t a, int32_t b) {
@@ -13126,7 +12085,6 @@ int64x2_t test_vqdmull_n_s32(int32x2_t a, int32_t b) {
// CHECK-LABEL: define <8 x i8> @test_vqmovn_s16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
// CHECK-NEXT: [[VQMOVN_V1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqmovns.v8i8(<8 x i16> [[A]])
// CHECK-NEXT: ret <8 x i8> [[VQMOVN_V1_I]]
//
@@ -13137,9 +12095,7 @@ int8x8_t test_vqmovn_s16(int16x8_t a) {
// CHECK-LABEL: define <4 x i16> @test_vqmovn_s32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
// CHECK-NEXT: [[VQMOVN_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqmovns.v4i16(<4 x i32> [[A]])
-// CHECK-NEXT: [[VQMOVN_V2_I:%.*]] = bitcast <4 x i16> [[VQMOVN_V1_I]] to <8 x i8>
// CHECK-NEXT: ret <4 x i16> [[VQMOVN_V1_I]]
//
int16x4_t test_vqmovn_s32(int32x4_t a) {
@@ -13149,9 +12105,7 @@ int16x4_t test_vqmovn_s32(int32x4_t a) {
// CHECK-LABEL: define <2 x i32> @test_vqmovn_s64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
// CHECK-NEXT: [[VQMOVN_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqmovns.v2i32(<2 x i64> [[A]])
-// CHECK-NEXT: [[VQMOVN_V2_I:%.*]] = bitcast <2 x i32> [[VQMOVN_V1_I]] to <8 x i8>
// CHECK-NEXT: ret <2 x i32> [[VQMOVN_V1_I]]
//
int32x2_t test_vqmovn_s64(int64x2_t a) {
@@ -13161,7 +12115,6 @@ int32x2_t test_vqmovn_s64(int64x2_t a) {
// CHECK-LABEL: define <8 x i8> @test_vqmovn_u16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
// CHECK-NEXT: [[VQMOVN_V1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqmovnu.v8i8(<8 x i16> [[A]])
// CHECK-NEXT: ret <8 x i8> [[VQMOVN_V1_I]]
//
@@ -13172,9 +12125,7 @@ uint8x8_t test_vqmovn_u16(uint16x8_t a) {
// CHECK-LABEL: define <4 x i16> @test_vqmovn_u32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
// CHECK-NEXT: [[VQMOVN_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqmovnu.v4i16(<4 x i32> [[A]])
-// CHECK-NEXT: [[VQMOVN_V2_I:%.*]] = bitcast <4 x i16> [[VQMOVN_V1_I]] to <8 x i8>
// CHECK-NEXT: ret <4 x i16> [[VQMOVN_V1_I]]
//
uint16x4_t test_vqmovn_u32(uint32x4_t a) {
@@ -13184,9 +12135,7 @@ uint16x4_t test_vqmovn_u32(uint32x4_t a) {
// CHECK-LABEL: define <2 x i32> @test_vqmovn_u64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
// CHECK-NEXT: [[VQMOVN_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqmovnu.v2i32(<2 x i64> [[A]])
-// CHECK-NEXT: [[VQMOVN_V2_I:%.*]] = bitcast <2 x i32> [[VQMOVN_V1_I]] to <8 x i8>
// CHECK-NEXT: ret <2 x i32> [[VQMOVN_V1_I]]
//
uint32x2_t test_vqmovn_u64(uint64x2_t a) {
@@ -13196,7 +12145,6 @@ uint32x2_t test_vqmovn_u64(uint64x2_t a) {
// CHECK-LABEL: define <8 x i8> @test_vqmovun_s16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
// CHECK-NEXT: [[VQMOVUN_V1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqmovnsu.v8i8(<8 x i16> [[A]])
// CHECK-NEXT: ret <8 x i8> [[VQMOVUN_V1_I]]
//
@@ -13207,9 +12155,7 @@ uint8x8_t test_vqmovun_s16(int16x8_t a) {
// CHECK-LABEL: define <4 x i16> @test_vqmovun_s32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
// CHECK-NEXT: [[VQMOVUN_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqmovnsu.v4i16(<4 x i32> [[A]])
-// CHECK-NEXT: [[VQMOVUN_V2_I:%.*]] = bitcast <4 x i16> [[VQMOVUN_V1_I]] to <8 x i8>
// CHECK-NEXT: ret <4 x i16> [[VQMOVUN_V1_I]]
//
uint16x4_t test_vqmovun_s32(int32x4_t a) {
@@ -13219,9 +12165,7 @@ uint16x4_t test_vqmovun_s32(int32x4_t a) {
// CHECK-LABEL: define <2 x i32> @test_vqmovun_s64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
// CHECK-NEXT: [[VQMOVUN_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqmovnsu.v2i32(<2 x i64> [[A]])
-// CHECK-NEXT: [[VQMOVUN_V2_I:%.*]] = bitcast <2 x i32> [[VQMOVUN_V1_I]] to <8 x i8>
// CHECK-NEXT: ret <2 x i32> [[VQMOVUN_V1_I]]
//
uint32x2_t test_vqmovun_s64(int64x2_t a) {
@@ -13241,9 +12185,7 @@ int8x8_t test_vqneg_s8(int8x8_t a) {
// CHECK-LABEL: define <4 x i16> @test_vqneg_s16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
// CHECK-NEXT: [[VQNEG_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqneg.v4i16(<4 x i16> [[A]])
-// CHECK-NEXT: [[VQNEG_V2_I:%.*]] = bitcast <4 x i16> [[VQNEG_V1_I]] to <8 x i8>
// CHECK-NEXT: ret <4 x i16> [[VQNEG_V1_I]]
//
int16x4_t test_vqneg_s16(int16x4_t a) {
@@ -13253,9 +12195,7 @@ int16x4_t test_vqneg_s16(int16x4_t a) {
// CHECK-LABEL: define <2 x i32> @test_vqneg_s32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
// CHECK-NEXT: [[VQNEG_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqneg.v2i32(<2 x i32> [[A]])
-// CHECK-NEXT: [[VQNEG_V2_I:%.*]] = bitcast <2 x i32> [[VQNEG_V1_I]] to <8 x i8>
// CHECK-NEXT: ret <2 x i32> [[VQNEG_V1_I]]
//
int32x2_t test_vqneg_s32(int32x2_t a) {
@@ -13275,9 +12215,7 @@ int8x16_t test_vqnegq_s8(int8x16_t a) {
// CHECK-LABEL: define <8 x i16> @test_vqnegq_s16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
// CHECK-NEXT: [[VQNEGQ_V1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqneg.v8i16(<8 x i16> [[A]])
-// CHECK-NEXT: [[VQNEGQ_V2_I:%.*]] = bitcast <8 x i16> [[VQNEGQ_V1_I]] to <16 x i8>
// CHECK-NEXT: ret <8 x i16> [[VQNEGQ_V1_I]]
//
int16x8_t test_vqnegq_s16(int16x8_t a) {
@@ -13287,9 +12225,7 @@ int16x8_t test_vqnegq_s16(int16x8_t a) {
// CHECK-LABEL: define <4 x i32> @test_vqnegq_s32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
// CHECK-NEXT: [[VQNEGQ_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqneg.v4i32(<4 x i32> [[A]])
-// CHECK-NEXT: [[VQNEGQ_V2_I:%.*]] = bitcast <4 x i32> [[VQNEGQ_V1_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VQNEGQ_V1_I]]
//
int32x4_t test_vqnegq_s32(int32x4_t a) {
@@ -13299,10 +12235,7 @@ int32x4_t test_vqnegq_s32(int32x4_t a) {
// CHECK-LABEL: define <4 x i16> @test_vqrdmulh_s16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
// CHECK-NEXT: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
-// CHECK-NEXT: [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <4 x i16> [[VQRDMULH_V2_I]]
//
int16x4_t test_vqrdmulh_s16(int16x4_t a, int16x4_t b) {
@@ -13312,10 +12245,7 @@ int16x4_t test_vqrdmulh_s16(int16x4_t a, int16x4_t b) {
// CHECK-LABEL: define <2 x i32> @test_vqrdmulh_s32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
// CHECK-NEXT: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
-// CHECK-NEXT: [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <2 x i32> [[VQRDMULH_V2_I]]
//
int32x2_t test_vqrdmulh_s32(int32x2_t a, int32x2_t b) {
@@ -13325,10 +12255,7 @@ int32x2_t test_vqrdmulh_s32(int32x2_t a, int32x2_t b) {
// CHECK-LABEL: define <8 x i16> @test_vqrdmulhq_s16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
// CHECK-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
-// CHECK-NEXT: [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <8 x i16> [[VQRDMULHQ_V2_I]]
//
int16x8_t test_vqrdmulhq_s16(int16x8_t a, int16x8_t b) {
@@ -13338,10 +12265,7 @@ int16x8_t test_vqrdmulhq_s16(int16x8_t a, int16x8_t b) {
// CHECK-LABEL: define <4 x i32> @test_vqrdmulhq_s32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
// CHECK-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
-// CHECK-NEXT: [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VQRDMULHQ_V2_I]]
//
int32x4_t test_vqrdmulhq_s32(int32x4_t a, int32x4_t b) {
@@ -13351,13 +12275,8 @@ int32x4_t test_vqrdmulhq_s32(int32x4_t a, int32x4_t b) {
// CHECK-LABEL: define <4 x i16> @test_vqrdmulh_lane_s16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[B]], <4 x i16> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[LANE]])
-// CHECK-NEXT: [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <4 x i16> [[VQRDMULH_V2_I]]
//
int16x4_t test_vqrdmulh_lane_s16(int16x4_t a, int16x4_t b) {
@@ -13367,13 +12286,8 @@ int16x4_t test_vqrdmulh_lane_s16(int16x4_t a, int16x4_t b) {
// CHECK-LABEL: define <2 x i32> @test_vqrdmulh_lane_s32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[B]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[LANE]])
-// CHECK-NEXT: [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <2 x i32> [[VQRDMULH_V2_I]]
//
int32x2_t test_vqrdmulh_lane_s32(int32x2_t a, int32x2_t b) {
@@ -13383,13 +12297,8 @@ int32x2_t test_vqrdmulh_lane_s32(int32x2_t a, int32x2_t b) {
// CHECK-LABEL: define <8 x i16> @test_vqrdmulhq_lane_s16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[LANE]] to <16 x i8>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[B]], <4 x i16> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[LANE]])
-// CHECK-NEXT: [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <8 x i16> [[VQRDMULHQ_V2_I]]
//
int16x8_t test_vqrdmulhq_lane_s16(int16x8_t a, int16x4_t b) {
@@ -13399,13 +12308,8 @@ int16x8_t test_vqrdmulhq_lane_s16(int16x8_t a, int16x4_t b) {
// CHECK-LABEL: define <4 x i32> @test_vqrdmulhq_lane_s32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[B]], <2 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[LANE]])
-// CHECK-NEXT: [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VQRDMULHQ_V2_I]]
//
int32x4_t test_vqrdmulhq_lane_s32(int32x4_t a, int32x2_t b) {
@@ -13415,14 +12319,9 @@ int32x4_t test_vqrdmulhq_lane_s32(int32x4_t a, int32x2_t b) {
// CHECK-LABEL: define <4 x i16> @test_vqrdmulh_n_s16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]], i16 noundef signext [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[VQRDMULH_V2_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[VECINIT3_I]])
-// CHECK-NEXT: [[VQRDMULH_V3_I_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I_I]] to <8 x i8>
// CHECK-NEXT: ret <4 x i16> [[VQRDMULH_V2_I_I]]
//
int16x4_t test_vqrdmulh_n_s16(int16x4_t a, int16_t b) {
@@ -13432,12 +12331,9 @@ int16x4_t test_vqrdmulh_n_s16(int16x4_t a, int16_t b) {
// CHECK-LABEL: define <2 x i32> @test_vqrdmulh_n_s32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[VQRDMULH_V2_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[VECINIT1_I]])
-// CHECK-NEXT: [[VQRDMULH_V3_I_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I_I]] to <8 x i8>
// CHECK-NEXT: ret <2 x i32> [[VQRDMULH_V2_I_I]]
//
int32x2_t test_vqrdmulh_n_s32(int32x2_t a, int32_t b) {
@@ -13447,18 +12343,9 @@ int32x2_t test_vqrdmulh_n_s32(int32x2_t a, int32_t b) {
// CHECK-LABEL: define <8 x i16> @test_vqrdmulhq_n_s16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]], i16 noundef signext [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[B]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[B]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
-// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[B]], i32 4
-// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[B]], i32 5
-// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[B]], i32 6
-// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[B]], i32 7
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to <16 x i8>
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[B]], i64 0
+// CHECK-NEXT: [[VECINIT7_I:%.*]] = shufflevector <8 x i16> [[VECINIT_I]], <8 x i16> poison, <8 x i32> zeroinitializer
// CHECK-NEXT: [[VQRDMULHQ_V2_I_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[VECINIT7_I]])
-// CHECK-NEXT: [[VQRDMULHQ_V3_I_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I_I]] to <16 x i8>
// CHECK-NEXT: ret <8 x i16> [[VQRDMULHQ_V2_I_I]]
//
int16x8_t test_vqrdmulhq_n_s16(int16x8_t a, int16_t b) {
@@ -13468,14 +12355,9 @@ int16x8_t test_vqrdmulhq_n_s16(int16x8_t a, int16_t b) {
// CHECK-LABEL: define <4 x i32> @test_vqrdmulhq_n_s32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[B]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[B]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[B]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[B]], i32 3
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT3_I]] to <16 x i8>
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[B]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i32> [[VECINIT_I]], <4 x i32> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[VQRDMULHQ_V2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[VECINIT3_I]])
-// CHECK-NEXT: [[VQRDMULHQ_V3_I_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VQRDMULHQ_V2_I_I]]
//
int32x4_t test_vqrdmulhq_n_s32(int32x4_t a, int32_t b) {
@@ -13495,10 +12377,7 @@ int8x8_t test_vqrshl_s8(int8x8_t a, int8x8_t b) {
// CHECK-LABEL: define <4 x i16> @test_vqrshl_s16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
// CHECK-NEXT: [[VQRSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrshifts.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
-// CHECK-NEXT: [[VQRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQRSHL_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <4 x i16> [[VQRSHL_V2_I]]
//
int16x4_t test_vqrshl_s16(int16x4_t a, int16x4_t b) {
@@ -13508,10 +12387,7 @@ int16x4_t test_vqrshl_s16(int16x4_t a, int16x4_t b) {
// CHECK-LABEL: define <2 x i32> @test_vqrshl_s32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
// CHECK-NEXT: [[VQRSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrshifts.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
-// CHECK-NEXT: [[VQRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQRSHL_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <2 x i32> [[VQRSHL_V2_I]]
//
int32x2_t test_vqrshl_s32(int32x2_t a, int32x2_t b) {
@@ -13521,10 +12397,7 @@ int32x2_t test_vqrshl_s32(int32x2_t a, int32x2_t b) {
// CHECK-LABEL: define <1 x i64> @test_vqrshl_s64(
// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
// CHECK-NEXT: [[VQRSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqrshifts.v1i64(<1 x i64> [[A]], <1 x i64> [[B]])
-// CHECK-NEXT: [[VQRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQRSHL_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <1 x i64> [[VQRSHL_V2_I]]
//
int64x1_t test_vqrshl_s64(int64x1_t a, int64x1_t b) {
@@ -13544,10 +12417,7 @@ uint8x8_t test_vqrshl_u8(uint8x8_t a, int8x8_t b) {
// CHECK-LABEL: define <4 x i16> @test_vqrshl_u16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
// CHECK-NEXT: [[VQRSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrshiftu.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
-// CHECK-NEXT: [[VQRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQRSHL_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <4 x i16> [[VQRSHL_V2_I]]
//
uint16x4_t test_vqrshl_u16(uint16x4_t a, int16x4_t b) {
@@ -13557,10 +12427,7 @@ uint16x4_t test_vqrshl_u16(uint16x4_t a, int16x4_t b) {
// CHECK-LABEL: define <2 x i32> @test_vqrshl_u32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
// CHECK-NEXT: [[VQRSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrshiftu.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
-// CHECK-NEXT: [[VQRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQRSHL_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <2 x i32> [[VQRSHL_V2_I]]
//
uint32x2_t test_vqrshl_u32(uint32x2_t a, int32x2_t b) {
@@ -13570,10 +12437,7 @@ uint32x2_t test_vqrshl_u32(uint32x2_t a, int32x2_t b) {
// CHECK-LABEL: define <1 x i64> @test_vqrshl_u64(
// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
// CHECK-NEXT: [[VQRSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqrshiftu.v1i64(<1 x i64> [[A]], <1 x i64> [[B]])
-// CHECK-NEXT: [[VQRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQRSHL_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <1 x i64> [[VQRSHL_V2_I]]
//
uint64x1_t test_vqrshl_u64(uint64x1_t a, int64x1_t b) {
@@ -13593,10 +12457,7 @@ int8x16_t test_vqrshlq_s8(int8x16_t a, int8x16_t b) {
// CHECK-LABEL: define <8 x i16> @test_vqrshlq_s16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
// CHECK-NEXT: [[VQRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrshifts.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
-// CHECK-NEXT: [[VQRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRSHLQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <8 x i16> [[VQRSHLQ_V2_I]]
//
int16x8_t test_vqrshlq_s16(int16x8_t a, int16x8_t b) {
@@ -13606,10 +12467,7 @@ int16x8_t test_vqrshlq_s16(int16x8_t a, int16x8_t b) {
// CHECK-LABEL: define <4 x i32> @test_vqrshlq_s32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
// CHECK-NEXT: [[VQRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrshifts.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
-// CHECK-NEXT: [[VQRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRSHLQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VQRSHLQ_V2_I]]
//
int32x4_t test_vqrshlq_s32(int32x4_t a, int32x4_t b) {
@@ -13619,10 +12477,7 @@ int32x4_t test_vqrshlq_s32(int32x4_t a, int32x4_t b) {
// CHECK-LABEL: define <2 x i64> @test_vqrshlq_s64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
// CHECK-NEXT: [[VQRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqrshifts.v2i64(<2 x i64> [[A]], <2 x i64> [[B]])
-// CHECK-NEXT: [[VQRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQRSHLQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <2 x i64> [[VQRSHLQ_V2_I]]
//
int64x2_t test_vqrshlq_s64(int64x2_t a, int64x2_t b) {
@@ -13642,10 +12497,7 @@ uint8x16_t test_vqrshlq_u8(uint8x16_t a, int8x16_t b) {
// CHECK-LABEL: define <8 x i16> @test_vqrshlq_u16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
// CHECK-NEXT: [[VQRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrshiftu.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
-// CHECK-NEXT: [[VQRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRSHLQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <8 x i16> [[VQRSHLQ_V2_I]]
//
uint16x8_t test_vqrshlq_u16(uint16x8_t a, int16x8_t b) {
@@ -13655,10 +12507,7 @@ uint16x8_t test_vqrshlq_u16(uint16x8_t a, int16x8_t b) {
// CHECK-LABEL: define <4 x i32> @test_vqrshlq_u32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
// CHECK-NEXT: [[VQRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrshiftu.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
-// CHECK-NEXT: [[VQRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRSHLQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VQRSHLQ_V2_I]]
//
uint32x4_t test_vqrshlq_u32(uint32x4_t a, int32x4_t b) {
@@ -13668,10 +12517,7 @@ uint32x4_t test_vqrshlq_u32(uint32x4_t a, int32x4_t b) {
// CHECK-LABEL: define <2 x i64> @test_vqrshlq_u64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
// CHECK-NEXT: [[VQRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqrshiftu.v2i64(<2 x i64> [[A]], <2 x i64> [[B]])
-// CHECK-NEXT: [[VQRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQRSHLQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <2 x i64> [[VQRSHLQ_V2_I]]
//
uint64x2_t test_vqrshlq_u64(uint64x2_t a, int64x2_t b) {
@@ -13681,9 +12527,7 @@ uint64x2_t test_vqrshlq_u64(uint64x2_t a, int64x2_t b) {
// CHECK-LABEL: define <8 x i8> @test_vqrshrn_n_s16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[VQRSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqrshiftns.v8i8(<8 x i16> [[VQRSHRN_N]], <8 x i16> splat (i16 -1))
+// CHECK-NEXT: [[VQRSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqrshiftns.v8i8(<8 x i16> [[A]], <8 x i16> splat (i16 -1))
// CHECK-NEXT: ret <8 x i8> [[VQRSHRN_N1]]
//
int8x8_t test_vqrshrn_n_s16(int16x8_t a) {
@@ -13693,9 +12537,7 @@ int8x8_t test_vqrshrn_n_s16(int16x8_t a) {
// CHECK-LABEL: define <4 x i16> @test_vqrshrn_n_s32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[VQRSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqrshiftns.v4i16(<4 x i32> [[VQRSHRN_N]], <4 x i32> splat (i32 -1))
+// CHECK-NEXT: [[VQRSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqrshiftns.v4i16(<4 x i32> [[A]], <4 x i32> splat (i32 -1))
// CHECK-NEXT: ret <4 x i16> [[VQRSHRN_N1]]
//
int16x4_t test_vqrshrn_n_s32(int32x4_t a) {
@@ -13705,9 +12547,7 @@ int16x4_t test_vqrshrn_n_s32(int32x4_t a) {
// CHECK-LABEL: define <2 x i32> @test_vqrshrn_n_s64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK-NEXT: [[VQRSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqrshiftns.v2i32(<2 x i64> [[VQRSHRN_N]], <2 x i64> splat (i64 -1))
+// CHECK-NEXT: [[VQRSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqrshiftns.v2i32(<2 x i64> [[A]], <2 x i64> splat (i64 -1))
// CHECK-NEXT: ret <2 x i32> [[VQRSHRN_N1]]
//
int32x2_t test_vqrshrn_n_s64(int64x2_t a) {
@@ -13717,9 +12557,7 @@ int32x2_t test_vqrshrn_n_s64(int64x2_t a) {
// CHECK-LABEL: define <8 x i8> @test_vqrshrn_n_u16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[VQRSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqrshiftnu.v8i8(<8 x i16> [[VQRSHRN_N]], <8 x i16> splat (i16 -1))
+// CHECK-NEXT: [[VQRSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqrshiftnu.v8i8(<8 x i16> [[A]], <8 x i16> splat (i16 -1))
// CHECK-NEXT: ret <8 x i8> [[VQRSHRN_N1]]
//
uint8x8_t test_vqrshrn_n_u16(uint16x8_t a) {
@@ -13729,9 +12567,7 @@ uint8x8_t test_vqrshrn_n_u16(uint16x8_t a) {
// CHECK-LABEL: define <4 x i16> @test_vqrshrn_n_u32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[VQRSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqrshiftnu.v4i16(<4 x i32> [[VQRSHRN_N]], <4 x i32> splat (i32 -1))
+// CHECK-NEXT: [[VQRSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqrshiftnu.v4i16(<4 x i32> [[A]], <4 x i32> splat (i32 -1))
// CHECK-NEXT: ret <4 x i16> [[VQRSHRN_N1]]
//
uint16x4_t test_vqrshrn_n_u32(uint32x4_t a) {
@@ -13741,9 +12577,7 @@ uint16x4_t test_vqrshrn_n_u32(uint32x4_t a) {
// CHECK-LABEL: define <2 x i32> @test_vqrshrn_n_u64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK-NEXT: [[VQRSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqrshiftnu.v2i32(<2 x i64> [[VQRSHRN_N]], <2 x i64> splat (i64 -1))
+// CHECK-NEXT: [[VQRSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqrshiftnu.v2i32(<2 x i64> [[A]], <2 x i64> splat (i64 -1))
// CHECK-NEXT: ret <2 x i32> [[VQRSHRN_N1]]
//
uint32x2_t test_vqrshrn_n_u64(uint64x2_t a) {
@@ -13753,9 +12587,7 @@ uint32x2_t test_vqrshrn_n_u64(uint64x2_t a) {
// CHECK-LABEL: define <8 x i8> @test_vqrshrun_n_s16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[VQRSHRUN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqrshiftnsu.v8i8(<8 x i16> [[VQRSHRUN_N]], <8 x i16> splat (i16 -1))
+// CHECK-NEXT: [[VQRSHRUN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqrshiftnsu.v8i8(<8 x i16> [[A]], <8 x i16> splat (i16 -1))
// CHECK-NEXT: ret <8 x i8> [[VQRSHRUN_N1]]
//
uint8x8_t test_vqrshrun_n_s16(int16x8_t a) {
@@ -13765,9 +12597,7 @@ uint8x8_t test_vqrshrun_n_s16(int16x8_t a) {
// CHECK-LABEL: define <4 x i16> @test_vqrshrun_n_s32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[VQRSHRUN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqrshiftnsu.v4i16(<4 x i32> [[VQRSHRUN_N]], <4 x i32> splat (i32 -1))
+// CHECK-NEXT: [[VQRSHRUN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqrshiftnsu.v4i16(<4 x i32> [[A]], <4 x i32> splat (i32 -1))
// CHECK-NEXT: ret <4 x i16> [[VQRSHRUN_N1]]
//
uint16x4_t test_vqrshrun_n_s32(int32x4_t a) {
@@ -13777,9 +12607,7 @@ uint16x4_t test_vqrshrun_n_s32(int32x4_t a) {
// CHECK-LABEL: define <2 x i32> @test_vqrshrun_n_s64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK-NEXT: [[VQRSHRUN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqrshiftnsu.v2i32(<2 x i64> [[VQRSHRUN_N]], <2 x i64> splat (i64 -1))
+// CHECK-NEXT: [[VQRSHRUN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqrshiftnsu.v2i32(<2 x i64> [[A]], <2 x i64> splat (i64 -1))
// CHECK-NEXT: ret <2 x i32> [[VQRSHRUN_N1]]
//
uint32x2_t test_vqrshrun_n_s64(int64x2_t a) {
@@ -13799,10 +12627,7 @@ int8x8_t test_vqshl_s8(int8x8_t a, int8x8_t b) {
// CHECK-LABEL: define <4 x i16> @test_vqshl_s16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
// CHECK-NEXT: [[VQSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqshifts.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
-// CHECK-NEXT: [[VQSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQSHL_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <4 x i16> [[VQSHL_V2_I]]
//
int16x4_t test_vqshl_s16(int16x4_t a, int16x4_t b) {
@@ -13812,10 +12637,7 @@ int16x4_t test_vqshl_s16(int16x4_t a, int16x4_t b) {
// CHECK-LABEL: define <2 x i32> @test_vqshl_s32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
// CHECK-NEXT: [[VQSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqshifts.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
-// CHECK-NEXT: [[VQSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQSHL_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <2 x i32> [[VQSHL_V2_I]]
//
int32x2_t test_vqshl_s32(int32x2_t a, int32x2_t b) {
@@ -13825,10 +12647,7 @@ int32x2_t test_vqshl_s32(int32x2_t a, int32x2_t b) {
// CHECK-LABEL: define <1 x i64> @test_vqshl_s64(
// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
// CHECK-NEXT: [[VQSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqshifts.v1i64(<1 x i64> [[A]], <1 x i64> [[B]])
-// CHECK-NEXT: [[VQSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQSHL_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <1 x i64> [[VQSHL_V2_I]]
//
int64x1_t test_vqshl_s64(int64x1_t a, int64x1_t b) {
@@ -13848,10 +12667,7 @@ uint8x8_t test_vqshl_u8(uint8x8_t a, int8x8_t b) {
// CHECK-LABEL: define <4 x i16> @test_vqshl_u16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
// CHECK-NEXT: [[VQSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftu.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
-// CHECK-NEXT: [[VQSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQSHL_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <4 x i16> [[VQSHL_V2_I]]
//
uint16x4_t test_vqshl_u16(uint16x4_t a, int16x4_t b) {
@@ -13861,10 +12677,7 @@ uint16x4_t test_vqshl_u16(uint16x4_t a, int16x4_t b) {
// CHECK-LABEL: define <2 x i32> @test_vqshl_u32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
// CHECK-NEXT: [[VQSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftu.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
-// CHECK-NEXT: [[VQSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQSHL_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <2 x i32> [[VQSHL_V2_I]]
//
uint32x2_t test_vqshl_u32(uint32x2_t a, int32x2_t b) {
@@ -13874,10 +12687,7 @@ uint32x2_t test_vqshl_u32(uint32x2_t a, int32x2_t b) {
// CHECK-LABEL: define <1 x i64> @test_vqshl_u64(
// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
// CHECK-NEXT: [[VQSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqshiftu.v1i64(<1 x i64> [[A]], <1 x i64> [[B]])
-// CHECK-NEXT: [[VQSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQSHL_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <1 x i64> [[VQSHL_V2_I]]
//
uint64x1_t test_vqshl_u64(uint64x1_t a, int64x1_t b) {
@@ -13897,10 +12707,7 @@ int8x16_t test_vqshlq_s8(int8x16_t a, int8x16_t b) {
// CHECK-LABEL: define <8 x i16> @test_vqshlq_s16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
// CHECK-NEXT: [[VQSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqshifts.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
-// CHECK-NEXT: [[VQSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSHLQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <8 x i16> [[VQSHLQ_V2_I]]
//
int16x8_t test_vqshlq_s16(int16x8_t a, int16x8_t b) {
@@ -13910,10 +12717,7 @@ int16x8_t test_vqshlq_s16(int16x8_t a, int16x8_t b) {
// CHECK-LABEL: define <4 x i32> @test_vqshlq_s32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
// CHECK-NEXT: [[VQSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqshifts.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
-// CHECK-NEXT: [[VQSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSHLQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VQSHLQ_V2_I]]
//
int32x4_t test_vqshlq_s32(int32x4_t a, int32x4_t b) {
@@ -13923,10 +12727,7 @@ int32x4_t test_vqshlq_s32(int32x4_t a, int32x4_t b) {
// CHECK-LABEL: define <2 x i64> @test_vqshlq_s64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
// CHECK-NEXT: [[VQSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqshifts.v2i64(<2 x i64> [[A]], <2 x i64> [[B]])
-// CHECK-NEXT: [[VQSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSHLQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <2 x i64> [[VQSHLQ_V2_I]]
//
int64x2_t test_vqshlq_s64(int64x2_t a, int64x2_t b) {
@@ -13946,10 +12747,7 @@ uint8x16_t test_vqshlq_u8(uint8x16_t a, int8x16_t b) {
// CHECK-LABEL: define <8 x i16> @test_vqshlq_u16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
// CHECK-NEXT: [[VQSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqshiftu.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
-// CHECK-NEXT: [[VQSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSHLQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <8 x i16> [[VQSHLQ_V2_I]]
//
uint16x8_t test_vqshlq_u16(uint16x8_t a, int16x8_t b) {
@@ -13959,10 +12757,7 @@ uint16x8_t test_vqshlq_u16(uint16x8_t a, int16x8_t b) {
// CHECK-LABEL: define <4 x i32> @test_vqshlq_u32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
// CHECK-NEXT: [[VQSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqshiftu.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
-// CHECK-NEXT: [[VQSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSHLQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VQSHLQ_V2_I]]
//
uint32x4_t test_vqshlq_u32(uint32x4_t a, int32x4_t b) {
@@ -13972,10 +12767,7 @@ uint32x4_t test_vqshlq_u32(uint32x4_t a, int32x4_t b) {
// CHECK-LABEL: define <2 x i64> @test_vqshlq_u64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
// CHECK-NEXT: [[VQSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqshiftu.v2i64(<2 x i64> [[A]], <2 x i64> [[B]])
-// CHECK-NEXT: [[VQSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSHLQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <2 x i64> [[VQSHLQ_V2_I]]
//
uint64x2_t test_vqshlq_u64(uint64x2_t a, int64x2_t b) {
@@ -13995,9 +12787,7 @@ uint8x8_t test_vqshlu_n_s8(int8x8_t a) {
// CHECK-LABEL: define <4 x i16> @test_vqshlu_n_s16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[VQSHLU_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[VQSHLU_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftsu.v4i16(<4 x i16> [[VQSHLU_N]], <4 x i16> splat (i16 1))
+// CHECK-NEXT: [[VQSHLU_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftsu.v4i16(<4 x i16> [[A]], <4 x i16> splat (i16 1))
// CHECK-NEXT: ret <4 x i16> [[VQSHLU_N1]]
//
uint16x4_t test_vqshlu_n_s16(int16x4_t a) {
@@ -14007,9 +12797,7 @@ uint16x4_t test_vqshlu_n_s16(int16x4_t a) {
// CHECK-LABEL: define <2 x i32> @test_vqshlu_n_s32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT: [[VQSHLU_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[VQSHLU_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftsu.v2i32(<2 x i32> [[VQSHLU_N]], <2 x i32> splat (i32 1))
+// CHECK-NEXT: [[VQSHLU_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftsu.v2i32(<2 x i32> [[A]], <2 x i32> splat (i32 1))
// CHECK-NEXT: ret <2 x i32> [[VQSHLU_N1]]
//
uint32x2_t test_vqshlu_n_s32(int32x2_t a) {
@@ -14019,9 +12807,7 @@ uint32x2_t test_vqshlu_n_s32(int32x2_t a) {
// CHECK-LABEL: define <1 x i64> @test_vqshlu_n_s64(
// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-// CHECK-NEXT: [[VQSHLU_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK-NEXT: [[VQSHLU_N1:%.*]] = call <1 x i64> @llvm.arm.neon.vqshiftsu.v1i64(<1 x i64> [[VQSHLU_N]], <1 x i64> splat (i64 1))
+// CHECK-NEXT: [[VQSHLU_N1:%.*]] = call <1 x i64> @llvm.arm.neon.vqshiftsu.v1i64(<1 x i64> [[A]], <1 x i64> splat (i64 1))
// CHECK-NEXT: ret <1 x i64> [[VQSHLU_N1]]
//
uint64x1_t test_vqshlu_n_s64(int64x1_t a) {
@@ -14041,9 +12827,7 @@ uint8x16_t test_vqshluq_n_s8(int8x16_t a) {
// CHECK-LABEL: define <8 x i16> @test_vqshluq_n_s16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VQSHLU_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[VQSHLU_N1:%.*]] = call <8 x i16> @llvm.arm.neon.vqshiftsu.v8i16(<8 x i16> [[VQSHLU_N]], <8 x i16> splat (i16 1))
+// CHECK-NEXT: [[VQSHLU_N1:%.*]] = call <8 x i16> @llvm.arm.neon.vqshiftsu.v8i16(<8 x i16> [[A]], <8 x i16> splat (i16 1))
// CHECK-NEXT: ret <8 x i16> [[VQSHLU_N1]]
//
uint16x8_t test_vqshluq_n_s16(int16x8_t a) {
@@ -14053,9 +12837,7 @@ uint16x8_t test_vqshluq_n_s16(int16x8_t a) {
// CHECK-LABEL: define <4 x i32> @test_vqshluq_n_s32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VQSHLU_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[VQSHLU_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vqshiftsu.v4i32(<4 x i32> [[VQSHLU_N]], <4 x i32> splat (i32 1))
+// CHECK-NEXT: [[VQSHLU_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vqshiftsu.v4i32(<4 x i32> [[A]], <4 x i32> splat (i32 1))
// CHECK-NEXT: ret <4 x i32> [[VQSHLU_N1]]
//
uint32x4_t test_vqshluq_n_s32(int32x4_t a) {
@@ -14065,9 +12847,7 @@ uint32x4_t test_vqshluq_n_s32(int32x4_t a) {
// CHECK-LABEL: define <2 x i64> @test_vqshluq_n_s64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VQSHLU_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK-NEXT: [[VQSHLU_N1:%.*]] = call <2 x i64> @llvm.arm.neon.vqshiftsu.v2i64(<2 x i64> [[VQSHLU_N]], <2 x i64> splat (i64 1))
+// CHECK-NEXT: [[VQSHLU_N1:%.*]] = call <2 x i64> @llvm.arm.neon.vqshiftsu.v2i64(<2 x i64> [[A]], <2 x i64> splat (i64 1))
// CHECK-NEXT: ret <2 x i64> [[VQSHLU_N1]]
//
uint64x2_t test_vqshluq_n_s64(int64x2_t a) {
@@ -14087,9 +12867,7 @@ int8x8_t test_vqshl_n_s8(int8x8_t a) {
// CHECK-LABEL: define <4 x i16> @test_vqshl_n_s16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshifts.v4i16(<4 x i16> [[VQSHL_N]], <4 x i16> splat (i16 1))
+// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshifts.v4i16(<4 x i16> [[A]], <4 x i16> splat (i16 1))
// CHECK-NEXT: ret <4 x i16> [[VQSHL_N1]]
//
int16x4_t test_vqshl_n_s16(int16x4_t a) {
@@ -14099,9 +12877,7 @@ int16x4_t test_vqshl_n_s16(int16x4_t a) {
// CHECK-LABEL: define <2 x i32> @test_vqshl_n_s32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshifts.v2i32(<2 x i32> [[VQSHL_N]], <2 x i32> splat (i32 1))
+// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshifts.v2i32(<2 x i32> [[A]], <2 x i32> splat (i32 1))
// CHECK-NEXT: ret <2 x i32> [[VQSHL_N1]]
//
int32x2_t test_vqshl_n_s32(int32x2_t a) {
@@ -14111,9 +12887,7 @@ int32x2_t test_vqshl_n_s32(int32x2_t a) {
// CHECK-LABEL: define <1 x i64> @test_vqshl_n_s64(
// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-// CHECK-NEXT: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <1 x i64> @llvm.arm.neon.vqshifts.v1i64(<1 x i64> [[VQSHL_N]], <1 x i64> splat (i64 1))
+// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <1 x i64> @llvm.arm.neon.vqshifts.v1i64(<1 x i64> [[A]], <1 x i64> splat (i64 1))
// CHECK-NEXT: ret <1 x i64> [[VQSHL_N1]]
//
int64x1_t test_vqshl_n_s64(int64x1_t a) {
@@ -14133,9 +12907,7 @@ uint8x8_t test_vqshl_n_u8(uint8x8_t a) {
// CHECK-LABEL: define <4 x i16> @test_vqshl_n_u16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftu.v4i16(<4 x i16> [[VQSHL_N]], <4 x i16> splat (i16 1))
+// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftu.v4i16(<4 x i16> [[A]], <4 x i16> splat (i16 1))
// CHECK-NEXT: ret <4 x i16> [[VQSHL_N1]]
//
uint16x4_t test_vqshl_n_u16(uint16x4_t a) {
@@ -14145,9 +12917,7 @@ uint16x4_t test_vqshl_n_u16(uint16x4_t a) {
// CHECK-LABEL: define <2 x i32> @test_vqshl_n_u32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftu.v2i32(<2 x i32> [[VQSHL_N]], <2 x i32> splat (i32 1))
+// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftu.v2i32(<2 x i32> [[A]], <2 x i32> splat (i32 1))
// CHECK-NEXT: ret <2 x i32> [[VQSHL_N1]]
//
uint32x2_t test_vqshl_n_u32(uint32x2_t a) {
@@ -14157,9 +12927,7 @@ uint32x2_t test_vqshl_n_u32(uint32x2_t a) {
// CHECK-LABEL: define <1 x i64> @test_vqshl_n_u64(
// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-// CHECK-NEXT: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <1 x i64> @llvm.arm.neon.vqshiftu.v1i64(<1 x i64> [[VQSHL_N]], <1 x i64> splat (i64 1))
+// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <1 x i64> @llvm.arm.neon.vqshiftu.v1i64(<1 x i64> [[A]], <1 x i64> splat (i64 1))
// CHECK-NEXT: ret <1 x i64> [[VQSHL_N1]]
//
uint64x1_t test_vqshl_n_u64(uint64x1_t a) {
@@ -14179,9 +12947,7 @@ int8x16_t test_vqshlq_n_s8(int8x16_t a) {
// CHECK-LABEL: define <8 x i16> @test_vqshlq_n_s16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <8 x i16> @llvm.arm.neon.vqshifts.v8i16(<8 x i16> [[VQSHL_N]], <8 x i16> splat (i16 1))
+// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <8 x i16> @llvm.arm.neon.vqshifts.v8i16(<8 x i16> [[A]], <8 x i16> splat (i16 1))
// CHECK-NEXT: ret <8 x i16> [[VQSHL_N1]]
//
int16x8_t test_vqshlq_n_s16(int16x8_t a) {
@@ -14191,9 +12957,7 @@ int16x8_t test_vqshlq_n_s16(int16x8_t a) {
// CHECK-LABEL: define <4 x i32> @test_vqshlq_n_s32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vqshifts.v4i32(<4 x i32> [[VQSHL_N]], <4 x i32> splat (i32 1))
+// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vqshifts.v4i32(<4 x i32> [[A]], <4 x i32> splat (i32 1))
// CHECK-NEXT: ret <4 x i32> [[VQSHL_N1]]
//
int32x4_t test_vqshlq_n_s32(int32x4_t a) {
@@ -14203,9 +12967,7 @@ int32x4_t test_vqshlq_n_s32(int32x4_t a) {
// CHECK-LABEL: define <2 x i64> @test_vqshlq_n_s64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <2 x i64> @llvm.arm.neon.vqshifts.v2i64(<2 x i64> [[VQSHL_N]], <2 x i64> splat (i64 1))
+// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <2 x i64> @llvm.arm.neon.vqshifts.v2i64(<2 x i64> [[A]], <2 x i64> splat (i64 1))
// CHECK-NEXT: ret <2 x i64> [[VQSHL_N1]]
//
int64x2_t test_vqshlq_n_s64(int64x2_t a) {
@@ -14225,9 +12987,7 @@ uint8x16_t test_vqshlq_n_u8(uint8x16_t a) {
// CHECK-LABEL: define <8 x i16> @test_vqshlq_n_u16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <8 x i16> @llvm.arm.neon.vqshiftu.v8i16(<8 x i16> [[VQSHL_N]], <8 x i16> splat (i16 1))
+// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <8 x i16> @llvm.arm.neon.vqshiftu.v8i16(<8 x i16> [[A]], <8 x i16> splat (i16 1))
// CHECK-NEXT: ret <8 x i16> [[VQSHL_N1]]
//
uint16x8_t test_vqshlq_n_u16(uint16x8_t a) {
@@ -14237,9 +12997,7 @@ uint16x8_t test_vqshlq_n_u16(uint16x8_t a) {
// CHECK-LABEL: define <4 x i32> @test_vqshlq_n_u32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vqshiftu.v4i32(<4 x i32> [[VQSHL_N]], <4 x i32> splat (i32 1))
+// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vqshiftu.v4i32(<4 x i32> [[A]], <4 x i32> splat (i32 1))
// CHECK-NEXT: ret <4 x i32> [[VQSHL_N1]]
//
uint32x4_t test_vqshlq_n_u32(uint32x4_t a) {
@@ -14249,9 +13007,7 @@ uint32x4_t test_vqshlq_n_u32(uint32x4_t a) {
// CHECK-LABEL: define <2 x i64> @test_vqshlq_n_u64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <2 x i64> @llvm.arm.neon.vqshiftu.v2i64(<2 x i64> [[VQSHL_N]], <2 x i64> splat (i64 1))
+// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <2 x i64> @llvm.arm.neon.vqshiftu.v2i64(<2 x i64> [[A]], <2 x i64> splat (i64 1))
// CHECK-NEXT: ret <2 x i64> [[VQSHL_N1]]
//
uint64x2_t test_vqshlq_n_u64(uint64x2_t a) {
@@ -14261,9 +13017,7 @@ uint64x2_t test_vqshlq_n_u64(uint64x2_t a) {
// CHECK-LABEL: define <8 x i8> @test_vqshrn_n_s16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[VQSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqshiftns.v8i8(<8 x i16> [[VQSHRN_N]], <8 x i16> splat (i16 -1))
+// CHECK-NEXT: [[VQSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqshiftns.v8i8(<8 x i16> [[A]], <8 x i16> splat (i16 -1))
// CHECK-NEXT: ret <8 x i8> [[VQSHRN_N1]]
//
int8x8_t test_vqshrn_n_s16(int16x8_t a) {
@@ -14273,9 +13027,7 @@ int8x8_t test_vqshrn_n_s16(int16x8_t a) {
// CHECK-LABEL: define <4 x i16> @test_vqshrn_n_s32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[VQSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftns.v4i16(<4 x i32> [[VQSHRN_N]], <4 x i32> splat (i32 -1))
+// CHECK-NEXT: [[VQSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftns.v4i16(<4 x i32> [[A]], <4 x i32> splat (i32 -1))
// CHECK-NEXT: ret <4 x i16> [[VQSHRN_N1]]
//
int16x4_t test_vqshrn_n_s32(int32x4_t a) {
@@ -14285,9 +13037,7 @@ int16x4_t test_vqshrn_n_s32(int32x4_t a) {
// CHECK-LABEL: define <2 x i32> @test_vqshrn_n_s64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK-NEXT: [[VQSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftns.v2i32(<2 x i64> [[VQSHRN_N]], <2 x i64> splat (i64 -1))
+// CHECK-NEXT: [[VQSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftns.v2i32(<2 x i64> [[A]], <2 x i64> splat (i64 -1))
// CHECK-NEXT: ret <2 x i32> [[VQSHRN_N1]]
//
int32x2_t test_vqshrn_n_s64(int64x2_t a) {
@@ -14297,9 +13047,7 @@ int32x2_t test_vqshrn_n_s64(int64x2_t a) {
// CHECK-LABEL: define <8 x i8> @test_vqshrn_n_u16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[VQSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqshiftnu.v8i8(<8 x i16> [[VQSHRN_N]], <8 x i16> splat (i16 -1))
+// CHECK-NEXT: [[VQSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqshiftnu.v8i8(<8 x i16> [[A]], <8 x i16> splat (i16 -1))
// CHECK-NEXT: ret <8 x i8> [[VQSHRN_N1]]
//
uint8x8_t test_vqshrn_n_u16(uint16x8_t a) {
@@ -14309,9 +13057,7 @@ uint8x8_t test_vqshrn_n_u16(uint16x8_t a) {
// CHECK-LABEL: define <4 x i16> @test_vqshrn_n_u32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[VQSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftnu.v4i16(<4 x i32> [[VQSHRN_N]], <4 x i32> splat (i32 -1))
+// CHECK-NEXT: [[VQSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftnu.v4i16(<4 x i32> [[A]], <4 x i32> splat (i32 -1))
// CHECK-NEXT: ret <4 x i16> [[VQSHRN_N1]]
//
uint16x4_t test_vqshrn_n_u32(uint32x4_t a) {
@@ -14321,9 +13067,7 @@ uint16x4_t test_vqshrn_n_u32(uint32x4_t a) {
// CHECK-LABEL: define <2 x i32> @test_vqshrn_n_u64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK-NEXT: [[VQSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftnu.v2i32(<2 x i64> [[VQSHRN_N]], <2 x i64> splat (i64 -1))
+// CHECK-NEXT: [[VQSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftnu.v2i32(<2 x i64> [[A]], <2 x i64> splat (i64 -1))
// CHECK-NEXT: ret <2 x i32> [[VQSHRN_N1]]
//
uint32x2_t test_vqshrn_n_u64(uint64x2_t a) {
@@ -14333,9 +13077,7 @@ uint32x2_t test_vqshrn_n_u64(uint64x2_t a) {
// CHECK-LABEL: define <8 x i8> @test_vqshrun_n_s16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[VQSHRUN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqshiftnsu.v8i8(<8 x i16> [[VQSHRUN_N]], <8 x i16> splat (i16 -1))
+// CHECK-NEXT: [[VQSHRUN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqshiftnsu.v8i8(<8 x i16> [[A]], <8 x i16> splat (i16 -1))
// CHECK-NEXT: ret <8 x i8> [[VQSHRUN_N1]]
//
uint8x8_t test_vqshrun_n_s16(int16x8_t a) {
@@ -14345,9 +13087,7 @@ uint8x8_t test_vqshrun_n_s16(int16x8_t a) {
// CHECK-LABEL: define <4 x i16> @test_vqshrun_n_s32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[VQSHRUN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftnsu.v4i16(<4 x i32> [[VQSHRUN_N]], <4 x i32> splat (i32 -1))
+// CHECK-NEXT: [[VQSHRUN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftnsu.v4i16(<4 x i32> [[A]], <4 x i32> splat (i32 -1))
// CHECK-NEXT: ret <4 x i16> [[VQSHRUN_N1]]
//
uint16x4_t test_vqshrun_n_s32(int32x4_t a) {
@@ -14357,9 +13097,7 @@ uint16x4_t test_vqshrun_n_s32(int32x4_t a) {
// CHECK-LABEL: define <2 x i32> @test_vqshrun_n_s64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK-NEXT: [[VQSHRUN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftnsu.v2i32(<2 x i64> [[VQSHRUN_N]], <2 x i64> splat (i64 -1))
+// CHECK-NEXT: [[VQSHRUN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftnsu.v2i32(<2 x i64> [[A]], <2 x i64> splat (i64 -1))
// CHECK-NEXT: ret <2 x i32> [[VQSHRUN_N1]]
//
uint32x2_t test_vqshrun_n_s64(int64x2_t a) {
@@ -14379,10 +13117,7 @@ int8x8_t test_vqsub_s8(int8x8_t a, int8x8_t b) {
// CHECK-LABEL: define <4 x i16> @test_vqsub_s16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
// CHECK-NEXT: [[VQSUB_V2_I:%.*]] = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
-// CHECK-NEXT: [[VQSUB_V3_I:%.*]] = bitcast <4 x i16> [[VQSUB_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <4 x i16> [[VQSUB_V2_I]]
//
int16x4_t test_vqsub_s16(int16x4_t a, int16x4_t b) {
@@ -14392,10 +13127,7 @@ int16x4_t test_vqsub_s16(int16x4_t a, int16x4_t b) {
// CHECK-LABEL: define <2 x i32> @test_vqsub_s32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
// CHECK-NEXT: [[VQSUB_V2_I:%.*]] = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
-// CHECK-NEXT: [[VQSUB_V3_I:%.*]] = bitcast <2 x i32> [[VQSUB_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <2 x i32> [[VQSUB_V2_I]]
//
int32x2_t test_vqsub_s32(int32x2_t a, int32x2_t b) {
@@ -14405,10 +13137,7 @@ int32x2_t test_vqsub_s32(int32x2_t a, int32x2_t b) {
// CHECK-LABEL: define <1 x i64> @test_vqsub_s64(
// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
// CHECK-NEXT: [[VQSUB_V2_I:%.*]] = call <1 x i64> @llvm.ssub.sat.v1i64(<1 x i64> [[A]], <1 x i64> [[B]])
-// CHECK-NEXT: [[VQSUB_V3_I:%.*]] = bitcast <1 x i64> [[VQSUB_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <1 x i64> [[VQSUB_V2_I]]
//
int64x1_t test_vqsub_s64(int64x1_t a, int64x1_t b) {
@@ -14428,10 +13157,7 @@ uint8x8_t test_vqsub_u8(uint8x8_t a, uint8x8_t b) {
// CHECK-LABEL: define <4 x i16> @test_vqsub_u16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
// CHECK-NEXT: [[VQSUB_V2_I:%.*]] = call <4 x i16> @llvm.usub.sat.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
-// CHECK-NEXT: [[VQSUB_V3_I:%.*]] = bitcast <4 x i16> [[VQSUB_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <4 x i16> [[VQSUB_V2_I]]
//
uint16x4_t test_vqsub_u16(uint16x4_t a, uint16x4_t b) {
@@ -14441,10 +13167,7 @@ uint16x4_t test_vqsub_u16(uint16x4_t a, uint16x4_t b) {
// CHECK-LABEL: define <2 x i32> @test_vqsub_u32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
// CHECK-NEXT: [[VQSUB_V2_I:%.*]] = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
-// CHECK-NEXT: [[VQSUB_V3_I:%.*]] = bitcast <2 x i32> [[VQSUB_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <2 x i32> [[VQSUB_V2_I]]
//
uint32x2_t test_vqsub_u32(uint32x2_t a, uint32x2_t b) {
@@ -14454,10 +13177,7 @@ uint32x2_t test_vqsub_u32(uint32x2_t a, uint32x2_t b) {
// CHECK-LABEL: define <1 x i64> @test_vqsub_u64(
// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
// CHECK-NEXT: [[VQSUB_V2_I:%.*]] = call <1 x i64> @llvm.usub.sat.v1i64(<1 x i64> [[A]], <1 x i64> [[B]])
-// CHECK-NEXT: [[VQSUB_V3_I:%.*]] = bitcast <1 x i64> [[VQSUB_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <1 x i64> [[VQSUB_V2_I]]
//
uint64x1_t test_vqsub_u64(uint64x1_t a, uint64x1_t b) {
@@ -14477,10 +13197,7 @@ int8x16_t test_vqsubq_s8(int8x16_t a, int8x16_t b) {
// CHECK-LABEL: define <8 x i16> @test_vqsubq_s16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
// CHECK-NEXT: [[VQSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
-// CHECK-NEXT: [[VQSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSUBQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <8 x i16> [[VQSUBQ_V2_I]]
//
int16x8_t test_vqsubq_s16(int16x8_t a, int16x8_t b) {
@@ -14490,10 +13207,7 @@ int16x8_t test_vqsubq_s16(int16x8_t a, int16x8_t b) {
// CHECK-LABEL: define <4 x i32> @test_vqsubq_s32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
// CHECK-NEXT: [[VQSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
-// CHECK-NEXT: [[VQSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSUBQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VQSUBQ_V2_I]]
//
int32x4_t test_vqsubq_s32(int32x4_t a, int32x4_t b) {
@@ -14503,10 +13217,7 @@ int32x4_t test_vqsubq_s32(int32x4_t a, int32x4_t b) {
// CHECK-LABEL: define <2 x i64> @test_vqsubq_s64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
// CHECK-NEXT: [[VQSUBQ_V2_I:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[A]], <2 x i64> [[B]])
-// CHECK-NEXT: [[VQSUBQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSUBQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <2 x i64> [[VQSUBQ_V2_I]]
//
int64x2_t test_vqsubq_s64(int64x2_t a, int64x2_t b) {
@@ -14526,10 +13237,7 @@ uint8x16_t test_vqsubq_u8(uint8x16_t a, uint8x16_t b) {
// CHECK-LABEL: define <8 x i16> @test_vqsubq_u16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
// CHECK-NEXT: [[VQSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
-// CHECK-NEXT: [[VQSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSUBQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <8 x i16> [[VQSUBQ_V2_I]]
//
uint16x8_t test_vqsubq_u16(uint16x8_t a, uint16x8_t b) {
@@ -14539,10 +13247,7 @@ uint16x8_t test_vqsubq_u16(uint16x8_t a, uint16x8_t b) {
// CHECK-LABEL: define <4 x i32> @test_vqsubq_u32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
// CHECK-NEXT: [[VQSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
-// CHECK-NEXT: [[VQSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSUBQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VQSUBQ_V2_I]]
//
uint32x4_t test_vqsubq_u32(uint32x4_t a, uint32x4_t b) {
@@ -14552,10 +13257,7 @@ uint32x4_t test_vqsubq_u32(uint32x4_t a, uint32x4_t b) {
// CHECK-LABEL: define <2 x i64> @test_vqsubq_u64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
// CHECK-NEXT: [[VQSUBQ_V2_I:%.*]] = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> [[A]], <2 x i64> [[B]])
-// CHECK-NEXT: [[VQSUBQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSUBQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <2 x i64> [[VQSUBQ_V2_I]]
//
uint64x2_t test_vqsubq_u64(uint64x2_t a, uint64x2_t b) {
@@ -14565,8 +13267,6 @@ uint64x2_t test_vqsubq_u64(uint64x2_t a, uint64x2_t b) {
// CHECK-LABEL: define <8 x i8> @test_vraddhn_s16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
// CHECK-NEXT: [[VRADDHN_V2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vraddhn.v8i8(<8 x i16> [[A]], <8 x i16> [[B]])
// CHECK-NEXT: ret <8 x i8> [[VRADDHN_V2_I]]
//
@@ -14577,10 +13277,7 @@ int8x8_t test_vraddhn_s16(int16x8_t a, int16x8_t b) {
// CHECK-LABEL: define <4 x i16> @test_vraddhn_s32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
// CHECK-NEXT: [[VRADDHN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vraddhn.v4i16(<4 x i32> [[A]], <4 x i32> [[B]])
-// CHECK-NEXT: [[VRADDHN_V3_I:%.*]] = bitcast <4 x i16> [[VRADDHN_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <4 x i16> [[VRADDHN_V2_I]]
//
int16x4_t test_vraddhn_s32(int32x4_t a, int32x4_t b) {
@@ -14590,10 +13287,7 @@ int16x4_t test_vraddhn_s32(int32x4_t a, int32x4_t b) {
// CHECK-LABEL: define <2 x i32> @test_vraddhn_s64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
// CHECK-NEXT: [[VRADDHN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vraddhn.v2i32(<2 x i64> [[A]], <2 x i64> [[B]])
-// CHECK-NEXT: [[VRADDHN_V3_I:%.*]] = bitcast <2 x i32> [[VRADDHN_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <2 x i32> [[VRADDHN_V2_I]]
//
int32x2_t test_vraddhn_s64(int64x2_t a, int64x2_t b) {
@@ -14603,8 +13297,6 @@ int32x2_t test_vraddhn_s64(int64x2_t a, int64x2_t b) {
// CHECK-LABEL: define <8 x i8> @test_vraddhn_u16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
// CHECK-NEXT: [[VRADDHN_V2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vraddhn.v8i8(<8 x i16> [[A]], <8 x i16> [[B]])
// CHECK-NEXT: ret <8 x i8> [[VRADDHN_V2_I]]
//
@@ -14615,10 +13307,7 @@ uint8x8_t test_vraddhn_u16(uint16x8_t a, uint16x8_t b) {
// CHECK-LABEL: define <4 x i16> @test_vraddhn_u32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
// CHECK-NEXT: [[VRADDHN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vraddhn.v4i16(<4 x i32> [[A]], <4 x i32> [[B]])
-// CHECK-NEXT: [[VRADDHN_V3_I:%.*]] = bitcast <4 x i16> [[VRADDHN_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <4 x i16> [[VRADDHN_V2_I]]
//
uint16x4_t test_vraddhn_u32(uint32x4_t a, uint32x4_t b) {
@@ -14628,10 +13317,7 @@ uint16x4_t test_vraddhn_u32(uint32x4_t a, uint32x4_t b) {
// CHECK-LABEL: define <2 x i32> @test_vraddhn_u64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
// CHECK-NEXT: [[VRADDHN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vraddhn.v2i32(<2 x i64> [[A]], <2 x i64> [[B]])
-// CHECK-NEXT: [[VRADDHN_V3_I:%.*]] = bitcast <2 x i32> [[VRADDHN_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <2 x i32> [[VRADDHN_V2_I]]
//
uint32x2_t test_vraddhn_u64(uint64x2_t a, uint64x2_t b) {
@@ -14641,7 +13327,6 @@ uint32x2_t test_vraddhn_u64(uint64x2_t a, uint64x2_t b) {
// CHECK-LABEL: define <2 x float> @test_vrecpe_f32(
// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
// CHECK-NEXT: [[VRECPE_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrecpe.v2f32(<2 x float> [[A]])
// CHECK-NEXT: ret <2 x float> [[VRECPE_V1_I]]
//
@@ -14652,7 +13337,6 @@ float32x2_t test_vrecpe_f32(float32x2_t a) {
// CHECK-LABEL: define <2 x i32> @test_vrecpe_u32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
// CHECK-NEXT: [[VRECPE_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrecpe.v2i32(<2 x i32> [[A]])
// CHECK-NEXT: ret <2 x i32> [[VRECPE_V1_I]]
//
@@ -14663,7 +13347,6 @@ uint32x2_t test_vrecpe_u32(uint32x2_t a) {
// CHECK-LABEL: define <4 x float> @test_vrecpeq_f32(
// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
// CHECK-NEXT: [[VRECPEQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float> [[A]])
// CHECK-NEXT: ret <4 x float> [[VRECPEQ_V1_I]]
//
@@ -14674,7 +13357,6 @@ float32x4_t test_vrecpeq_f32(float32x4_t a) {
// CHECK-LABEL: define <4 x i32> @test_vrecpeq_u32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
// CHECK-NEXT: [[VRECPEQ_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrecpe.v4i32(<4 x i32> [[A]])
// CHECK-NEXT: ret <4 x i32> [[VRECPEQ_V1_I]]
//
@@ -14685,10 +13367,7 @@ uint32x4_t test_vrecpeq_u32(uint32x4_t a) {
// CHECK-LABEL: define <2 x float> @test_vrecps_f32(
// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <8 x i8>
// CHECK-NEXT: [[VRECPS_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vrecps.v2f32(<2 x float> [[A]], <2 x float> [[B]])
-// CHECK-NEXT: [[VRECPS_V3_I:%.*]] = bitcast <2 x float> [[VRECPS_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <2 x float> [[VRECPS_V2_I]]
//
float32x2_t test_vrecps_f32(float32x2_t a, float32x2_t b) {
@@ -14698,10 +13377,7 @@ float32x2_t test_vrecps_f32(float32x2_t a, float32x2_t b) {
// CHECK-LABEL: define <4 x float> @test_vrecpsq_f32(
// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B]] to <16 x i8>
// CHECK-NEXT: [[VRECPSQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float> [[A]], <4 x float> [[B]])
-// CHECK-NEXT: [[VRECPSQ_V3_I:%.*]] = bitcast <4 x float> [[VRECPSQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x float> [[VRECPSQ_V2_I]]
//
float32x4_t test_vrecpsq_f32(float32x4_t a, float32x4_t b) {
@@ -17319,7 +15995,7 @@ poly16x8_t test_vreinterpretq_p16_p8(poly8x16_t a) {
// CHECK-LABEL: define <8 x i8> @test_vrev16_s8(
// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[A]], <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> poison, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
//
int8x8_t test_vrev16_s8(int8x8_t a) {
@@ -17329,7 +16005,7 @@ int8x8_t test_vrev16_s8(int8x8_t a) {
// CHECK-LABEL: define <8 x i8> @test_vrev16_u8(
// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[A]], <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> poison, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
//
uint8x8_t test_vrev16_u8(uint8x8_t a) {
@@ -17339,7 +16015,7 @@ uint8x8_t test_vrev16_u8(uint8x8_t a) {
// CHECK-LABEL: define <8 x i8> @test_vrev16_p8(
// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[A]], <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> poison, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
//
poly8x8_t test_vrev16_p8(poly8x8_t a) {
@@ -17349,7 +16025,7 @@ poly8x8_t test_vrev16_p8(poly8x8_t a) {
// CHECK-LABEL: define <16 x i8> @test_vrev16q_s8(
// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[A]], <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> poison, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
//
int8x16_t test_vrev16q_s8(int8x16_t a) {
@@ -17359,7 +16035,7 @@ int8x16_t test_vrev16q_s8(int8x16_t a) {
// CHECK-LABEL: define <16 x i8> @test_vrev16q_u8(
// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[A]], <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> poison, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
//
uint8x16_t test_vrev16q_u8(uint8x16_t a) {
@@ -17369,7 +16045,7 @@ uint8x16_t test_vrev16q_u8(uint8x16_t a) {
// CHECK-LABEL: define <16 x i8> @test_vrev16q_p8(
// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[A]], <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> poison, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
//
poly8x16_t test_vrev16q_p8(poly8x16_t a) {
@@ -17379,7 +16055,7 @@ poly8x16_t test_vrev16q_p8(poly8x16_t a) {
// CHECK-LABEL: define <8 x i8> @test_vrev32_s8(
// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[A]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> poison, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
//
int8x8_t test_vrev32_s8(int8x8_t a) {
@@ -17389,7 +16065,7 @@ int8x8_t test_vrev32_s8(int8x8_t a) {
// CHECK-LABEL: define <4 x i16> @test_vrev32_s16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[A]], <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
//
int16x4_t test_vrev32_s16(int16x4_t a) {
@@ -17399,7 +16075,7 @@ int16x4_t test_vrev32_s16(int16x4_t a) {
// CHECK-LABEL: define <8 x i8> @test_vrev32_u8(
// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[A]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> poison, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
//
uint8x8_t test_vrev32_u8(uint8x8_t a) {
@@ -17409,7 +16085,7 @@ uint8x8_t test_vrev32_u8(uint8x8_t a) {
// CHECK-LABEL: define <4 x i16> @test_vrev32_u16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[A]], <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
//
uint16x4_t test_vrev32_u16(uint16x4_t a) {
@@ -17419,7 +16095,7 @@ uint16x4_t test_vrev32_u16(uint16x4_t a) {
// CHECK-LABEL: define <8 x i8> @test_vrev32_p8(
// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[A]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> poison, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
//
poly8x8_t test_vrev32_p8(poly8x8_t a) {
@@ -17429,7 +16105,7 @@ poly8x8_t test_vrev32_p8(poly8x8_t a) {
// CHECK-LABEL: define <4 x i16> @test_vrev32_p16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[A]], <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
//
poly16x4_t test_vrev32_p16(poly16x4_t a) {
@@ -17439,7 +16115,7 @@ poly16x4_t test_vrev32_p16(poly16x4_t a) {
// CHECK-LABEL: define <16 x i8> @test_vrev32q_s8(
// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[A]], <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> poison, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
//
int8x16_t test_vrev32q_s8(int8x16_t a) {
@@ -17449,7 +16125,7 @@ int8x16_t test_vrev32q_s8(int8x16_t a) {
// CHECK-LABEL: define <8 x i16> @test_vrev32q_s16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[A]], <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
//
int16x8_t test_vrev32q_s16(int16x8_t a) {
@@ -17459,7 +16135,7 @@ int16x8_t test_vrev32q_s16(int16x8_t a) {
// CHECK-LABEL: define <16 x i8> @test_vrev32q_u8(
// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[A]], <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> poison, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
//
uint8x16_t test_vrev32q_u8(uint8x16_t a) {
@@ -17469,7 +16145,7 @@ uint8x16_t test_vrev32q_u8(uint8x16_t a) {
// CHECK-LABEL: define <8 x i16> @test_vrev32q_u16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[A]], <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
//
uint16x8_t test_vrev32q_u16(uint16x8_t a) {
@@ -17479,7 +16155,7 @@ uint16x8_t test_vrev32q_u16(uint16x8_t a) {
// CHECK-LABEL: define <16 x i8> @test_vrev32q_p8(
// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[A]], <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> poison, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
//
poly8x16_t test_vrev32q_p8(poly8x16_t a) {
@@ -17489,7 +16165,7 @@ poly8x16_t test_vrev32q_p8(poly8x16_t a) {
// CHECK-LABEL: define <8 x i16> @test_vrev32q_p16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[A]], <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
//
poly16x8_t test_vrev32q_p16(poly16x8_t a) {
@@ -17499,7 +16175,7 @@ poly16x8_t test_vrev32q_p16(poly16x8_t a) {
// CHECK-LABEL: define <8 x i8> @test_vrev64_s8(
// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[A]], <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
//
int8x8_t test_vrev64_s8(int8x8_t a) {
@@ -17509,7 +16185,7 @@ int8x8_t test_vrev64_s8(int8x8_t a) {
// CHECK-LABEL: define <4 x i16> @test_vrev64_s16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[A]], <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
//
int16x4_t test_vrev64_s16(int16x4_t a) {
@@ -17519,7 +16195,7 @@ int16x4_t test_vrev64_s16(int16x4_t a) {
// CHECK-LABEL: define <2 x i32> @test_vrev64_s32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[A]], <2 x i32> <i32 1, i32 0>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
// CHECK-NEXT: ret <2 x i32> [[SHUFFLE_I]]
//
int32x2_t test_vrev64_s32(int32x2_t a) {
@@ -17529,7 +16205,7 @@ int32x2_t test_vrev64_s32(int32x2_t a) {
// CHECK-LABEL: define <8 x i8> @test_vrev64_u8(
// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[A]], <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
//
uint8x8_t test_vrev64_u8(uint8x8_t a) {
@@ -17539,7 +16215,7 @@ uint8x8_t test_vrev64_u8(uint8x8_t a) {
// CHECK-LABEL: define <4 x i16> @test_vrev64_u16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[A]], <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
//
uint16x4_t test_vrev64_u16(uint16x4_t a) {
@@ -17549,7 +16225,7 @@ uint16x4_t test_vrev64_u16(uint16x4_t a) {
// CHECK-LABEL: define <2 x i32> @test_vrev64_u32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[A]], <2 x i32> <i32 1, i32 0>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
// CHECK-NEXT: ret <2 x i32> [[SHUFFLE_I]]
//
uint32x2_t test_vrev64_u32(uint32x2_t a) {
@@ -17559,7 +16235,7 @@ uint32x2_t test_vrev64_u32(uint32x2_t a) {
// CHECK-LABEL: define <8 x i8> @test_vrev64_p8(
// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[A]], <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
//
poly8x8_t test_vrev64_p8(poly8x8_t a) {
@@ -17569,7 +16245,7 @@ poly8x8_t test_vrev64_p8(poly8x8_t a) {
// CHECK-LABEL: define <4 x i16> @test_vrev64_p16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[A]], <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
//
poly16x4_t test_vrev64_p16(poly16x4_t a) {
@@ -17579,7 +16255,7 @@ poly16x4_t test_vrev64_p16(poly16x4_t a) {
// CHECK-LABEL: define <2 x float> @test_vrev64_f32(
// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[A]], <2 x i32> <i32 1, i32 0>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> [[A]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
// CHECK-NEXT: ret <2 x float> [[SHUFFLE_I]]
//
float32x2_t test_vrev64_f32(float32x2_t a) {
@@ -17589,7 +16265,7 @@ float32x2_t test_vrev64_f32(float32x2_t a) {
// CHECK-LABEL: define <16 x i8> @test_vrev64q_s8(
// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[A]], <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> poison, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
//
int8x16_t test_vrev64q_s8(int8x16_t a) {
@@ -17599,7 +16275,7 @@ int8x16_t test_vrev64q_s8(int8x16_t a) {
// CHECK-LABEL: define <8 x i16> @test_vrev64q_s16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[A]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
//
int16x8_t test_vrev64q_s16(int16x8_t a) {
@@ -17609,7 +16285,7 @@ int16x8_t test_vrev64q_s16(int16x8_t a) {
// CHECK-LABEL: define <4 x i32> @test_vrev64q_s32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[A]], <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]]
//
int32x4_t test_vrev64q_s32(int32x4_t a) {
@@ -17619,7 +16295,7 @@ int32x4_t test_vrev64q_s32(int32x4_t a) {
// CHECK-LABEL: define <16 x i8> @test_vrev64q_u8(
// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[A]], <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> poison, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
//
uint8x16_t test_vrev64q_u8(uint8x16_t a) {
@@ -17629,7 +16305,7 @@ uint8x16_t test_vrev64q_u8(uint8x16_t a) {
// CHECK-LABEL: define <8 x i16> @test_vrev64q_u16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[A]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
//
uint16x8_t test_vrev64q_u16(uint16x8_t a) {
@@ -17639,7 +16315,7 @@ uint16x8_t test_vrev64q_u16(uint16x8_t a) {
// CHECK-LABEL: define <4 x i32> @test_vrev64q_u32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[A]], <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]]
//
uint32x4_t test_vrev64q_u32(uint32x4_t a) {
@@ -17649,7 +16325,7 @@ uint32x4_t test_vrev64q_u32(uint32x4_t a) {
// CHECK-LABEL: define <16 x i8> @test_vrev64q_p8(
// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[A]], <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> poison, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
//
poly8x16_t test_vrev64q_p8(poly8x16_t a) {
@@ -17659,7 +16335,7 @@ poly8x16_t test_vrev64q_p8(poly8x16_t a) {
// CHECK-LABEL: define <8 x i16> @test_vrev64q_p16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[A]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
//
poly16x8_t test_vrev64q_p16(poly16x8_t a) {
@@ -17669,7 +16345,7 @@ poly16x8_t test_vrev64q_p16(poly16x8_t a) {
// CHECK-LABEL: define <4 x float> @test_vrev64q_f32(
// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[A]], <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
// CHECK-NEXT: ret <4 x float> [[SHUFFLE_I]]
//
float32x4_t test_vrev64q_f32(float32x4_t a) {
@@ -17689,10 +16365,7 @@ int8x8_t test_vrhadd_s8(int8x8_t a, int8x8_t b) {
// CHECK-LABEL: define <4 x i16> @test_vrhadd_s16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
// CHECK-NEXT: [[VRHADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrhadds.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
-// CHECK-NEXT: [[VRHADD_V3_I:%.*]] = bitcast <4 x i16> [[VRHADD_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <4 x i16> [[VRHADD_V2_I]]
//
int16x4_t test_vrhadd_s16(int16x4_t a, int16x4_t b) {
@@ -17702,10 +16375,7 @@ int16x4_t test_vrhadd_s16(int16x4_t a, int16x4_t b) {
// CHECK-LABEL: define <2 x i32> @test_vrhadd_s32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
// CHECK-NEXT: [[VRHADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrhadds.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
-// CHECK-NEXT: [[VRHADD_V3_I:%.*]] = bitcast <2 x i32> [[VRHADD_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <2 x i32> [[VRHADD_V2_I]]
//
int32x2_t test_vrhadd_s32(int32x2_t a, int32x2_t b) {
@@ -17725,10 +16395,7 @@ uint8x8_t test_vrhadd_u8(uint8x8_t a, uint8x8_t b) {
// CHECK-LABEL: define <4 x i16> @test_vrhadd_u16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
// CHECK-NEXT: [[VRHADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrhaddu.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
-// CHECK-NEXT: [[VRHADD_V3_I:%.*]] = bitcast <4 x i16> [[VRHADD_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <4 x i16> [[VRHADD_V2_I]]
//
uint16x4_t test_vrhadd_u16(uint16x4_t a, uint16x4_t b) {
@@ -17738,10 +16405,7 @@ uint16x4_t test_vrhadd_u16(uint16x4_t a, uint16x4_t b) {
// CHECK-LABEL: define <2 x i32> @test_vrhadd_u32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
// CHECK-NEXT: [[VRHADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrhaddu.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
-// CHECK-NEXT: [[VRHADD_V3_I:%.*]] = bitcast <2 x i32> [[VRHADD_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <2 x i32> [[VRHADD_V2_I]]
//
uint32x2_t test_vrhadd_u32(uint32x2_t a, uint32x2_t b) {
@@ -17761,10 +16425,7 @@ int8x16_t test_vrhaddq_s8(int8x16_t a, int8x16_t b) {
// CHECK-LABEL: define <8 x i16> @test_vrhaddq_s16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
// CHECK-NEXT: [[VRHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
-// CHECK-NEXT: [[VRHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VRHADDQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <8 x i16> [[VRHADDQ_V2_I]]
//
int16x8_t test_vrhaddq_s16(int16x8_t a, int16x8_t b) {
@@ -17774,10 +16435,7 @@ int16x8_t test_vrhaddq_s16(int16x8_t a, int16x8_t b) {
// CHECK-LABEL: define <4 x i32> @test_vrhaddq_s32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
// CHECK-NEXT: [[VRHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrhadds.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
-// CHECK-NEXT: [[VRHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VRHADDQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VRHADDQ_V2_I]]
//
int32x4_t test_vrhaddq_s32(int32x4_t a, int32x4_t b) {
@@ -17797,10 +16455,7 @@ uint8x16_t test_vrhaddq_u8(uint8x16_t a, uint8x16_t b) {
// CHECK-LABEL: define <8 x i16> @test_vrhaddq_u16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
// CHECK-NEXT: [[VRHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
-// CHECK-NEXT: [[VRHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VRHADDQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <8 x i16> [[VRHADDQ_V2_I]]
//
uint16x8_t test_vrhaddq_u16(uint16x8_t a, uint16x8_t b) {
@@ -17810,10 +16465,7 @@ uint16x8_t test_vrhaddq_u16(uint16x8_t a, uint16x8_t b) {
// CHECK-LABEL: define <4 x i32> @test_vrhaddq_u32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
// CHECK-NEXT: [[VRHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrhaddu.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
-// CHECK-NEXT: [[VRHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VRHADDQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VRHADDQ_V2_I]]
//
uint32x4_t test_vrhaddq_u32(uint32x4_t a, uint32x4_t b) {
@@ -17833,10 +16485,7 @@ int8x8_t test_vrshl_s8(int8x8_t a, int8x8_t b) {
// CHECK-LABEL: define <4 x i16> @test_vrshl_s16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
// CHECK-NEXT: [[VRSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
-// CHECK-NEXT: [[VRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VRSHL_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <4 x i16> [[VRSHL_V2_I]]
//
int16x4_t test_vrshl_s16(int16x4_t a, int16x4_t b) {
@@ -17846,10 +16495,7 @@ int16x4_t test_vrshl_s16(int16x4_t a, int16x4_t b) {
// CHECK-LABEL: define <2 x i32> @test_vrshl_s32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
// CHECK-NEXT: [[VRSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
-// CHECK-NEXT: [[VRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VRSHL_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <2 x i32> [[VRSHL_V2_I]]
//
int32x2_t test_vrshl_s32(int32x2_t a, int32x2_t b) {
@@ -17859,10 +16505,7 @@ int32x2_t test_vrshl_s32(int32x2_t a, int32x2_t b) {
// CHECK-LABEL: define <1 x i64> @test_vrshl_s64(
// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
// CHECK-NEXT: [[VRSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> [[A]], <1 x i64> [[B]])
-// CHECK-NEXT: [[VRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VRSHL_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <1 x i64> [[VRSHL_V2_I]]
//
int64x1_t test_vrshl_s64(int64x1_t a, int64x1_t b) {
@@ -17882,10 +16525,7 @@ uint8x8_t test_vrshl_u8(uint8x8_t a, int8x8_t b) {
// CHECK-LABEL: define <4 x i16> @test_vrshl_u16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
// CHECK-NEXT: [[VRSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
-// CHECK-NEXT: [[VRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VRSHL_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <4 x i16> [[VRSHL_V2_I]]
//
uint16x4_t test_vrshl_u16(uint16x4_t a, int16x4_t b) {
@@ -17895,10 +16535,7 @@ uint16x4_t test_vrshl_u16(uint16x4_t a, int16x4_t b) {
// CHECK-LABEL: define <2 x i32> @test_vrshl_u32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
// CHECK-NEXT: [[VRSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
-// CHECK-NEXT: [[VRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VRSHL_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <2 x i32> [[VRSHL_V2_I]]
//
uint32x2_t test_vrshl_u32(uint32x2_t a, int32x2_t b) {
@@ -17908,10 +16545,7 @@ uint32x2_t test_vrshl_u32(uint32x2_t a, int32x2_t b) {
// CHECK-LABEL: define <1 x i64> @test_vrshl_u64(
// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
// CHECK-NEXT: [[VRSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> [[A]], <1 x i64> [[B]])
-// CHECK-NEXT: [[VRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VRSHL_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <1 x i64> [[VRSHL_V2_I]]
//
uint64x1_t test_vrshl_u64(uint64x1_t a, int64x1_t b) {
@@ -17931,10 +16565,7 @@ int8x16_t test_vrshlq_s8(int8x16_t a, int8x16_t b) {
// CHECK-LABEL: define <8 x i16> @test_vrshlq_s16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
// CHECK-NEXT: [[VRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
-// CHECK-NEXT: [[VRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VRSHLQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <8 x i16> [[VRSHLQ_V2_I]]
//
int16x8_t test_vrshlq_s16(int16x8_t a, int16x8_t b) {
@@ -17944,10 +16575,7 @@ int16x8_t test_vrshlq_s16(int16x8_t a, int16x8_t b) {
// CHECK-LABEL: define <4 x i32> @test_vrshlq_s32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
// CHECK-NEXT: [[VRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
-// CHECK-NEXT: [[VRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VRSHLQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VRSHLQ_V2_I]]
//
int32x4_t test_vrshlq_s32(int32x4_t a, int32x4_t b) {
@@ -17957,10 +16585,7 @@ int32x4_t test_vrshlq_s32(int32x4_t a, int32x4_t b) {
// CHECK-LABEL: define <2 x i64> @test_vrshlq_s64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
// CHECK-NEXT: [[VRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64> [[A]], <2 x i64> [[B]])
-// CHECK-NEXT: [[VRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VRSHLQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <2 x i64> [[VRSHLQ_V2_I]]
//
int64x2_t test_vrshlq_s64(int64x2_t a, int64x2_t b) {
@@ -17980,10 +16605,7 @@ uint8x16_t test_vrshlq_u8(uint8x16_t a, int8x16_t b) {
// CHECK-LABEL: define <8 x i16> @test_vrshlq_u16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
// CHECK-NEXT: [[VRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
-// CHECK-NEXT: [[VRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VRSHLQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <8 x i16> [[VRSHLQ_V2_I]]
//
uint16x8_t test_vrshlq_u16(uint16x8_t a, int16x8_t b) {
@@ -17993,10 +16615,7 @@ uint16x8_t test_vrshlq_u16(uint16x8_t a, int16x8_t b) {
// CHECK-LABEL: define <4 x i32> @test_vrshlq_u32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
// CHECK-NEXT: [[VRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
-// CHECK-NEXT: [[VRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VRSHLQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VRSHLQ_V2_I]]
//
uint32x4_t test_vrshlq_u32(uint32x4_t a, int32x4_t b) {
@@ -18006,10 +16625,7 @@ uint32x4_t test_vrshlq_u32(uint32x4_t a, int32x4_t b) {
// CHECK-LABEL: define <2 x i64> @test_vrshlq_u64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
// CHECK-NEXT: [[VRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64> [[A]], <2 x i64> [[B]])
-// CHECK-NEXT: [[VRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VRSHLQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <2 x i64> [[VRSHLQ_V2_I]]
//
uint64x2_t test_vrshlq_u64(uint64x2_t a, int64x2_t b) {
@@ -18019,9 +16635,7 @@ uint64x2_t test_vrshlq_u64(uint64x2_t a, int64x2_t b) {
// CHECK-LABEL: define <8 x i8> @test_vrshrn_n_s16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[VRSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vrshiftn.v8i8(<8 x i16> [[VRSHRN_N]], <8 x i16> splat (i16 -1))
+// CHECK-NEXT: [[VRSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vrshiftn.v8i8(<8 x i16> [[A]], <8 x i16> splat (i16 -1))
// CHECK-NEXT: ret <8 x i8> [[VRSHRN_N1]]
//
int8x8_t test_vrshrn_n_s16(int16x8_t a) {
@@ -18031,9 +16645,7 @@ int8x8_t test_vrshrn_n_s16(int16x8_t a) {
// CHECK-LABEL: define <4 x i16> @test_vrshrn_n_s32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[VRSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vrshiftn.v4i16(<4 x i32> [[VRSHRN_N]], <4 x i32> splat (i32 -1))
+// CHECK-NEXT: [[VRSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vrshiftn.v4i16(<4 x i32> [[A]], <4 x i32> splat (i32 -1))
// CHECK-NEXT: ret <4 x i16> [[VRSHRN_N1]]
//
int16x4_t test_vrshrn_n_s32(int32x4_t a) {
@@ -18043,9 +16655,7 @@ int16x4_t test_vrshrn_n_s32(int32x4_t a) {
// CHECK-LABEL: define <2 x i32> @test_vrshrn_n_s64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK-NEXT: [[VRSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vrshiftn.v2i32(<2 x i64> [[VRSHRN_N]], <2 x i64> splat (i64 -1))
+// CHECK-NEXT: [[VRSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vrshiftn.v2i32(<2 x i64> [[A]], <2 x i64> splat (i64 -1))
// CHECK-NEXT: ret <2 x i32> [[VRSHRN_N1]]
//
int32x2_t test_vrshrn_n_s64(int64x2_t a) {
@@ -18055,9 +16665,7 @@ int32x2_t test_vrshrn_n_s64(int64x2_t a) {
// CHECK-LABEL: define <8 x i8> @test_vrshrn_n_u16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[VRSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vrshiftn.v8i8(<8 x i16> [[VRSHRN_N]], <8 x i16> splat (i16 -1))
+// CHECK-NEXT: [[VRSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vrshiftn.v8i8(<8 x i16> [[A]], <8 x i16> splat (i16 -1))
// CHECK-NEXT: ret <8 x i8> [[VRSHRN_N1]]
//
uint8x8_t test_vrshrn_n_u16(uint16x8_t a) {
@@ -18067,9 +16675,7 @@ uint8x8_t test_vrshrn_n_u16(uint16x8_t a) {
// CHECK-LABEL: define <4 x i16> @test_vrshrn_n_u32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[VRSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vrshiftn.v4i16(<4 x i32> [[VRSHRN_N]], <4 x i32> splat (i32 -1))
+// CHECK-NEXT: [[VRSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vrshiftn.v4i16(<4 x i32> [[A]], <4 x i32> splat (i32 -1))
// CHECK-NEXT: ret <4 x i16> [[VRSHRN_N1]]
//
uint16x4_t test_vrshrn_n_u32(uint32x4_t a) {
@@ -18079,9 +16685,7 @@ uint16x4_t test_vrshrn_n_u32(uint32x4_t a) {
// CHECK-LABEL: define <2 x i32> @test_vrshrn_n_u64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK-NEXT: [[VRSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vrshiftn.v2i32(<2 x i64> [[VRSHRN_N]], <2 x i64> splat (i64 -1))
+// CHECK-NEXT: [[VRSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vrshiftn.v2i32(<2 x i64> [[A]], <2 x i64> splat (i64 -1))
// CHECK-NEXT: ret <2 x i32> [[VRSHRN_N1]]
//
uint32x2_t test_vrshrn_n_u64(uint64x2_t a) {
@@ -18101,9 +16705,7 @@ int8x8_t test_vrshr_n_s8(int8x8_t a) {
// CHECK-LABEL: define <4 x i16> @test_vrshr_n_s16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16> [[VRSHR_N]], <4 x i16> splat (i16 -1))
+// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16> [[A]], <4 x i16> splat (i16 -1))
// CHECK-NEXT: ret <4 x i16> [[VRSHR_N1]]
//
int16x4_t test_vrshr_n_s16(int16x4_t a) {
@@ -18113,9 +16715,7 @@ int16x4_t test_vrshr_n_s16(int16x4_t a) {
// CHECK-LABEL: define <2 x i32> @test_vrshr_n_s32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32> [[VRSHR_N]], <2 x i32> splat (i32 -1))
+// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32> [[A]], <2 x i32> splat (i32 -1))
// CHECK-NEXT: ret <2 x i32> [[VRSHR_N1]]
//
int32x2_t test_vrshr_n_s32(int32x2_t a) {
@@ -18125,9 +16725,7 @@ int32x2_t test_vrshr_n_s32(int32x2_t a) {
// CHECK-LABEL: define <1 x i64> @test_vrshr_n_s64(
// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-// CHECK-NEXT: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> [[VRSHR_N]], <1 x i64> splat (i64 -1))
+// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> [[A]], <1 x i64> splat (i64 -1))
// CHECK-NEXT: ret <1 x i64> [[VRSHR_N1]]
//
int64x1_t test_vrshr_n_s64(int64x1_t a) {
@@ -18147,9 +16745,7 @@ uint8x8_t test_vrshr_n_u8(uint8x8_t a) {
// CHECK-LABEL: define <4 x i16> @test_vrshr_n_u16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16> [[VRSHR_N]], <4 x i16> splat (i16 -1))
+// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16> [[A]], <4 x i16> splat (i16 -1))
// CHECK-NEXT: ret <4 x i16> [[VRSHR_N1]]
//
uint16x4_t test_vrshr_n_u16(uint16x4_t a) {
@@ -18159,9 +16755,7 @@ uint16x4_t test_vrshr_n_u16(uint16x4_t a) {
// CHECK-LABEL: define <2 x i32> @test_vrshr_n_u32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32> [[VRSHR_N]], <2 x i32> splat (i32 -1))
+// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32> [[A]], <2 x i32> splat (i32 -1))
// CHECK-NEXT: ret <2 x i32> [[VRSHR_N1]]
//
uint32x2_t test_vrshr_n_u32(uint32x2_t a) {
@@ -18171,9 +16765,7 @@ uint32x2_t test_vrshr_n_u32(uint32x2_t a) {
// CHECK-LABEL: define <1 x i64> @test_vrshr_n_u64(
// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-// CHECK-NEXT: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> [[VRSHR_N]], <1 x i64> splat (i64 -1))
+// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> [[A]], <1 x i64> splat (i64 -1))
// CHECK-NEXT: ret <1 x i64> [[VRSHR_N1]]
//
uint64x1_t test_vrshr_n_u64(uint64x1_t a) {
@@ -18193,9 +16785,7 @@ int8x16_t test_vrshrq_n_s8(int8x16_t a) {
// CHECK-LABEL: define <8 x i16> @test_vrshrq_n_s16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16> [[VRSHR_N]], <8 x i16> splat (i16 -1))
+// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16> [[A]], <8 x i16> splat (i16 -1))
// CHECK-NEXT: ret <8 x i16> [[VRSHR_N1]]
//
int16x8_t test_vrshrq_n_s16(int16x8_t a) {
@@ -18205,9 +16795,7 @@ int16x8_t test_vrshrq_n_s16(int16x8_t a) {
// CHECK-LABEL: define <4 x i32> @test_vrshrq_n_s32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32> [[VRSHR_N]], <4 x i32> splat (i32 -1))
+// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32> [[A]], <4 x i32> splat (i32 -1))
// CHECK-NEXT: ret <4 x i32> [[VRSHR_N1]]
//
int32x4_t test_vrshrq_n_s32(int32x4_t a) {
@@ -18217,9 +16805,7 @@ int32x4_t test_vrshrq_n_s32(int32x4_t a) {
// CHECK-LABEL: define <2 x i64> @test_vrshrq_n_s64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64> [[VRSHR_N]], <2 x i64> splat (i64 -1))
+// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64> [[A]], <2 x i64> splat (i64 -1))
// CHECK-NEXT: ret <2 x i64> [[VRSHR_N1]]
//
int64x2_t test_vrshrq_n_s64(int64x2_t a) {
@@ -18239,9 +16825,7 @@ uint8x16_t test_vrshrq_n_u8(uint8x16_t a) {
// CHECK-LABEL: define <8 x i16> @test_vrshrq_n_u16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> [[VRSHR_N]], <8 x i16> splat (i16 -1))
+// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> [[A]], <8 x i16> splat (i16 -1))
// CHECK-NEXT: ret <8 x i16> [[VRSHR_N1]]
//
uint16x8_t test_vrshrq_n_u16(uint16x8_t a) {
@@ -18251,9 +16835,7 @@ uint16x8_t test_vrshrq_n_u16(uint16x8_t a) {
// CHECK-LABEL: define <4 x i32> @test_vrshrq_n_u32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32> [[VRSHR_N]], <4 x i32> splat (i32 -1))
+// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32> [[A]], <4 x i32> splat (i32 -1))
// CHECK-NEXT: ret <4 x i32> [[VRSHR_N1]]
//
uint32x4_t test_vrshrq_n_u32(uint32x4_t a) {
@@ -18263,9 +16845,7 @@ uint32x4_t test_vrshrq_n_u32(uint32x4_t a) {
// CHECK-LABEL: define <2 x i64> @test_vrshrq_n_u64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64> [[VRSHR_N]], <2 x i64> splat (i64 -1))
+// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64> [[A]], <2 x i64> splat (i64 -1))
// CHECK-NEXT: ret <2 x i64> [[VRSHR_N1]]
//
uint64x2_t test_vrshrq_n_u64(uint64x2_t a) {
@@ -18275,7 +16855,6 @@ uint64x2_t test_vrshrq_n_u64(uint64x2_t a) {
// CHECK-LABEL: define <2 x float> @test_vrsqrte_f32(
// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
// CHECK-NEXT: [[VRSQRTE_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrsqrte.v2f32(<2 x float> [[A]])
// CHECK-NEXT: ret <2 x float> [[VRSQRTE_V1_I]]
//
@@ -18286,7 +16865,6 @@ float32x2_t test_vrsqrte_f32(float32x2_t a) {
// CHECK-LABEL: define <2 x i32> @test_vrsqrte_u32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
// CHECK-NEXT: [[VRSQRTE_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrsqrte.v2i32(<2 x i32> [[A]])
// CHECK-NEXT: ret <2 x i32> [[VRSQRTE_V1_I]]
//
@@ -18297,7 +16875,6 @@ uint32x2_t test_vrsqrte_u32(uint32x2_t a) {
// CHECK-LABEL: define <4 x float> @test_vrsqrteq_f32(
// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
// CHECK-NEXT: [[VRSQRTEQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float> [[A]])
// CHECK-NEXT: ret <4 x float> [[VRSQRTEQ_V1_I]]
//
@@ -18308,7 +16885,6 @@ float32x4_t test_vrsqrteq_f32(float32x4_t a) {
// CHECK-LABEL: define <4 x i32> @test_vrsqrteq_u32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
// CHECK-NEXT: [[VRSQRTEQ_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrsqrte.v4i32(<4 x i32> [[A]])
// CHECK-NEXT: ret <4 x i32> [[VRSQRTEQ_V1_I]]
//
@@ -18319,10 +16895,7 @@ uint32x4_t test_vrsqrteq_u32(uint32x4_t a) {
// CHECK-LABEL: define <2 x float> @test_vrsqrts_f32(
// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <8 x i8>
// CHECK-NEXT: [[VRSQRTS_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vrsqrts.v2f32(<2 x float> [[A]], <2 x float> [[B]])
-// CHECK-NEXT: [[VRSQRTS_V3_I:%.*]] = bitcast <2 x float> [[VRSQRTS_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <2 x float> [[VRSQRTS_V2_I]]
//
float32x2_t test_vrsqrts_f32(float32x2_t a, float32x2_t b) {
@@ -18332,10 +16905,7 @@ float32x2_t test_vrsqrts_f32(float32x2_t a, float32x2_t b) {
// CHECK-LABEL: define <4 x float> @test_vrsqrtsq_f32(
// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B]] to <16 x i8>
// CHECK-NEXT: [[VRSQRTSQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float> [[A]], <4 x float> [[B]])
-// CHECK-NEXT: [[VRSQRTSQ_V3_I:%.*]] = bitcast <4 x float> [[VRSQRTSQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x float> [[VRSQRTSQ_V2_I]]
//
float32x4_t test_vrsqrtsq_f32(float32x4_t a, float32x4_t b) {
@@ -18356,12 +16926,8 @@ int8x8_t test_vrsra_n_s8(int8x8_t a, int8x8_t b) {
// CHECK-LABEL: define <4 x i16> @test_vrsra_n_s16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK-NEXT: [[TMP4:%.*]] = call <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16> [[TMP3]], <4 x i16> splat (i16 -1))
-// CHECK-NEXT: [[VRSRA_N:%.*]] = add <4 x i16> [[TMP2]], [[TMP4]]
+// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16> [[B]], <4 x i16> splat (i16 -1))
+// CHECK-NEXT: [[VRSRA_N:%.*]] = add <4 x i16> [[A]], [[TMP0]]
// CHECK-NEXT: ret <4 x i16> [[VRSRA_N]]
//
int16x4_t test_vrsra_n_s16(int16x4_t a, int16x4_t b) {
@@ -18371,12 +16937,8 @@ int16x4_t test_vrsra_n_s16(int16x4_t a, int16x4_t b) {
// CHECK-LABEL: define <2 x i32> @test_vrsra_n_s32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK-NEXT: [[TMP4:%.*]] = call <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32> [[TMP3]], <2 x i32> splat (i32 -1))
-// CHECK-NEXT: [[VRSRA_N:%.*]] = add <2 x i32> [[TMP2]], [[TMP4]]
+// CHECK-NEXT: [[TMP0:%.*]] = call <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32> [[B]], <2 x i32> splat (i32 -1))
+// CHECK-NEXT: [[VRSRA_N:%.*]] = add <2 x i32> [[A]], [[TMP0]]
// CHECK-NEXT: ret <2 x i32> [[VRSRA_N]]
//
int32x2_t test_vrsra_n_s32(int32x2_t a, int32x2_t b) {
@@ -18386,12 +16948,8 @@ int32x2_t test_vrsra_n_s32(int32x2_t a, int32x2_t b) {
// CHECK-LABEL: define <1 x i64> @test_vrsra_n_s64(
// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK-NEXT: [[TMP4:%.*]] = call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> [[TMP3]], <1 x i64> splat (i64 -1))
-// CHECK-NEXT: [[VRSRA_N:%.*]] = add <1 x i64> [[TMP2]], [[TMP4]]
+// CHECK-NEXT: [[TMP0:%.*]] = call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> [[B]], <1 x i64> splat (i64 -1))
+// CHECK-NEXT: [[VRSRA_N:%.*]] = add <1 x i64> [[A]], [[TMP0]]
// CHECK-NEXT: ret <1 x i64> [[VRSRA_N]]
//
int64x1_t test_vrsra_n_s64(int64x1_t a, int64x1_t b) {
@@ -18412,12 +16970,8 @@ uint8x8_t test_vrsra_n_u8(uint8x8_t a, uint8x8_t b) {
// CHECK-LABEL: define <4 x i16> @test_vrsra_n_u16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK-NEXT: [[TMP4:%.*]] = call <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16> [[TMP3]], <4 x i16> splat (i16 -1))
-// CHECK-NEXT: [[VRSRA_N:%.*]] = add <4 x i16> [[TMP2]], [[TMP4]]
+// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16> [[B]], <4 x i16> splat (i16 -1))
+// CHECK-NEXT: [[VRSRA_N:%.*]] = add <4 x i16> [[A]], [[TMP0]]
// CHECK-NEXT: ret <4 x i16> [[VRSRA_N]]
//
uint16x4_t test_vrsra_n_u16(uint16x4_t a, uint16x4_t b) {
@@ -18427,12 +16981,8 @@ uint16x4_t test_vrsra_n_u16(uint16x4_t a, uint16x4_t b) {
// CHECK-LABEL: define <2 x i32> @test_vrsra_n_u32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK-NEXT: [[TMP4:%.*]] = call <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32> [[TMP3]], <2 x i32> splat (i32 -1))
-// CHECK-NEXT: [[VRSRA_N:%.*]] = add <2 x i32> [[TMP2]], [[TMP4]]
+// CHECK-NEXT: [[TMP0:%.*]] = call <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32> [[B]], <2 x i32> splat (i32 -1))
+// CHECK-NEXT: [[VRSRA_N:%.*]] = add <2 x i32> [[A]], [[TMP0]]
// CHECK-NEXT: ret <2 x i32> [[VRSRA_N]]
//
uint32x2_t test_vrsra_n_u32(uint32x2_t a, uint32x2_t b) {
@@ -18442,12 +16992,8 @@ uint32x2_t test_vrsra_n_u32(uint32x2_t a, uint32x2_t b) {
// CHECK-LABEL: define <1 x i64> @test_vrsra_n_u64(
// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK-NEXT: [[TMP4:%.*]] = call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> [[TMP3]], <1 x i64> splat (i64 -1))
-// CHECK-NEXT: [[VRSRA_N:%.*]] = add <1 x i64> [[TMP2]], [[TMP4]]
+// CHECK-NEXT: [[TMP0:%.*]] = call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> [[B]], <1 x i64> splat (i64 -1))
+// CHECK-NEXT: [[VRSRA_N:%.*]] = add <1 x i64> [[A]], [[TMP0]]
// CHECK-NEXT: ret <1 x i64> [[VRSRA_N]]
//
uint64x1_t test_vrsra_n_u64(uint64x1_t a, uint64x1_t b) {
@@ -18468,12 +17014,8 @@ int8x16_t test_vrsraq_n_s8(int8x16_t a, int8x16_t b) {
// CHECK-LABEL: define <8 x i16> @test_vrsraq_n_s16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK-NEXT: [[TMP4:%.*]] = call <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16> [[TMP3]], <8 x i16> splat (i16 -1))
-// CHECK-NEXT: [[VRSRA_N:%.*]] = add <8 x i16> [[TMP2]], [[TMP4]]
+// CHECK-NEXT: [[TMP0:%.*]] = call <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16> [[B]], <8 x i16> splat (i16 -1))
+// CHECK-NEXT: [[VRSRA_N:%.*]] = add <8 x i16> [[A]], [[TMP0]]
// CHECK-NEXT: ret <8 x i16> [[VRSRA_N]]
//
int16x8_t test_vrsraq_n_s16(int16x8_t a, int16x8_t b) {
@@ -18483,12 +17025,8 @@ int16x8_t test_vrsraq_n_s16(int16x8_t a, int16x8_t b) {
// CHECK-LABEL: define <4 x i32> @test_vrsraq_n_s32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32> [[TMP3]], <4 x i32> splat (i32 -1))
-// CHECK-NEXT: [[VRSRA_N:%.*]] = add <4 x i32> [[TMP2]], [[TMP4]]
+// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32> [[B]], <4 x i32> splat (i32 -1))
+// CHECK-NEXT: [[VRSRA_N:%.*]] = add <4 x i32> [[A]], [[TMP0]]
// CHECK-NEXT: ret <4 x i32> [[VRSRA_N]]
//
int32x4_t test_vrsraq_n_s32(int32x4_t a, int32x4_t b) {
@@ -18498,12 +17036,8 @@ int32x4_t test_vrsraq_n_s32(int32x4_t a, int32x4_t b) {
// CHECK-LABEL: define <2 x i64> @test_vrsraq_n_s64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK-NEXT: [[TMP4:%.*]] = call <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64> [[TMP3]], <2 x i64> splat (i64 -1))
-// CHECK-NEXT: [[VRSRA_N:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]]
+// CHECK-NEXT: [[TMP0:%.*]] = call <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64> [[B]], <2 x i64> splat (i64 -1))
+// CHECK-NEXT: [[VRSRA_N:%.*]] = add <2 x i64> [[A]], [[TMP0]]
// CHECK-NEXT: ret <2 x i64> [[VRSRA_N]]
//
int64x2_t test_vrsraq_n_s64(int64x2_t a, int64x2_t b) {
@@ -18524,12 +17058,8 @@ uint8x16_t test_vrsraq_n_u8(uint8x16_t a, uint8x16_t b) {
// CHECK-LABEL: define <8 x i16> @test_vrsraq_n_u16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK-NEXT: [[TMP4:%.*]] = call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> [[TMP3]], <8 x i16> splat (i16 -1))
-// CHECK-NEXT: [[VRSRA_N:%.*]] = add <8 x i16> [[TMP2]], [[TMP4]]
+// CHECK-NEXT: [[TMP0:%.*]] = call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> [[B]], <8 x i16> splat (i16 -1))
+// CHECK-NEXT: [[VRSRA_N:%.*]] = add <8 x i16> [[A]], [[TMP0]]
// CHECK-NEXT: ret <8 x i16> [[VRSRA_N]]
//
uint16x8_t test_vrsraq_n_u16(uint16x8_t a, uint16x8_t b) {
@@ -18539,12 +17069,8 @@ uint16x8_t test_vrsraq_n_u16(uint16x8_t a, uint16x8_t b) {
// CHECK-LABEL: define <4 x i32> @test_vrsraq_n_u32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32> [[TMP3]], <4 x i32> splat (i32 -1))
-// CHECK-NEXT: [[VRSRA_N:%.*]] = add <4 x i32> [[TMP2]], [[TMP4]]
+// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32> [[B]], <4 x i32> splat (i32 -1))
+// CHECK-NEXT: [[VRSRA_N:%.*]] = add <4 x i32> [[A]], [[TMP0]]
// CHECK-NEXT: ret <4 x i32> [[VRSRA_N]]
//
uint32x4_t test_vrsraq_n_u32(uint32x4_t a, uint32x4_t b) {
@@ -18554,12 +17080,8 @@ uint32x4_t test_vrsraq_n_u32(uint32x4_t a, uint32x4_t b) {
// CHECK-LABEL: define <2 x i64> @test_vrsraq_n_u64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK-NEXT: [[TMP4:%.*]] = call <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64> [[TMP3]], <2 x i64> splat (i64 -1))
-// CHECK-NEXT: [[VRSRA_N:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]]
+// CHECK-NEXT: [[TMP0:%.*]] = call <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64> [[B]], <2 x i64> splat (i64 -1))
+// CHECK-NEXT: [[VRSRA_N:%.*]] = add <2 x i64> [[A]], [[TMP0]]
// CHECK-NEXT: ret <2 x i64> [[VRSRA_N]]
//
uint64x2_t test_vrsraq_n_u64(uint64x2_t a, uint64x2_t b) {
@@ -18569,8 +17091,6 @@ uint64x2_t test_vrsraq_n_u64(uint64x2_t a, uint64x2_t b) {
// CHECK-LABEL: define <8 x i8> @test_vrsubhn_s16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
// CHECK-NEXT: [[VRSUBHN_V2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vrsubhn.v8i8(<8 x i16> [[A]], <8 x i16> [[B]])
// CHECK-NEXT: ret <8 x i8> [[VRSUBHN_V2_I]]
//
@@ -18581,10 +17101,7 @@ int8x8_t test_vrsubhn_s16(int16x8_t a, int16x8_t b) {
// CHECK-LABEL: define <4 x i16> @test_vrsubhn_s32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
// CHECK-NEXT: [[VRSUBHN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrsubhn.v4i16(<4 x i32> [[A]], <4 x i32> [[B]])
-// CHECK-NEXT: [[VRSUBHN_V3_I:%.*]] = bitcast <4 x i16> [[VRSUBHN_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <4 x i16> [[VRSUBHN_V2_I]]
//
int16x4_t test_vrsubhn_s32(int32x4_t a, int32x4_t b) {
@@ -18594,10 +17111,7 @@ int16x4_t test_vrsubhn_s32(int32x4_t a, int32x4_t b) {
// CHECK-LABEL: define <2 x i32> @test_vrsubhn_s64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
// CHECK-NEXT: [[VRSUBHN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrsubhn.v2i32(<2 x i64> [[A]], <2 x i64> [[B]])
-// CHECK-NEXT: [[VRSUBHN_V3_I:%.*]] = bitcast <2 x i32> [[VRSUBHN_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <2 x i32> [[VRSUBHN_V2_I]]
//
int32x2_t test_vrsubhn_s64(int64x2_t a, int64x2_t b) {
@@ -18607,8 +17121,6 @@ int32x2_t test_vrsubhn_s64(int64x2_t a, int64x2_t b) {
// CHECK-LABEL: define <8 x i8> @test_vrsubhn_u16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
// CHECK-NEXT: [[VRSUBHN_V2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vrsubhn.v8i8(<8 x i16> [[A]], <8 x i16> [[B]])
// CHECK-NEXT: ret <8 x i8> [[VRSUBHN_V2_I]]
//
@@ -18619,10 +17131,7 @@ uint8x8_t test_vrsubhn_u16(uint16x8_t a, uint16x8_t b) {
// CHECK-LABEL: define <4 x i16> @test_vrsubhn_u32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
// CHECK-NEXT: [[VRSUBHN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrsubhn.v4i16(<4 x i32> [[A]], <4 x i32> [[B]])
-// CHECK-NEXT: [[VRSUBHN_V3_I:%.*]] = bitcast <4 x i16> [[VRSUBHN_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <4 x i16> [[VRSUBHN_V2_I]]
//
uint16x4_t test_vrsubhn_u32(uint32x4_t a, uint32x4_t b) {
@@ -18632,10 +17141,7 @@ uint16x4_t test_vrsubhn_u32(uint32x4_t a, uint32x4_t b) {
// CHECK-LABEL: define <2 x i32> @test_vrsubhn_u64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
// CHECK-NEXT: [[VRSUBHN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrsubhn.v2i32(<2 x i64> [[A]], <2 x i64> [[B]])
-// CHECK-NEXT: [[VRSUBHN_V3_I:%.*]] = bitcast <2 x i32> [[VRSUBHN_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <2 x i32> [[VRSUBHN_V2_I]]
//
uint32x2_t test_vrsubhn_u64(uint64x2_t a, uint64x2_t b) {
@@ -18645,7 +17151,7 @@ uint32x2_t test_vrsubhn_u64(uint64x2_t a, uint64x2_t b) {
// CHECK-LABEL: define <8 x i8> @test_vset_lane_u8(
// CHECK-SAME: i8 noundef zeroext [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <8 x i8> [[B]], i8 [[A]], i32 7
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <8 x i8> [[B]], i8 [[A]], i64 7
// CHECK-NEXT: ret <8 x i8> [[VSET_LANE]]
//
uint8x8_t test_vset_lane_u8(uint8_t a, uint8x8_t b) {
@@ -18655,7 +17161,7 @@ uint8x8_t test_vset_lane_u8(uint8_t a, uint8x8_t b) {
// CHECK-LABEL: define <4 x i16> @test_vset_lane_u16(
// CHECK-SAME: i16 noundef zeroext [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <4 x i16> [[B]], i16 [[A]], i32 3
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <4 x i16> [[B]], i16 [[A]], i64 3
// CHECK-NEXT: ret <4 x i16> [[VSET_LANE]]
//
uint16x4_t test_vset_lane_u16(uint16_t a, uint16x4_t b) {
@@ -18665,7 +17171,7 @@ uint16x4_t test_vset_lane_u16(uint16_t a, uint16x4_t b) {
// CHECK-LABEL: define <2 x i32> @test_vset_lane_u32(
// CHECK-SAME: i32 noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <2 x i32> [[B]], i32 [[A]], i32 1
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <2 x i32> [[B]], i32 [[A]], i64 1
// CHECK-NEXT: ret <2 x i32> [[VSET_LANE]]
//
uint32x2_t test_vset_lane_u32(uint32_t a, uint32x2_t b) {
@@ -18675,7 +17181,7 @@ uint32x2_t test_vset_lane_u32(uint32_t a, uint32x2_t b) {
// CHECK-LABEL: define <8 x i8> @test_vset_lane_s8(
// CHECK-SAME: i8 noundef signext [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <8 x i8> [[B]], i8 [[A]], i32 7
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <8 x i8> [[B]], i8 [[A]], i64 7
// CHECK-NEXT: ret <8 x i8> [[VSET_LANE]]
//
int8x8_t test_vset_lane_s8(int8_t a, int8x8_t b) {
@@ -18685,7 +17191,7 @@ int8x8_t test_vset_lane_s8(int8_t a, int8x8_t b) {
// CHECK-LABEL: define <4 x i16> @test_vset_lane_s16(
// CHECK-SAME: i16 noundef signext [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <4 x i16> [[B]], i16 [[A]], i32 3
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <4 x i16> [[B]], i16 [[A]], i64 3
// CHECK-NEXT: ret <4 x i16> [[VSET_LANE]]
//
int16x4_t test_vset_lane_s16(int16_t a, int16x4_t b) {
@@ -18695,7 +17201,7 @@ int16x4_t test_vset_lane_s16(int16_t a, int16x4_t b) {
// CHECK-LABEL: define <2 x i32> @test_vset_lane_s32(
// CHECK-SAME: i32 noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <2 x i32> [[B]], i32 [[A]], i32 1
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <2 x i32> [[B]], i32 [[A]], i64 1
// CHECK-NEXT: ret <2 x i32> [[VSET_LANE]]
//
int32x2_t test_vset_lane_s32(int32_t a, int32x2_t b) {
@@ -18705,7 +17211,7 @@ int32x2_t test_vset_lane_s32(int32_t a, int32x2_t b) {
// CHECK-LABEL: define <8 x i8> @test_vset_lane_p8(
// CHECK-SAME: i8 noundef signext [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <8 x i8> [[B]], i8 [[A]], i32 7
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <8 x i8> [[B]], i8 [[A]], i64 7
// CHECK-NEXT: ret <8 x i8> [[VSET_LANE]]
//
poly8x8_t test_vset_lane_p8(poly8_t a, poly8x8_t b) {
@@ -18715,7 +17221,7 @@ poly8x8_t test_vset_lane_p8(poly8_t a, poly8x8_t b) {
// CHECK-LABEL: define <4 x i16> @test_vset_lane_p16(
// CHECK-SAME: i16 noundef signext [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <4 x i16> [[B]], i16 [[A]], i32 3
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <4 x i16> [[B]], i16 [[A]], i64 3
// CHECK-NEXT: ret <4 x i16> [[VSET_LANE]]
//
poly16x4_t test_vset_lane_p16(poly16_t a, poly16x4_t b) {
@@ -18725,7 +17231,7 @@ poly16x4_t test_vset_lane_p16(poly16_t a, poly16x4_t b) {
// CHECK-LABEL: define <2 x float> @test_vset_lane_f32(
// CHECK-SAME: float noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <2 x float> [[B]], float [[A]], i32 1
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <2 x float> [[B]], float [[A]], i64 1
// CHECK-NEXT: ret <2 x float> [[VSET_LANE]]
//
float32x2_t test_vset_lane_f32(float32_t a, float32x2_t b) {
@@ -18735,18 +17241,11 @@ float32x2_t test_vset_lane_f32(float32_t a, float32x2_t b) {
// CHECK-LABEL: define <4 x half> @test_vset_lane_f16(
// CHECK-SAME: ptr noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__REINT_853:%.*]] = alloca half, align 2
-// CHECK-NEXT: [[__REINT1_853:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT: [[__REINT2_853:%.*]] = alloca <4 x i16>, align 8
-// CHECK-NEXT: [[TMP0:%.*]] = load half, ptr [[A]], align 2
-// CHECK-NEXT: store half [[TMP0]], ptr [[__REINT_853]], align 2
-// CHECK-NEXT: store <4 x half> [[B]], ptr [[__REINT1_853]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr [[__REINT_853]], align 2
-// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[__REINT1_853]], align 8
-// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP1]], i32 1
-// CHECK-NEXT: store <4 x i16> [[VSET_LANE]], ptr [[__REINT2_853]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = load <4 x half>, ptr [[__REINT2_853]], align 8
-// CHECK-NEXT: ret <4 x half> [[TMP3]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A]], align 2
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <4 x i16>
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <4 x i16> [[TMP1]], i16 [[TMP0]], i64 1
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[VSET_LANE]] to <4 x half>
+// CHECK-NEXT: ret <4 x half> [[TMP2]]
//
float16x4_t test_vset_lane_f16(float16_t *a, float16x4_t b) {
return vset_lane_f16(*a, b, 1);
@@ -18755,7 +17254,7 @@ float16x4_t test_vset_lane_f16(float16_t *a, float16x4_t b) {
// CHECK-LABEL: define <16 x i8> @test_vsetq_lane_u8(
// CHECK-SAME: i8 noundef zeroext [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <16 x i8> [[B]], i8 [[A]], i32 15
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <16 x i8> [[B]], i8 [[A]], i64 15
// CHECK-NEXT: ret <16 x i8> [[VSET_LANE]]
//
uint8x16_t test_vsetq_lane_u8(uint8_t a, uint8x16_t b) {
@@ -18765,7 +17264,7 @@ uint8x16_t test_vsetq_lane_u8(uint8_t a, uint8x16_t b) {
// CHECK-LABEL: define <8 x i16> @test_vsetq_lane_u16(
// CHECK-SAME: i16 noundef zeroext [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <8 x i16> [[B]], i16 [[A]], i32 7
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <8 x i16> [[B]], i16 [[A]], i64 7
// CHECK-NEXT: ret <8 x i16> [[VSET_LANE]]
//
uint16x8_t test_vsetq_lane_u16(uint16_t a, uint16x8_t b) {
@@ -18775,7 +17274,7 @@ uint16x8_t test_vsetq_lane_u16(uint16_t a, uint16x8_t b) {
// CHECK-LABEL: define <4 x i32> @test_vsetq_lane_u32(
// CHECK-SAME: i32 noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <4 x i32> [[B]], i32 [[A]], i32 3
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <4 x i32> [[B]], i32 [[A]], i64 3
// CHECK-NEXT: ret <4 x i32> [[VSET_LANE]]
//
uint32x4_t test_vsetq_lane_u32(uint32_t a, uint32x4_t b) {
@@ -18785,7 +17284,7 @@ uint32x4_t test_vsetq_lane_u32(uint32_t a, uint32x4_t b) {
// CHECK-LABEL: define <16 x i8> @test_vsetq_lane_s8(
// CHECK-SAME: i8 noundef signext [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <16 x i8> [[B]], i8 [[A]], i32 15
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <16 x i8> [[B]], i8 [[A]], i64 15
// CHECK-NEXT: ret <16 x i8> [[VSET_LANE]]
//
int8x16_t test_vsetq_lane_s8(int8_t a, int8x16_t b) {
@@ -18795,7 +17294,7 @@ int8x16_t test_vsetq_lane_s8(int8_t a, int8x16_t b) {
// CHECK-LABEL: define <8 x i16> @test_vsetq_lane_s16(
// CHECK-SAME: i16 noundef signext [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <8 x i16> [[B]], i16 [[A]], i32 7
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <8 x i16> [[B]], i16 [[A]], i64 7
// CHECK-NEXT: ret <8 x i16> [[VSET_LANE]]
//
int16x8_t test_vsetq_lane_s16(int16_t a, int16x8_t b) {
@@ -18805,7 +17304,7 @@ int16x8_t test_vsetq_lane_s16(int16_t a, int16x8_t b) {
// CHECK-LABEL: define <4 x i32> @test_vsetq_lane_s32(
// CHECK-SAME: i32 noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <4 x i32> [[B]], i32 [[A]], i32 3
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <4 x i32> [[B]], i32 [[A]], i64 3
// CHECK-NEXT: ret <4 x i32> [[VSET_LANE]]
//
int32x4_t test_vsetq_lane_s32(int32_t a, int32x4_t b) {
@@ -18815,7 +17314,7 @@ int32x4_t test_vsetq_lane_s32(int32_t a, int32x4_t b) {
// CHECK-LABEL: define <16 x i8> @test_vsetq_lane_p8(
// CHECK-SAME: i8 noundef signext [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <16 x i8> [[B]], i8 [[A]], i32 15
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <16 x i8> [[B]], i8 [[A]], i64 15
// CHECK-NEXT: ret <16 x i8> [[VSET_LANE]]
//
poly8x16_t test_vsetq_lane_p8(poly8_t a, poly8x16_t b) {
@@ -18825,7 +17324,7 @@ poly8x16_t test_vsetq_lane_p8(poly8_t a, poly8x16_t b) {
// CHECK-LABEL: define <8 x i16> @test_vsetq_lane_p16(
// CHECK-SAME: i16 noundef signext [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <8 x i16> [[B]], i16 [[A]], i32 7
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <8 x i16> [[B]], i16 [[A]], i64 7
// CHECK-NEXT: ret <8 x i16> [[VSET_LANE]]
//
poly16x8_t test_vsetq_lane_p16(poly16_t a, poly16x8_t b) {
@@ -18835,7 +17334,7 @@ poly16x8_t test_vsetq_lane_p16(poly16_t a, poly16x8_t b) {
// CHECK-LABEL: define <4 x float> @test_vsetq_lane_f32(
// CHECK-SAME: float noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <4 x float> [[B]], float [[A]], i32 3
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <4 x float> [[B]], float [[A]], i64 3
// CHECK-NEXT: ret <4 x float> [[VSET_LANE]]
//
float32x4_t test_vsetq_lane_f32(float32_t a, float32x4_t b) {
@@ -18845,18 +17344,11 @@ float32x4_t test_vsetq_lane_f32(float32_t a, float32x4_t b) {
// CHECK-LABEL: define <8 x half> @test_vsetq_lane_f16(
// CHECK-SAME: ptr noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__REINT_855:%.*]] = alloca half, align 2
-// CHECK-NEXT: [[__REINT1_855:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT: [[__REINT2_855:%.*]] = alloca <8 x i16>, align 16
-// CHECK-NEXT: [[TMP0:%.*]] = load half, ptr [[A]], align 2
-// CHECK-NEXT: store half [[TMP0]], ptr [[__REINT_855]], align 2
-// CHECK-NEXT: store <8 x half> [[B]], ptr [[__REINT1_855]], align 16
-// CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr [[__REINT_855]], align 2
-// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[__REINT1_855]], align 16
-// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP1]], i32 3
-// CHECK-NEXT: store <8 x i16> [[VSET_LANE]], ptr [[__REINT2_855]], align 16
-// CHECK-NEXT: [[TMP3:%.*]] = load <8 x half>, ptr [[__REINT2_855]], align 16
-// CHECK-NEXT: ret <8 x half> [[TMP3]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A]], align 2
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <8 x i16>
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <8 x i16> [[TMP1]], i16 [[TMP0]], i64 3
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[VSET_LANE]] to <8 x half>
+// CHECK-NEXT: ret <8 x half> [[TMP2]]
//
float16x8_t test_vsetq_lane_f16(float16_t *a, float16x8_t b) {
return vsetq_lane_f16(*a, b, 3);
@@ -18865,7 +17357,7 @@ float16x8_t test_vsetq_lane_f16(float16_t *a, float16x8_t b) {
// CHECK-LABEL: define <1 x i64> @test_vset_lane_s64(
// CHECK-SAME: i64 noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <1 x i64> [[B]], i64 [[A]], i32 0
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <1 x i64> poison, i64 [[A]], i64 0
// CHECK-NEXT: ret <1 x i64> [[VSET_LANE]]
//
int64x1_t test_vset_lane_s64(int64_t a, int64x1_t b) {
@@ -18875,7 +17367,7 @@ int64x1_t test_vset_lane_s64(int64_t a, int64x1_t b) {
// CHECK-LABEL: define <1 x i64> @test_vset_lane_u64(
// CHECK-SAME: i64 noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <1 x i64> [[B]], i64 [[A]], i32 0
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <1 x i64> poison, i64 [[A]], i64 0
// CHECK-NEXT: ret <1 x i64> [[VSET_LANE]]
//
uint64x1_t test_vset_lane_u64(uint64_t a, uint64x1_t b) {
@@ -18885,7 +17377,7 @@ uint64x1_t test_vset_lane_u64(uint64_t a, uint64x1_t b) {
// CHECK-LABEL: define <2 x i64> @test_vsetq_lane_s64(
// CHECK-SAME: i64 noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <2 x i64> [[B]], i64 [[A]], i32 1
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <2 x i64> [[B]], i64 [[A]], i64 1
// CHECK-NEXT: ret <2 x i64> [[VSET_LANE]]
//
int64x2_t test_vsetq_lane_s64(int64_t a, int64x2_t b) {
@@ -18895,7 +17387,7 @@ int64x2_t test_vsetq_lane_s64(int64_t a, int64x2_t b) {
// CHECK-LABEL: define <2 x i64> @test_vsetq_lane_u64(
// CHECK-SAME: i64 noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <2 x i64> [[B]], i64 [[A]], i32 1
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <2 x i64> [[B]], i64 [[A]], i64 1
// CHECK-NEXT: ret <2 x i64> [[VSET_LANE]]
//
uint64x2_t test_vsetq_lane_u64(uint64_t a, uint64x2_t b) {
@@ -18915,10 +17407,7 @@ int8x8_t test_vshl_s8(int8x8_t a, int8x8_t b) {
// CHECK-LABEL: define <4 x i16> @test_vshl_s16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
// CHECK-NEXT: [[VSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vshifts.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
-// CHECK-NEXT: [[VSHL_V3_I:%.*]] = bitcast <4 x i16> [[VSHL_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <4 x i16> [[VSHL_V2_I]]
//
int16x4_t test_vshl_s16(int16x4_t a, int16x4_t b) {
@@ -18928,10 +17417,7 @@ int16x4_t test_vshl_s16(int16x4_t a, int16x4_t b) {
// CHECK-LABEL: define <2 x i32> @test_vshl_s32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
// CHECK-NEXT: [[VSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vshifts.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
-// CHECK-NEXT: [[VSHL_V3_I:%.*]] = bitcast <2 x i32> [[VSHL_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <2 x i32> [[VSHL_V2_I]]
//
int32x2_t test_vshl_s32(int32x2_t a, int32x2_t b) {
@@ -18941,10 +17427,7 @@ int32x2_t test_vshl_s32(int32x2_t a, int32x2_t b) {
// CHECK-LABEL: define <1 x i64> @test_vshl_s64(
// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
// CHECK-NEXT: [[VSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vshifts.v1i64(<1 x i64> [[A]], <1 x i64> [[B]])
-// CHECK-NEXT: [[VSHL_V3_I:%.*]] = bitcast <1 x i64> [[VSHL_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <1 x i64> [[VSHL_V2_I]]
//
int64x1_t test_vshl_s64(int64x1_t a, int64x1_t b) {
@@ -18964,10 +17447,7 @@ uint8x8_t test_vshl_u8(uint8x8_t a, int8x8_t b) {
// CHECK-LABEL: define <4 x i16> @test_vshl_u16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
// CHECK-NEXT: [[VSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftu.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
-// CHECK-NEXT: [[VSHL_V3_I:%.*]] = bitcast <4 x i16> [[VSHL_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <4 x i16> [[VSHL_V2_I]]
//
uint16x4_t test_vshl_u16(uint16x4_t a, int16x4_t b) {
@@ -18977,10 +17457,7 @@ uint16x4_t test_vshl_u16(uint16x4_t a, int16x4_t b) {
// CHECK-LABEL: define <2 x i32> @test_vshl_u32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
// CHECK-NEXT: [[VSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vshiftu.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
-// CHECK-NEXT: [[VSHL_V3_I:%.*]] = bitcast <2 x i32> [[VSHL_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <2 x i32> [[VSHL_V2_I]]
//
uint32x2_t test_vshl_u32(uint32x2_t a, int32x2_t b) {
@@ -18990,10 +17467,7 @@ uint32x2_t test_vshl_u32(uint32x2_t a, int32x2_t b) {
// CHECK-LABEL: define <1 x i64> @test_vshl_u64(
// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
// CHECK-NEXT: [[VSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vshiftu.v1i64(<1 x i64> [[A]], <1 x i64> [[B]])
-// CHECK-NEXT: [[VSHL_V3_I:%.*]] = bitcast <1 x i64> [[VSHL_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <1 x i64> [[VSHL_V2_I]]
//
uint64x1_t test_vshl_u64(uint64x1_t a, int64x1_t b) {
@@ -19013,10 +17487,7 @@ int8x16_t test_vshlq_s8(int8x16_t a, int8x16_t b) {
// CHECK-LABEL: define <8 x i16> @test_vshlq_s16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
// CHECK-NEXT: [[VSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vshifts.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
-// CHECK-NEXT: [[VSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VSHLQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <8 x i16> [[VSHLQ_V2_I]]
//
int16x8_t test_vshlq_s16(int16x8_t a, int16x8_t b) {
@@ -19026,10 +17497,7 @@ int16x8_t test_vshlq_s16(int16x8_t a, int16x8_t b) {
// CHECK-LABEL: define <4 x i32> @test_vshlq_s32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
// CHECK-NEXT: [[VSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vshifts.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
-// CHECK-NEXT: [[VSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VSHLQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VSHLQ_V2_I]]
//
int32x4_t test_vshlq_s32(int32x4_t a, int32x4_t b) {
@@ -19039,10 +17507,7 @@ int32x4_t test_vshlq_s32(int32x4_t a, int32x4_t b) {
// CHECK-LABEL: define <2 x i64> @test_vshlq_s64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
// CHECK-NEXT: [[VSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vshifts.v2i64(<2 x i64> [[A]], <2 x i64> [[B]])
-// CHECK-NEXT: [[VSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VSHLQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <2 x i64> [[VSHLQ_V2_I]]
//
int64x2_t test_vshlq_s64(int64x2_t a, int64x2_t b) {
@@ -19062,10 +17527,7 @@ uint8x16_t test_vshlq_u8(uint8x16_t a, int8x16_t b) {
// CHECK-LABEL: define <8 x i16> @test_vshlq_u16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
// CHECK-NEXT: [[VSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftu.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
-// CHECK-NEXT: [[VSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VSHLQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <8 x i16> [[VSHLQ_V2_I]]
//
uint16x8_t test_vshlq_u16(uint16x8_t a, int16x8_t b) {
@@ -19075,10 +17537,7 @@ uint16x8_t test_vshlq_u16(uint16x8_t a, int16x8_t b) {
// CHECK-LABEL: define <4 x i32> @test_vshlq_u32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
// CHECK-NEXT: [[VSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vshiftu.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
-// CHECK-NEXT: [[VSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VSHLQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VSHLQ_V2_I]]
//
uint32x4_t test_vshlq_u32(uint32x4_t a, int32x4_t b) {
@@ -19088,10 +17547,7 @@ uint32x4_t test_vshlq_u32(uint32x4_t a, int32x4_t b) {
// CHECK-LABEL: define <2 x i64> @test_vshlq_u64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
// CHECK-NEXT: [[VSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vshiftu.v2i64(<2 x i64> [[A]], <2 x i64> [[B]])
-// CHECK-NEXT: [[VSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VSHLQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <2 x i64> [[VSHLQ_V2_I]]
//
uint64x2_t test_vshlq_u64(uint64x2_t a, int64x2_t b) {
@@ -19102,7 +17558,7 @@ uint64x2_t test_vshlq_u64(uint64x2_t a, int64x2_t b) {
// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[TMP0:%.*]] = sext <8 x i8> [[A]] to <8 x i16>
-// CHECK-NEXT: [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], splat (i16 1)
+// CHECK-NEXT: [[VSHLL_N:%.*]] = shl nsw <8 x i16> [[TMP0]], splat (i16 1)
// CHECK-NEXT: ret <8 x i16> [[VSHLL_N]]
//
int16x8_t test_vshll_n_s8(int8x8_t a) {
@@ -19112,10 +17568,8 @@ int16x8_t test_vshll_n_s8(int8x8_t a) {
// CHECK-LABEL: define <4 x i32> @test_vshll_n_s16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
-// CHECK-NEXT: [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 1)
+// CHECK-NEXT: [[TMP0:%.*]] = sext <4 x i16> [[A]] to <4 x i32>
+// CHECK-NEXT: [[VSHLL_N:%.*]] = shl nsw <4 x i32> [[TMP0]], splat (i32 1)
// CHECK-NEXT: ret <4 x i32> [[VSHLL_N]]
//
int32x4_t test_vshll_n_s16(int16x4_t a) {
@@ -19125,10 +17579,8 @@ int32x4_t test_vshll_n_s16(int16x4_t a) {
// CHECK-LABEL: define <2 x i64> @test_vshll_n_s32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
-// CHECK-NEXT: [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], splat (i64 1)
+// CHECK-NEXT: [[TMP0:%.*]] = sext <2 x i32> [[A]] to <2 x i64>
+// CHECK-NEXT: [[VSHLL_N:%.*]] = shl nsw <2 x i64> [[TMP0]], splat (i64 1)
// CHECK-NEXT: ret <2 x i64> [[VSHLL_N]]
//
int64x2_t test_vshll_n_s32(int32x2_t a) {
@@ -19139,7 +17591,7 @@ int64x2_t test_vshll_n_s32(int32x2_t a) {
// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[TMP0:%.*]] = zext <8 x i8> [[A]] to <8 x i16>
-// CHECK-NEXT: [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], splat (i16 1)
+// CHECK-NEXT: [[VSHLL_N:%.*]] = shl nuw nsw <8 x i16> [[TMP0]], splat (i16 1)
// CHECK-NEXT: ret <8 x i16> [[VSHLL_N]]
//
uint16x8_t test_vshll_n_u8(uint8x8_t a) {
@@ -19149,10 +17601,8 @@ uint16x8_t test_vshll_n_u8(uint8x8_t a) {
// CHECK-LABEL: define <4 x i32> @test_vshll_n_u16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
-// CHECK-NEXT: [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 1)
+// CHECK-NEXT: [[TMP0:%.*]] = zext <4 x i16> [[A]] to <4 x i32>
+// CHECK-NEXT: [[VSHLL_N:%.*]] = shl nuw nsw <4 x i32> [[TMP0]], splat (i32 1)
// CHECK-NEXT: ret <4 x i32> [[VSHLL_N]]
//
uint32x4_t test_vshll_n_u16(uint16x4_t a) {
@@ -19162,10 +17612,8 @@ uint32x4_t test_vshll_n_u16(uint16x4_t a) {
// CHECK-LABEL: define <2 x i64> @test_vshll_n_u32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
-// CHECK-NEXT: [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], splat (i64 1)
+// CHECK-NEXT: [[TMP0:%.*]] = zext <2 x i32> [[A]] to <2 x i64>
+// CHECK-NEXT: [[VSHLL_N:%.*]] = shl nuw nsw <2 x i64> [[TMP0]], splat (i64 1)
// CHECK-NEXT: ret <2 x i64> [[VSHLL_N]]
//
uint64x2_t test_vshll_n_u32(uint32x2_t a) {
@@ -19185,9 +17633,7 @@ int8x8_t test_vshl_n_s8(int8x8_t a) {
// CHECK-LABEL: define <4 x i16> @test_vshl_n_s16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[VSHL_N:%.*]] = shl <4 x i16> [[TMP1]], splat (i16 1)
+// CHECK-NEXT: [[VSHL_N:%.*]] = shl <4 x i16> [[A]], splat (i16 1)
// CHECK-NEXT: ret <4 x i16> [[VSHL_N]]
//
int16x4_t test_vshl_n_s16(int16x4_t a) {
@@ -19197,9 +17643,7 @@ int16x4_t test_vshl_n_s16(int16x4_t a) {
// CHECK-LABEL: define <2 x i32> @test_vshl_n_s32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[VSHL_N:%.*]] = shl <2 x i32> [[TMP1]], splat (i32 1)
+// CHECK-NEXT: [[VSHL_N:%.*]] = shl <2 x i32> [[A]], splat (i32 1)
// CHECK-NEXT: ret <2 x i32> [[VSHL_N]]
//
int32x2_t test_vshl_n_s32(int32x2_t a) {
@@ -19209,9 +17653,7 @@ int32x2_t test_vshl_n_s32(int32x2_t a) {
// CHECK-LABEL: define <1 x i64> @test_vshl_n_s64(
// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK-NEXT: [[VSHL_N:%.*]] = shl <1 x i64> [[TMP1]], splat (i64 1)
+// CHECK-NEXT: [[VSHL_N:%.*]] = shl <1 x i64> [[A]], splat (i64 1)
// CHECK-NEXT: ret <1 x i64> [[VSHL_N]]
//
int64x1_t test_vshl_n_s64(int64x1_t a) {
@@ -19231,9 +17673,7 @@ uint8x8_t test_vshl_n_u8(uint8x8_t a) {
// CHECK-LABEL: define <4 x i16> @test_vshl_n_u16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[VSHL_N:%.*]] = shl <4 x i16> [[TMP1]], splat (i16 1)
+// CHECK-NEXT: [[VSHL_N:%.*]] = shl <4 x i16> [[A]], splat (i16 1)
// CHECK-NEXT: ret <4 x i16> [[VSHL_N]]
//
uint16x4_t test_vshl_n_u16(uint16x4_t a) {
@@ -19243,9 +17683,7 @@ uint16x4_t test_vshl_n_u16(uint16x4_t a) {
// CHECK-LABEL: define <2 x i32> @test_vshl_n_u32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[VSHL_N:%.*]] = shl <2 x i32> [[TMP1]], splat (i32 1)
+// CHECK-NEXT: [[VSHL_N:%.*]] = shl <2 x i32> [[A]], splat (i32 1)
// CHECK-NEXT: ret <2 x i32> [[VSHL_N]]
//
uint32x2_t test_vshl_n_u32(uint32x2_t a) {
@@ -19255,9 +17693,7 @@ uint32x2_t test_vshl_n_u32(uint32x2_t a) {
// CHECK-LABEL: define <1 x i64> @test_vshl_n_u64(
// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK-NEXT: [[VSHL_N:%.*]] = shl <1 x i64> [[TMP1]], splat (i64 1)
+// CHECK-NEXT: [[VSHL_N:%.*]] = shl <1 x i64> [[A]], splat (i64 1)
// CHECK-NEXT: ret <1 x i64> [[VSHL_N]]
//
uint64x1_t test_vshl_n_u64(uint64x1_t a) {
@@ -19277,9 +17713,7 @@ int8x16_t test_vshlq_n_s8(int8x16_t a) {
// CHECK-LABEL: define <8 x i16> @test_vshlq_n_s16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[VSHL_N:%.*]] = shl <8 x i16> [[TMP1]], splat (i16 1)
+// CHECK-NEXT: [[VSHL_N:%.*]] = shl <8 x i16> [[A]], splat (i16 1)
// CHECK-NEXT: ret <8 x i16> [[VSHL_N]]
//
int16x8_t test_vshlq_n_s16(int16x8_t a) {
@@ -19289,9 +17723,7 @@ int16x8_t test_vshlq_n_s16(int16x8_t a) {
// CHECK-LABEL: define <4 x i32> @test_vshlq_n_s32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[VSHL_N:%.*]] = shl <4 x i32> [[TMP1]], splat (i32 1)
+// CHECK-NEXT: [[VSHL_N:%.*]] = shl <4 x i32> [[A]], splat (i32 1)
// CHECK-NEXT: ret <4 x i32> [[VSHL_N]]
//
int32x4_t test_vshlq_n_s32(int32x4_t a) {
@@ -19301,9 +17733,7 @@ int32x4_t test_vshlq_n_s32(int32x4_t a) {
// CHECK-LABEL: define <2 x i64> @test_vshlq_n_s64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK-NEXT: [[VSHL_N:%.*]] = shl <2 x i64> [[TMP1]], splat (i64 1)
+// CHECK-NEXT: [[VSHL_N:%.*]] = shl <2 x i64> [[A]], splat (i64 1)
// CHECK-NEXT: ret <2 x i64> [[VSHL_N]]
//
int64x2_t test_vshlq_n_s64(int64x2_t a) {
@@ -19323,9 +17753,7 @@ uint8x16_t test_vshlq_n_u8(uint8x16_t a) {
// CHECK-LABEL: define <8 x i16> @test_vshlq_n_u16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[VSHL_N:%.*]] = shl <8 x i16> [[TMP1]], splat (i16 1)
+// CHECK-NEXT: [[VSHL_N:%.*]] = shl <8 x i16> [[A]], splat (i16 1)
// CHECK-NEXT: ret <8 x i16> [[VSHL_N]]
//
uint16x8_t test_vshlq_n_u16(uint16x8_t a) {
@@ -19335,9 +17763,7 @@ uint16x8_t test_vshlq_n_u16(uint16x8_t a) {
// CHECK-LABEL: define <4 x i32> @test_vshlq_n_u32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[VSHL_N:%.*]] = shl <4 x i32> [[TMP1]], splat (i32 1)
+// CHECK-NEXT: [[VSHL_N:%.*]] = shl <4 x i32> [[A]], splat (i32 1)
// CHECK-NEXT: ret <4 x i32> [[VSHL_N]]
//
uint32x4_t test_vshlq_n_u32(uint32x4_t a) {
@@ -19347,9 +17773,7 @@ uint32x4_t test_vshlq_n_u32(uint32x4_t a) {
// CHECK-LABEL: define <2 x i64> @test_vshlq_n_u64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK-NEXT: [[VSHL_N:%.*]] = shl <2 x i64> [[TMP1]], splat (i64 1)
+// CHECK-NEXT: [[VSHL_N:%.*]] = shl <2 x i64> [[A]], splat (i64 1)
// CHECK-NEXT: ret <2 x i64> [[VSHL_N]]
//
uint64x2_t test_vshlq_n_u64(uint64x2_t a) {
@@ -19359,10 +17783,8 @@ uint64x2_t test_vshlq_n_u64(uint64x2_t a) {
// CHECK-LABEL: define <8 x i8> @test_vshrn_n_s16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[TMP2:%.*]] = ashr <8 x i16> [[TMP1]], splat (i16 1)
-// CHECK-NEXT: [[VSHRN_N:%.*]] = trunc <8 x i16> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[TMP0:%.*]] = lshr <8 x i16> [[A]], splat (i16 1)
+// CHECK-NEXT: [[VSHRN_N:%.*]] = trunc <8 x i16> [[TMP0]] to <8 x i8>
// CHECK-NEXT: ret <8 x i8> [[VSHRN_N]]
//
int8x8_t test_vshrn_n_s16(int16x8_t a) {
@@ -19372,10 +17794,8 @@ int8x8_t test_vshrn_n_s16(int16x8_t a) {
// CHECK-LABEL: define <4 x i16> @test_vshrn_n_s32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], splat (i32 1)
-// CHECK-NEXT: [[VSHRN_N:%.*]] = trunc <4 x i32> [[TMP2]] to <4 x i16>
+// CHECK-NEXT: [[TMP0:%.*]] = lshr <4 x i32> [[A]], splat (i32 1)
+// CHECK-NEXT: [[VSHRN_N:%.*]] = trunc <4 x i32> [[TMP0]] to <4 x i16>
// CHECK-NEXT: ret <4 x i16> [[VSHRN_N]]
//
int16x4_t test_vshrn_n_s32(int32x4_t a) {
@@ -19385,10 +17805,8 @@ int16x4_t test_vshrn_n_s32(int32x4_t a) {
// CHECK-LABEL: define <2 x i32> @test_vshrn_n_s64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK-NEXT: [[TMP2:%.*]] = ashr <2 x i64> [[TMP1]], splat (i64 1)
-// CHECK-NEXT: [[VSHRN_N:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i32>
+// CHECK-NEXT: [[TMP0:%.*]] = lshr <2 x i64> [[A]], splat (i64 1)
+// CHECK-NEXT: [[VSHRN_N:%.*]] = trunc <2 x i64> [[TMP0]] to <2 x i32>
// CHECK-NEXT: ret <2 x i32> [[VSHRN_N]]
//
int32x2_t test_vshrn_n_s64(int64x2_t a) {
@@ -19398,10 +17816,8 @@ int32x2_t test_vshrn_n_s64(int64x2_t a) {
// CHECK-LABEL: define <8 x i8> @test_vshrn_n_u16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[TMP2:%.*]] = lshr <8 x i16> [[TMP1]], splat (i16 1)
-// CHECK-NEXT: [[VSHRN_N:%.*]] = trunc <8 x i16> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[TMP0:%.*]] = lshr <8 x i16> [[A]], splat (i16 1)
+// CHECK-NEXT: [[VSHRN_N:%.*]] = trunc <8 x i16> [[TMP0]] to <8 x i8>
// CHECK-NEXT: ret <8 x i8> [[VSHRN_N]]
//
uint8x8_t test_vshrn_n_u16(uint16x8_t a) {
@@ -19411,10 +17827,8 @@ uint8x8_t test_vshrn_n_u16(uint16x8_t a) {
// CHECK-LABEL: define <4 x i16> @test_vshrn_n_u32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[TMP2:%.*]] = lshr <4 x i32> [[TMP1]], splat (i32 1)
-// CHECK-NEXT: [[VSHRN_N:%.*]] = trunc <4 x i32> [[TMP2]] to <4 x i16>
+// CHECK-NEXT: [[TMP0:%.*]] = lshr <4 x i32> [[A]], splat (i32 1)
+// CHECK-NEXT: [[VSHRN_N:%.*]] = trunc <4 x i32> [[TMP0]] to <4 x i16>
// CHECK-NEXT: ret <4 x i16> [[VSHRN_N]]
//
uint16x4_t test_vshrn_n_u32(uint32x4_t a) {
@@ -19424,10 +17838,8 @@ uint16x4_t test_vshrn_n_u32(uint32x4_t a) {
// CHECK-LABEL: define <2 x i32> @test_vshrn_n_u64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK-NEXT: [[TMP2:%.*]] = lshr <2 x i64> [[TMP1]], splat (i64 1)
-// CHECK-NEXT: [[VSHRN_N:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i32>
+// CHECK-NEXT: [[TMP0:%.*]] = lshr <2 x i64> [[A]], splat (i64 1)
+// CHECK-NEXT: [[VSHRN_N:%.*]] = trunc <2 x i64> [[TMP0]] to <2 x i32>
// CHECK-NEXT: ret <2 x i32> [[VSHRN_N]]
//
uint32x2_t test_vshrn_n_u64(uint64x2_t a) {
@@ -19447,9 +17859,7 @@ int8x8_t test_vshr_n_s8(int8x8_t a) {
// CHECK-LABEL: define <4 x i16> @test_vshr_n_s16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[VSHR_N:%.*]] = ashr <4 x i16> [[TMP1]], splat (i16 1)
+// CHECK-NEXT: [[VSHR_N:%.*]] = ashr <4 x i16> [[A]], splat (i16 1)
// CHECK-NEXT: ret <4 x i16> [[VSHR_N]]
//
int16x4_t test_vshr_n_s16(int16x4_t a) {
@@ -19459,9 +17869,7 @@ int16x4_t test_vshr_n_s16(int16x4_t a) {
// CHECK-LABEL: define <2 x i32> @test_vshr_n_s32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[VSHR_N:%.*]] = ashr <2 x i32> [[TMP1]], splat (i32 1)
+// CHECK-NEXT: [[VSHR_N:%.*]] = ashr <2 x i32> [[A]], splat (i32 1)
// CHECK-NEXT: ret <2 x i32> [[VSHR_N]]
//
int32x2_t test_vshr_n_s32(int32x2_t a) {
@@ -19471,9 +17879,7 @@ int32x2_t test_vshr_n_s32(int32x2_t a) {
// CHECK-LABEL: define <1 x i64> @test_vshr_n_s64(
// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK-NEXT: [[VSHR_N:%.*]] = ashr <1 x i64> [[TMP1]], splat (i64 1)
+// CHECK-NEXT: [[VSHR_N:%.*]] = ashr <1 x i64> [[A]], splat (i64 1)
// CHECK-NEXT: ret <1 x i64> [[VSHR_N]]
//
int64x1_t test_vshr_n_s64(int64x1_t a) {
@@ -19493,9 +17899,7 @@ uint8x8_t test_vshr_n_u8(uint8x8_t a) {
// CHECK-LABEL: define <4 x i16> @test_vshr_n_u16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[VSHR_N:%.*]] = lshr <4 x i16> [[TMP1]], splat (i16 1)
+// CHECK-NEXT: [[VSHR_N:%.*]] = lshr <4 x i16> [[A]], splat (i16 1)
// CHECK-NEXT: ret <4 x i16> [[VSHR_N]]
//
uint16x4_t test_vshr_n_u16(uint16x4_t a) {
@@ -19505,9 +17909,7 @@ uint16x4_t test_vshr_n_u16(uint16x4_t a) {
// CHECK-LABEL: define <2 x i32> @test_vshr_n_u32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[VSHR_N:%.*]] = lshr <2 x i32> [[TMP1]], splat (i32 1)
+// CHECK-NEXT: [[VSHR_N:%.*]] = lshr <2 x i32> [[A]], splat (i32 1)
// CHECK-NEXT: ret <2 x i32> [[VSHR_N]]
//
uint32x2_t test_vshr_n_u32(uint32x2_t a) {
@@ -19517,9 +17919,7 @@ uint32x2_t test_vshr_n_u32(uint32x2_t a) {
// CHECK-LABEL: define <1 x i64> @test_vshr_n_u64(
// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK-NEXT: [[VSHR_N:%.*]] = lshr <1 x i64> [[TMP1]], splat (i64 1)
+// CHECK-NEXT: [[VSHR_N:%.*]] = lshr <1 x i64> [[A]], splat (i64 1)
// CHECK-NEXT: ret <1 x i64> [[VSHR_N]]
//
uint64x1_t test_vshr_n_u64(uint64x1_t a) {
@@ -19539,9 +17939,7 @@ int8x16_t test_vshrq_n_s8(int8x16_t a) {
// CHECK-LABEL: define <8 x i16> @test_vshrq_n_s16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[VSHR_N:%.*]] = ashr <8 x i16> [[TMP1]], splat (i16 1)
+// CHECK-NEXT: [[VSHR_N:%.*]] = ashr <8 x i16> [[A]], splat (i16 1)
// CHECK-NEXT: ret <8 x i16> [[VSHR_N]]
//
int16x8_t test_vshrq_n_s16(int16x8_t a) {
@@ -19551,9 +17949,7 @@ int16x8_t test_vshrq_n_s16(int16x8_t a) {
// CHECK-LABEL: define <4 x i32> @test_vshrq_n_s32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[VSHR_N:%.*]] = ashr <4 x i32> [[TMP1]], splat (i32 1)
+// CHECK-NEXT: [[VSHR_N:%.*]] = ashr <4 x i32> [[A]], splat (i32 1)
// CHECK-NEXT: ret <4 x i32> [[VSHR_N]]
//
int32x4_t test_vshrq_n_s32(int32x4_t a) {
@@ -19563,9 +17959,7 @@ int32x4_t test_vshrq_n_s32(int32x4_t a) {
// CHECK-LABEL: define <2 x i64> @test_vshrq_n_s64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK-NEXT: [[VSHR_N:%.*]] = ashr <2 x i64> [[TMP1]], splat (i64 1)
+// CHECK-NEXT: [[VSHR_N:%.*]] = ashr <2 x i64> [[A]], splat (i64 1)
// CHECK-NEXT: ret <2 x i64> [[VSHR_N]]
//
int64x2_t test_vshrq_n_s64(int64x2_t a) {
@@ -19585,9 +17979,7 @@ uint8x16_t test_vshrq_n_u8(uint8x16_t a) {
// CHECK-LABEL: define <8 x i16> @test_vshrq_n_u16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[VSHR_N:%.*]] = lshr <8 x i16> [[TMP1]], splat (i16 1)
+// CHECK-NEXT: [[VSHR_N:%.*]] = lshr <8 x i16> [[A]], splat (i16 1)
// CHECK-NEXT: ret <8 x i16> [[VSHR_N]]
//
uint16x8_t test_vshrq_n_u16(uint16x8_t a) {
@@ -19597,9 +17989,7 @@ uint16x8_t test_vshrq_n_u16(uint16x8_t a) {
// CHECK-LABEL: define <4 x i32> @test_vshrq_n_u32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[VSHR_N:%.*]] = lshr <4 x i32> [[TMP1]], splat (i32 1)
+// CHECK-NEXT: [[VSHR_N:%.*]] = lshr <4 x i32> [[A]], splat (i32 1)
// CHECK-NEXT: ret <4 x i32> [[VSHR_N]]
//
uint32x4_t test_vshrq_n_u32(uint32x4_t a) {
@@ -19609,9 +17999,7 @@ uint32x4_t test_vshrq_n_u32(uint32x4_t a) {
// CHECK-LABEL: define <2 x i64> @test_vshrq_n_u64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK-NEXT: [[VSHR_N:%.*]] = lshr <2 x i64> [[TMP1]], splat (i64 1)
+// CHECK-NEXT: [[VSHR_N:%.*]] = lshr <2 x i64> [[A]], splat (i64 1)
// CHECK-NEXT: ret <2 x i64> [[VSHR_N]]
//
uint64x2_t test_vshrq_n_u64(uint64x2_t a) {
@@ -19631,11 +18019,7 @@ int8x8_t test_vsli_n_s8(int8x8_t a, int8x8_t b) {
// CHECK-LABEL: define <4 x i16> @test_vsli_n_s16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
-// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK-NEXT: [[VSLI_N2:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], <4 x i16> splat (i16 1))
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> [[A]], <4 x i16> [[B]], <4 x i16> splat (i16 1))
// CHECK-NEXT: ret <4 x i16> [[VSLI_N2]]
//
int16x4_t test_vsli_n_s16(int16x4_t a, int16x4_t b) {
@@ -19645,11 +18029,7 @@ int16x4_t test_vsli_n_s16(int16x4_t a, int16x4_t b) {
// CHECK-LABEL: define <2 x i32> @test_vsli_n_s32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
-// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK-NEXT: [[VSLI_N2:%.*]] = call <2 x i32> @llvm.arm.neon.vshiftins.v2i32(<2 x i32> [[VSLI_N]], <2 x i32> [[VSLI_N1]], <2 x i32> splat (i32 1))
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <2 x i32> @llvm.arm.neon.vshiftins.v2i32(<2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> splat (i32 1))
// CHECK-NEXT: ret <2 x i32> [[VSLI_N2]]
//
int32x2_t test_vsli_n_s32(int32x2_t a, int32x2_t b) {
@@ -19659,11 +18039,7 @@ int32x2_t test_vsli_n_s32(int32x2_t a, int32x2_t b) {
// CHECK-LABEL: define <1 x i64> @test_vsli_n_s64(
// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
-// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK-NEXT: [[VSLI_N2:%.*]] = call <1 x i64> @llvm.arm.neon.vshiftins.v1i64(<1 x i64> [[VSLI_N]], <1 x i64> [[VSLI_N1]], <1 x i64> splat (i64 1))
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <1 x i64> @llvm.arm.neon.vshiftins.v1i64(<1 x i64> [[A]], <1 x i64> [[B]], <1 x i64> splat (i64 1))
// CHECK-NEXT: ret <1 x i64> [[VSLI_N2]]
//
int64x1_t test_vsli_n_s64(int64x1_t a, int64x1_t b) {
@@ -19683,11 +18059,7 @@ uint8x8_t test_vsli_n_u8(uint8x8_t a, uint8x8_t b) {
// CHECK-LABEL: define <4 x i16> @test_vsli_n_u16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
-// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK-NEXT: [[VSLI_N2:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], <4 x i16> splat (i16 1))
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> [[A]], <4 x i16> [[B]], <4 x i16> splat (i16 1))
// CHECK-NEXT: ret <4 x i16> [[VSLI_N2]]
//
uint16x4_t test_vsli_n_u16(uint16x4_t a, uint16x4_t b) {
@@ -19697,11 +18069,7 @@ uint16x4_t test_vsli_n_u16(uint16x4_t a, uint16x4_t b) {
// CHECK-LABEL: define <2 x i32> @test_vsli_n_u32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
-// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK-NEXT: [[VSLI_N2:%.*]] = call <2 x i32> @llvm.arm.neon.vshiftins.v2i32(<2 x i32> [[VSLI_N]], <2 x i32> [[VSLI_N1]], <2 x i32> splat (i32 1))
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <2 x i32> @llvm.arm.neon.vshiftins.v2i32(<2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> splat (i32 1))
// CHECK-NEXT: ret <2 x i32> [[VSLI_N2]]
//
uint32x2_t test_vsli_n_u32(uint32x2_t a, uint32x2_t b) {
@@ -19711,11 +18079,7 @@ uint32x2_t test_vsli_n_u32(uint32x2_t a, uint32x2_t b) {
// CHECK-LABEL: define <1 x i64> @test_vsli_n_u64(
// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
-// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK-NEXT: [[VSLI_N2:%.*]] = call <1 x i64> @llvm.arm.neon.vshiftins.v1i64(<1 x i64> [[VSLI_N]], <1 x i64> [[VSLI_N1]], <1 x i64> splat (i64 1))
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <1 x i64> @llvm.arm.neon.vshiftins.v1i64(<1 x i64> [[A]], <1 x i64> [[B]], <1 x i64> splat (i64 1))
// CHECK-NEXT: ret <1 x i64> [[VSLI_N2]]
//
uint64x1_t test_vsli_n_u64(uint64x1_t a, uint64x1_t b) {
@@ -19735,11 +18099,7 @@ poly8x8_t test_vsli_n_p8(poly8x8_t a, poly8x8_t b) {
// CHECK-LABEL: define <4 x i16> @test_vsli_n_p16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
-// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK-NEXT: [[VSLI_N2:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], <4 x i16> splat (i16 1))
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> [[A]], <4 x i16> [[B]], <4 x i16> splat (i16 1))
// CHECK-NEXT: ret <4 x i16> [[VSLI_N2]]
//
poly16x4_t test_vsli_n_p16(poly16x4_t a, poly16x4_t b) {
@@ -19759,11 +18119,7 @@ int8x16_t test_vsliq_n_s8(int8x16_t a, int8x16_t b) {
// CHECK-LABEL: define <8 x i16> @test_vsliq_n_s16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
-// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK-NEXT: [[VSLI_N2:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], <8 x i16> splat (i16 1))
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> [[A]], <8 x i16> [[B]], <8 x i16> splat (i16 1))
// CHECK-NEXT: ret <8 x i16> [[VSLI_N2]]
//
int16x8_t test_vsliq_n_s16(int16x8_t a, int16x8_t b) {
@@ -19773,11 +18129,7 @@ int16x8_t test_vsliq_n_s16(int16x8_t a, int16x8_t b) {
// CHECK-LABEL: define <4 x i32> @test_vsliq_n_s32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
-// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK-NEXT: [[VSLI_N2:%.*]] = call <4 x i32> @llvm.arm.neon.vshiftins.v4i32(<4 x i32> [[VSLI_N]], <4 x i32> [[VSLI_N1]], <4 x i32> splat (i32 1))
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <4 x i32> @llvm.arm.neon.vshiftins.v4i32(<4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> splat (i32 1))
// CHECK-NEXT: ret <4 x i32> [[VSLI_N2]]
//
int32x4_t test_vsliq_n_s32(int32x4_t a, int32x4_t b) {
@@ -19787,11 +18139,7 @@ int32x4_t test_vsliq_n_s32(int32x4_t a, int32x4_t b) {
// CHECK-LABEL: define <2 x i64> @test_vsliq_n_s64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
-// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK-NEXT: [[VSLI_N2:%.*]] = call <2 x i64> @llvm.arm.neon.vshiftins.v2i64(<2 x i64> [[VSLI_N]], <2 x i64> [[VSLI_N1]], <2 x i64> splat (i64 1))
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <2 x i64> @llvm.arm.neon.vshiftins.v2i64(<2 x i64> [[A]], <2 x i64> [[B]], <2 x i64> splat (i64 1))
// CHECK-NEXT: ret <2 x i64> [[VSLI_N2]]
//
int64x2_t test_vsliq_n_s64(int64x2_t a, int64x2_t b) {
@@ -19811,11 +18159,7 @@ uint8x16_t test_vsliq_n_u8(uint8x16_t a, uint8x16_t b) {
// CHECK-LABEL: define <8 x i16> @test_vsliq_n_u16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
-// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK-NEXT: [[VSLI_N2:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], <8 x i16> splat (i16 1))
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> [[A]], <8 x i16> [[B]], <8 x i16> splat (i16 1))
// CHECK-NEXT: ret <8 x i16> [[VSLI_N2]]
//
uint16x8_t test_vsliq_n_u16(uint16x8_t a, uint16x8_t b) {
@@ -19825,11 +18169,7 @@ uint16x8_t test_vsliq_n_u16(uint16x8_t a, uint16x8_t b) {
// CHECK-LABEL: define <4 x i32> @test_vsliq_n_u32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
-// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK-NEXT: [[VSLI_N2:%.*]] = call <4 x i32> @llvm.arm.neon.vshiftins.v4i32(<4 x i32> [[VSLI_N]], <4 x i32> [[VSLI_N1]], <4 x i32> splat (i32 1))
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <4 x i32> @llvm.arm.neon.vshiftins.v4i32(<4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> splat (i32 1))
// CHECK-NEXT: ret <4 x i32> [[VSLI_N2]]
//
uint32x4_t test_vsliq_n_u32(uint32x4_t a, uint32x4_t b) {
@@ -19839,11 +18179,7 @@ uint32x4_t test_vsliq_n_u32(uint32x4_t a, uint32x4_t b) {
// CHECK-LABEL: define <2 x i64> @test_vsliq_n_u64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
-// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK-NEXT: [[VSLI_N2:%.*]] = call <2 x i64> @llvm.arm.neon.vshiftins.v2i64(<2 x i64> [[VSLI_N]], <2 x i64> [[VSLI_N1]], <2 x i64> splat (i64 1))
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <2 x i64> @llvm.arm.neon.vshiftins.v2i64(<2 x i64> [[A]], <2 x i64> [[B]], <2 x i64> splat (i64 1))
// CHECK-NEXT: ret <2 x i64> [[VSLI_N2]]
//
uint64x2_t test_vsliq_n_u64(uint64x2_t a, uint64x2_t b) {
@@ -19863,11 +18199,7 @@ poly8x16_t test_vsliq_n_p8(poly8x16_t a, poly8x16_t b) {
// CHECK-LABEL: define <8 x i16> @test_vsliq_n_p16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
-// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK-NEXT: [[VSLI_N2:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], <8 x i16> splat (i16 1))
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> [[A]], <8 x i16> [[B]], <8 x i16> splat (i16 1))
// CHECK-NEXT: ret <8 x i16> [[VSLI_N2]]
//
poly16x8_t test_vsliq_n_p16(poly16x8_t a, poly16x8_t b) {
@@ -19888,13 +18220,9 @@ int8x8_t test_vsra_n_s8(int8x8_t a, int8x8_t b) {
// CHECK-LABEL: define <4 x i16> @test_vsra_n_s16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK-NEXT: [[VSRA_N:%.*]] = ashr <4 x i16> [[TMP3]], splat (i16 1)
-// CHECK-NEXT: [[TMP4:%.*]] = add <4 x i16> [[TMP2]], [[VSRA_N]]
-// CHECK-NEXT: ret <4 x i16> [[TMP4]]
+// CHECK-NEXT: [[VSRA_N:%.*]] = ashr <4 x i16> [[B]], splat (i16 1)
+// CHECK-NEXT: [[TMP0:%.*]] = add <4 x i16> [[A]], [[VSRA_N]]
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
//
int16x4_t test_vsra_n_s16(int16x4_t a, int16x4_t b) {
return vsra_n_s16(a, b, 1);
@@ -19903,13 +18231,9 @@ int16x4_t test_vsra_n_s16(int16x4_t a, int16x4_t b) {
// CHECK-LABEL: define <2 x i32> @test_vsra_n_s32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK-NEXT: [[VSRA_N:%.*]] = ashr <2 x i32> [[TMP3]], splat (i32 1)
-// CHECK-NEXT: [[TMP4:%.*]] = add <2 x i32> [[TMP2]], [[VSRA_N]]
-// CHECK-NEXT: ret <2 x i32> [[TMP4]]
+// CHECK-NEXT: [[VSRA_N:%.*]] = ashr <2 x i32> [[B]], splat (i32 1)
+// CHECK-NEXT: [[TMP0:%.*]] = add <2 x i32> [[A]], [[VSRA_N]]
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
//
int32x2_t test_vsra_n_s32(int32x2_t a, int32x2_t b) {
return vsra_n_s32(a, b, 1);
@@ -19918,13 +18242,9 @@ int32x2_t test_vsra_n_s32(int32x2_t a, int32x2_t b) {
// CHECK-LABEL: define <1 x i64> @test_vsra_n_s64(
// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK-NEXT: [[VSRA_N:%.*]] = ashr <1 x i64> [[TMP3]], splat (i64 1)
-// CHECK-NEXT: [[TMP4:%.*]] = add <1 x i64> [[TMP2]], [[VSRA_N]]
-// CHECK-NEXT: ret <1 x i64> [[TMP4]]
+// CHECK-NEXT: [[VSRA_N:%.*]] = ashr <1 x i64> [[B]], splat (i64 1)
+// CHECK-NEXT: [[TMP0:%.*]] = add <1 x i64> [[A]], [[VSRA_N]]
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
//
int64x1_t test_vsra_n_s64(int64x1_t a, int64x1_t b) {
return vsra_n_s64(a, b, 1);
@@ -19944,13 +18264,9 @@ uint8x8_t test_vsra_n_u8(uint8x8_t a, uint8x8_t b) {
// CHECK-LABEL: define <4 x i16> @test_vsra_n_u16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK-NEXT: [[VSRA_N:%.*]] = lshr <4 x i16> [[TMP3]], splat (i16 1)
-// CHECK-NEXT: [[TMP4:%.*]] = add <4 x i16> [[TMP2]], [[VSRA_N]]
-// CHECK-NEXT: ret <4 x i16> [[TMP4]]
+// CHECK-NEXT: [[VSRA_N:%.*]] = lshr <4 x i16> [[B]], splat (i16 1)
+// CHECK-NEXT: [[TMP0:%.*]] = add <4 x i16> [[A]], [[VSRA_N]]
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
//
uint16x4_t test_vsra_n_u16(uint16x4_t a, uint16x4_t b) {
return vsra_n_u16(a, b, 1);
@@ -19959,13 +18275,9 @@ uint16x4_t test_vsra_n_u16(uint16x4_t a, uint16x4_t b) {
// CHECK-LABEL: define <2 x i32> @test_vsra_n_u32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK-NEXT: [[VSRA_N:%.*]] = lshr <2 x i32> [[TMP3]], splat (i32 1)
-// CHECK-NEXT: [[TMP4:%.*]] = add <2 x i32> [[TMP2]], [[VSRA_N]]
-// CHECK-NEXT: ret <2 x i32> [[TMP4]]
+// CHECK-NEXT: [[VSRA_N:%.*]] = lshr <2 x i32> [[B]], splat (i32 1)
+// CHECK-NEXT: [[TMP0:%.*]] = add <2 x i32> [[A]], [[VSRA_N]]
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
//
uint32x2_t test_vsra_n_u32(uint32x2_t a, uint32x2_t b) {
return vsra_n_u32(a, b, 1);
@@ -19974,13 +18286,9 @@ uint32x2_t test_vsra_n_u32(uint32x2_t a, uint32x2_t b) {
// CHECK-LABEL: define <1 x i64> @test_vsra_n_u64(
// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK-NEXT: [[VSRA_N:%.*]] = lshr <1 x i64> [[TMP3]], splat (i64 1)
-// CHECK-NEXT: [[TMP4:%.*]] = add <1 x i64> [[TMP2]], [[VSRA_N]]
-// CHECK-NEXT: ret <1 x i64> [[TMP4]]
+// CHECK-NEXT: [[VSRA_N:%.*]] = lshr <1 x i64> [[B]], splat (i64 1)
+// CHECK-NEXT: [[TMP0:%.*]] = add <1 x i64> [[A]], [[VSRA_N]]
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
//
uint64x1_t test_vsra_n_u64(uint64x1_t a, uint64x1_t b) {
return vsra_n_u64(a, b, 1);
@@ -20000,13 +18308,9 @@ int8x16_t test_vsraq_n_s8(int8x16_t a, int8x16_t b) {
// CHECK-LABEL: define <8 x i16> @test_vsraq_n_s16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK-NEXT: [[VSRA_N:%.*]] = ashr <8 x i16> [[TMP3]], splat (i16 1)
-// CHECK-NEXT: [[TMP4:%.*]] = add <8 x i16> [[TMP2]], [[VSRA_N]]
-// CHECK-NEXT: ret <8 x i16> [[TMP4]]
+// CHECK-NEXT: [[VSRA_N:%.*]] = ashr <8 x i16> [[B]], splat (i16 1)
+// CHECK-NEXT: [[TMP0:%.*]] = add <8 x i16> [[A]], [[VSRA_N]]
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
//
int16x8_t test_vsraq_n_s16(int16x8_t a, int16x8_t b) {
return vsraq_n_s16(a, b, 1);
@@ -20015,13 +18319,9 @@ int16x8_t test_vsraq_n_s16(int16x8_t a, int16x8_t b) {
// CHECK-LABEL: define <4 x i32> @test_vsraq_n_s32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK-NEXT: [[VSRA_N:%.*]] = ashr <4 x i32> [[TMP3]], splat (i32 1)
-// CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[VSRA_N]]
-// CHECK-NEXT: ret <4 x i32> [[TMP4]]
+// CHECK-NEXT: [[VSRA_N:%.*]] = ashr <4 x i32> [[B]], splat (i32 1)
+// CHECK-NEXT: [[TMP0:%.*]] = add <4 x i32> [[A]], [[VSRA_N]]
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
//
int32x4_t test_vsraq_n_s32(int32x4_t a, int32x4_t b) {
return vsraq_n_s32(a, b, 1);
@@ -20030,13 +18330,9 @@ int32x4_t test_vsraq_n_s32(int32x4_t a, int32x4_t b) {
// CHECK-LABEL: define <2 x i64> @test_vsraq_n_s64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK-NEXT: [[VSRA_N:%.*]] = ashr <2 x i64> [[TMP3]], splat (i64 1)
-// CHECK-NEXT: [[TMP4:%.*]] = add <2 x i64> [[TMP2]], [[VSRA_N]]
-// CHECK-NEXT: ret <2 x i64> [[TMP4]]
+// CHECK-NEXT: [[VSRA_N:%.*]] = ashr <2 x i64> [[B]], splat (i64 1)
+// CHECK-NEXT: [[TMP0:%.*]] = add <2 x i64> [[A]], [[VSRA_N]]
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
//
int64x2_t test_vsraq_n_s64(int64x2_t a, int64x2_t b) {
return vsraq_n_s64(a, b, 1);
@@ -20056,13 +18352,9 @@ uint8x16_t test_vsraq_n_u8(uint8x16_t a, uint8x16_t b) {
// CHECK-LABEL: define <8 x i16> @test_vsraq_n_u16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK-NEXT: [[VSRA_N:%.*]] = lshr <8 x i16> [[TMP3]], splat (i16 1)
-// CHECK-NEXT: [[TMP4:%.*]] = add <8 x i16> [[TMP2]], [[VSRA_N]]
-// CHECK-NEXT: ret <8 x i16> [[TMP4]]
+// CHECK-NEXT: [[VSRA_N:%.*]] = lshr <8 x i16> [[B]], splat (i16 1)
+// CHECK-NEXT: [[TMP0:%.*]] = add <8 x i16> [[A]], [[VSRA_N]]
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
//
uint16x8_t test_vsraq_n_u16(uint16x8_t a, uint16x8_t b) {
return vsraq_n_u16(a, b, 1);
@@ -20071,13 +18363,9 @@ uint16x8_t test_vsraq_n_u16(uint16x8_t a, uint16x8_t b) {
// CHECK-LABEL: define <4 x i32> @test_vsraq_n_u32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK-NEXT: [[VSRA_N:%.*]] = lshr <4 x i32> [[TMP3]], splat (i32 1)
-// CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[VSRA_N]]
-// CHECK-NEXT: ret <4 x i32> [[TMP4]]
+// CHECK-NEXT: [[VSRA_N:%.*]] = lshr <4 x i32> [[B]], splat (i32 1)
+// CHECK-NEXT: [[TMP0:%.*]] = add <4 x i32> [[A]], [[VSRA_N]]
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
//
uint32x4_t test_vsraq_n_u32(uint32x4_t a, uint32x4_t b) {
return vsraq_n_u32(a, b, 1);
@@ -20086,13 +18374,9 @@ uint32x4_t test_vsraq_n_u32(uint32x4_t a, uint32x4_t b) {
// CHECK-LABEL: define <2 x i64> @test_vsraq_n_u64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK-NEXT: [[VSRA_N:%.*]] = lshr <2 x i64> [[TMP3]], splat (i64 1)
-// CHECK-NEXT: [[TMP4:%.*]] = add <2 x i64> [[TMP2]], [[VSRA_N]]
-// CHECK-NEXT: ret <2 x i64> [[TMP4]]
+// CHECK-NEXT: [[VSRA_N:%.*]] = lshr <2 x i64> [[B]], splat (i64 1)
+// CHECK-NEXT: [[TMP0:%.*]] = add <2 x i64> [[A]], [[VSRA_N]]
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
//
uint64x2_t test_vsraq_n_u64(uint64x2_t a, uint64x2_t b) {
return vsraq_n_u64(a, b, 1);
@@ -20111,11 +18395,7 @@ int8x8_t test_vsri_n_s8(int8x8_t a, int8x8_t b) {
// CHECK-LABEL: define <4 x i16> @test_vsri_n_s16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
-// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK-NEXT: [[VSLI_N2:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], <4 x i16> splat (i16 -1))
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> [[A]], <4 x i16> [[B]], <4 x i16> splat (i16 -1))
// CHECK-NEXT: ret <4 x i16> [[VSLI_N2]]
//
int16x4_t test_vsri_n_s16(int16x4_t a, int16x4_t b) {
@@ -20125,11 +18405,7 @@ int16x4_t test_vsri_n_s16(int16x4_t a, int16x4_t b) {
// CHECK-LABEL: define <2 x i32> @test_vsri_n_s32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
-// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK-NEXT: [[VSLI_N2:%.*]] = call <2 x i32> @llvm.arm.neon.vshiftins.v2i32(<2 x i32> [[VSLI_N]], <2 x i32> [[VSLI_N1]], <2 x i32> splat (i32 -1))
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <2 x i32> @llvm.arm.neon.vshiftins.v2i32(<2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> splat (i32 -1))
// CHECK-NEXT: ret <2 x i32> [[VSLI_N2]]
//
int32x2_t test_vsri_n_s32(int32x2_t a, int32x2_t b) {
@@ -20139,11 +18415,7 @@ int32x2_t test_vsri_n_s32(int32x2_t a, int32x2_t b) {
// CHECK-LABEL: define <1 x i64> @test_vsri_n_s64(
// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
-// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK-NEXT: [[VSLI_N2:%.*]] = call <1 x i64> @llvm.arm.neon.vshiftins.v1i64(<1 x i64> [[VSLI_N]], <1 x i64> [[VSLI_N1]], <1 x i64> splat (i64 -1))
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <1 x i64> @llvm.arm.neon.vshiftins.v1i64(<1 x i64> [[A]], <1 x i64> [[B]], <1 x i64> splat (i64 -1))
// CHECK-NEXT: ret <1 x i64> [[VSLI_N2]]
//
int64x1_t test_vsri_n_s64(int64x1_t a, int64x1_t b) {
@@ -20163,11 +18435,7 @@ uint8x8_t test_vsri_n_u8(uint8x8_t a, uint8x8_t b) {
// CHECK-LABEL: define <4 x i16> @test_vsri_n_u16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
-// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK-NEXT: [[VSLI_N2:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], <4 x i16> splat (i16 -1))
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> [[A]], <4 x i16> [[B]], <4 x i16> splat (i16 -1))
// CHECK-NEXT: ret <4 x i16> [[VSLI_N2]]
//
uint16x4_t test_vsri_n_u16(uint16x4_t a, uint16x4_t b) {
@@ -20177,11 +18445,7 @@ uint16x4_t test_vsri_n_u16(uint16x4_t a, uint16x4_t b) {
// CHECK-LABEL: define <2 x i32> @test_vsri_n_u32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
-// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK-NEXT: [[VSLI_N2:%.*]] = call <2 x i32> @llvm.arm.neon.vshiftins.v2i32(<2 x i32> [[VSLI_N]], <2 x i32> [[VSLI_N1]], <2 x i32> splat (i32 -1))
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <2 x i32> @llvm.arm.neon.vshiftins.v2i32(<2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> splat (i32 -1))
// CHECK-NEXT: ret <2 x i32> [[VSLI_N2]]
//
uint32x2_t test_vsri_n_u32(uint32x2_t a, uint32x2_t b) {
@@ -20191,11 +18455,7 @@ uint32x2_t test_vsri_n_u32(uint32x2_t a, uint32x2_t b) {
// CHECK-LABEL: define <1 x i64> @test_vsri_n_u64(
// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
-// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK-NEXT: [[VSLI_N2:%.*]] = call <1 x i64> @llvm.arm.neon.vshiftins.v1i64(<1 x i64> [[VSLI_N]], <1 x i64> [[VSLI_N1]], <1 x i64> splat (i64 -1))
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <1 x i64> @llvm.arm.neon.vshiftins.v1i64(<1 x i64> [[A]], <1 x i64> [[B]], <1 x i64> splat (i64 -1))
// CHECK-NEXT: ret <1 x i64> [[VSLI_N2]]
//
uint64x1_t test_vsri_n_u64(uint64x1_t a, uint64x1_t b) {
@@ -20215,11 +18475,7 @@ poly8x8_t test_vsri_n_p8(poly8x8_t a, poly8x8_t b) {
// CHECK-LABEL: define <4 x i16> @test_vsri_n_p16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
-// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK-NEXT: [[VSLI_N2:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], <4 x i16> splat (i16 -1))
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> [[A]], <4 x i16> [[B]], <4 x i16> splat (i16 -1))
// CHECK-NEXT: ret <4 x i16> [[VSLI_N2]]
//
poly16x4_t test_vsri_n_p16(poly16x4_t a, poly16x4_t b) {
@@ -20239,11 +18495,7 @@ int8x16_t test_vsriq_n_s8(int8x16_t a, int8x16_t b) {
// CHECK-LABEL: define <8 x i16> @test_vsriq_n_s16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
-// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK-NEXT: [[VSLI_N2:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], <8 x i16> splat (i16 -1))
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> [[A]], <8 x i16> [[B]], <8 x i16> splat (i16 -1))
// CHECK-NEXT: ret <8 x i16> [[VSLI_N2]]
//
int16x8_t test_vsriq_n_s16(int16x8_t a, int16x8_t b) {
@@ -20253,11 +18505,7 @@ int16x8_t test_vsriq_n_s16(int16x8_t a, int16x8_t b) {
// CHECK-LABEL: define <4 x i32> @test_vsriq_n_s32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
-// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK-NEXT: [[VSLI_N2:%.*]] = call <4 x i32> @llvm.arm.neon.vshiftins.v4i32(<4 x i32> [[VSLI_N]], <4 x i32> [[VSLI_N1]], <4 x i32> splat (i32 -1))
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <4 x i32> @llvm.arm.neon.vshiftins.v4i32(<4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> splat (i32 -1))
// CHECK-NEXT: ret <4 x i32> [[VSLI_N2]]
//
int32x4_t test_vsriq_n_s32(int32x4_t a, int32x4_t b) {
@@ -20267,11 +18515,7 @@ int32x4_t test_vsriq_n_s32(int32x4_t a, int32x4_t b) {
// CHECK-LABEL: define <2 x i64> @test_vsriq_n_s64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
-// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK-NEXT: [[VSLI_N2:%.*]] = call <2 x i64> @llvm.arm.neon.vshiftins.v2i64(<2 x i64> [[VSLI_N]], <2 x i64> [[VSLI_N1]], <2 x i64> splat (i64 -1))
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <2 x i64> @llvm.arm.neon.vshiftins.v2i64(<2 x i64> [[A]], <2 x i64> [[B]], <2 x i64> splat (i64 -1))
// CHECK-NEXT: ret <2 x i64> [[VSLI_N2]]
//
int64x2_t test_vsriq_n_s64(int64x2_t a, int64x2_t b) {
@@ -20291,11 +18535,7 @@ uint8x16_t test_vsriq_n_u8(uint8x16_t a, uint8x16_t b) {
// CHECK-LABEL: define <8 x i16> @test_vsriq_n_u16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
-// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK-NEXT: [[VSLI_N2:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], <8 x i16> splat (i16 -1))
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> [[A]], <8 x i16> [[B]], <8 x i16> splat (i16 -1))
// CHECK-NEXT: ret <8 x i16> [[VSLI_N2]]
//
uint16x8_t test_vsriq_n_u16(uint16x8_t a, uint16x8_t b) {
@@ -20305,11 +18545,7 @@ uint16x8_t test_vsriq_n_u16(uint16x8_t a, uint16x8_t b) {
// CHECK-LABEL: define <4 x i32> @test_vsriq_n_u32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
-// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK-NEXT: [[VSLI_N2:%.*]] = call <4 x i32> @llvm.arm.neon.vshiftins.v4i32(<4 x i32> [[VSLI_N]], <4 x i32> [[VSLI_N1]], <4 x i32> splat (i32 -1))
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <4 x i32> @llvm.arm.neon.vshiftins.v4i32(<4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> splat (i32 -1))
// CHECK-NEXT: ret <4 x i32> [[VSLI_N2]]
//
uint32x4_t test_vsriq_n_u32(uint32x4_t a, uint32x4_t b) {
@@ -20319,11 +18555,7 @@ uint32x4_t test_vsriq_n_u32(uint32x4_t a, uint32x4_t b) {
// CHECK-LABEL: define <2 x i64> @test_vsriq_n_u64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
-// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK-NEXT: [[VSLI_N2:%.*]] = call <2 x i64> @llvm.arm.neon.vshiftins.v2i64(<2 x i64> [[VSLI_N]], <2 x i64> [[VSLI_N1]], <2 x i64> splat (i64 -1))
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <2 x i64> @llvm.arm.neon.vshiftins.v2i64(<2 x i64> [[A]], <2 x i64> [[B]], <2 x i64> splat (i64 -1))
// CHECK-NEXT: ret <2 x i64> [[VSLI_N2]]
//
uint64x2_t test_vsriq_n_u64(uint64x2_t a, uint64x2_t b) {
@@ -20343,11 +18575,7 @@ poly8x16_t test_vsriq_n_p8(poly8x16_t a, poly8x16_t b) {
// CHECK-LABEL: define <8 x i16> @test_vsriq_n_p16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
-// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK-NEXT: [[VSLI_N2:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], <8 x i16> splat (i16 -1))
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> [[A]], <8 x i16> [[B]], <8 x i16> splat (i16 -1))
// CHECK-NEXT: ret <8 x i16> [[VSLI_N2]]
//
poly16x8_t test_vsriq_n_p16(poly16x8_t a, poly16x8_t b) {
@@ -20367,9 +18595,7 @@ void test_vst1q_u8(uint8_t * a, uint8x16_t b) {
// CHECK-LABEL: define void @test_vst1q_u16(
// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v8i16(ptr [[A]], <8 x i16> [[TMP1]], i32 2)
+// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v8i16(ptr [[A]], <8 x i16> [[B]], i32 2)
// CHECK-NEXT: ret void
//
void test_vst1q_u16(uint16_t * a, uint16x8_t b) {
@@ -20379,9 +18605,7 @@ void test_vst1q_u16(uint16_t * a, uint16x8_t b) {
// CHECK-LABEL: define void @test_vst1q_u32(
// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v4i32(ptr [[A]], <4 x i32> [[TMP1]], i32 4)
+// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v4i32(ptr [[A]], <4 x i32> [[B]], i32 4)
// CHECK-NEXT: ret void
//
void test_vst1q_u32(uint32_t * a, uint32x4_t b) {
@@ -20391,9 +18615,7 @@ void test_vst1q_u32(uint32_t * a, uint32x4_t b) {
// CHECK-LABEL: define void @test_vst1q_u64(
// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v2i64(ptr [[A]], <2 x i64> [[TMP1]], i32 4)
+// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v2i64(ptr [[A]], <2 x i64> [[B]], i32 4)
// CHECK-NEXT: ret void
//
void test_vst1q_u64(uint64_t * a, uint64x2_t b) {
@@ -20413,9 +18635,7 @@ void test_vst1q_s8(int8_t * a, int8x16_t b) {
// CHECK-LABEL: define void @test_vst1q_s16(
// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v8i16(ptr [[A]], <8 x i16> [[TMP1]], i32 2)
+// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v8i16(ptr [[A]], <8 x i16> [[B]], i32 2)
// CHECK-NEXT: ret void
//
void test_vst1q_s16(int16_t * a, int16x8_t b) {
@@ -20425,9 +18645,7 @@ void test_vst1q_s16(int16_t * a, int16x8_t b) {
// CHECK-LABEL: define void @test_vst1q_s32(
// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v4i32(ptr [[A]], <4 x i32> [[TMP1]], i32 4)
+// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v4i32(ptr [[A]], <4 x i32> [[B]], i32 4)
// CHECK-NEXT: ret void
//
void test_vst1q_s32(int32_t * a, int32x4_t b) {
@@ -20437,9 +18655,7 @@ void test_vst1q_s32(int32_t * a, int32x4_t b) {
// CHECK-LABEL: define void @test_vst1q_s64(
// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v2i64(ptr [[A]], <2 x i64> [[TMP1]], i32 4)
+// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v2i64(ptr [[A]], <2 x i64> [[B]], i32 4)
// CHECK-NEXT: ret void
//
void test_vst1q_s64(int64_t * a, int64x2_t b) {
@@ -20449,9 +18665,7 @@ void test_vst1q_s64(int64_t * a, int64x2_t b) {
// CHECK-LABEL: define void @test_vst1q_f16(
// CHECK-SAME: ptr noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
-// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v8f16(ptr [[A]], <8 x half> [[TMP1]], i32 2)
+// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v8f16(ptr [[A]], <8 x half> [[B]], i32 2)
// CHECK-NEXT: ret void
//
void test_vst1q_f16(float16_t * a, float16x8_t b) {
@@ -20461,9 +18675,7 @@ void test_vst1q_f16(float16_t * a, float16x8_t b) {
// CHECK-LABEL: define void @test_vst1q_f32(
// CHECK-SAME: ptr noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[B]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v4f32(ptr [[A]], <4 x float> [[TMP1]], i32 4)
+// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v4f32(ptr [[A]], <4 x float> [[B]], i32 4)
// CHECK-NEXT: ret void
//
void test_vst1q_f32(float32_t * a, float32x4_t b) {
@@ -20483,9 +18695,7 @@ void test_vst1q_p8(poly8_t * a, poly8x16_t b) {
// CHECK-LABEL: define void @test_vst1q_p16(
// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v8i16(ptr [[A]], <8 x i16> [[TMP1]], i32 2)
+// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v8i16(ptr [[A]], <8 x i16> [[B]], i32 2)
// CHECK-NEXT: ret void
//
void test_vst1q_p16(poly16_t * a, poly16x8_t b) {
@@ -20505,9 +18715,7 @@ void test_vst1_u8(uint8_t * a, uint8x8_t b) {
// CHECK-LABEL: define void @test_vst1_u16(
// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v4i16(ptr [[A]], <4 x i16> [[TMP1]], i32 2)
+// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v4i16(ptr [[A]], <4 x i16> [[B]], i32 2)
// CHECK-NEXT: ret void
//
void test_vst1_u16(uint16_t * a, uint16x4_t b) {
@@ -20517,9 +18725,7 @@ void test_vst1_u16(uint16_t * a, uint16x4_t b) {
// CHECK-LABEL: define void @test_vst1_u32(
// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v2i32(ptr [[A]], <2 x i32> [[TMP1]], i32 4)
+// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v2i32(ptr [[A]], <2 x i32> [[B]], i32 4)
// CHECK-NEXT: ret void
//
void test_vst1_u32(uint32_t * a, uint32x2_t b) {
@@ -20529,9 +18735,7 @@ void test_vst1_u32(uint32_t * a, uint32x2_t b) {
// CHECK-LABEL: define void @test_vst1_u64(
// CHECK-SAME: ptr noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v1i64(ptr [[A]], <1 x i64> [[TMP1]], i32 4)
+// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v1i64(ptr [[A]], <1 x i64> [[B]], i32 4)
// CHECK-NEXT: ret void
//
void test_vst1_u64(uint64_t * a, uint64x1_t b) {
@@ -20551,9 +18755,7 @@ void test_vst1_s8(int8_t * a, int8x8_t b) {
// CHECK-LABEL: define void @test_vst1_s16(
// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v4i16(ptr [[A]], <4 x i16> [[TMP1]], i32 2)
+// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v4i16(ptr [[A]], <4 x i16> [[B]], i32 2)
// CHECK-NEXT: ret void
//
void test_vst1_s16(int16_t * a, int16x4_t b) {
@@ -20563,9 +18765,7 @@ void test_vst1_s16(int16_t * a, int16x4_t b) {
// CHECK-LABEL: define void @test_vst1_s32(
// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v2i32(ptr [[A]], <2 x i32> [[TMP1]], i32 4)
+// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v2i32(ptr [[A]], <2 x i32> [[B]], i32 4)
// CHECK-NEXT: ret void
//
void test_vst1_s32(int32_t * a, int32x2_t b) {
@@ -20575,9 +18775,7 @@ void test_vst1_s32(int32_t * a, int32x2_t b) {
// CHECK-LABEL: define void @test_vst1_s64(
// CHECK-SAME: ptr noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v1i64(ptr [[A]], <1 x i64> [[TMP1]], i32 4)
+// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v1i64(ptr [[A]], <1 x i64> [[B]], i32 4)
// CHECK-NEXT: ret void
//
void test_vst1_s64(int64_t * a, int64x1_t b) {
@@ -20587,9 +18785,7 @@ void test_vst1_s64(int64_t * a, int64x1_t b) {
// CHECK-LABEL: define void @test_vst1_f16(
// CHECK-SAME: ptr noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
-// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v4f16(ptr [[A]], <4 x half> [[TMP1]], i32 2)
+// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v4f16(ptr [[A]], <4 x half> [[B]], i32 2)
// CHECK-NEXT: ret void
//
void test_vst1_f16(float16_t * a, float16x4_t b) {
@@ -20599,9 +18795,7 @@ void test_vst1_f16(float16_t * a, float16x4_t b) {
// CHECK-LABEL: define void @test_vst1_f32(
// CHECK-SAME: ptr noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v2f32(ptr [[A]], <2 x float> [[TMP1]], i32 4)
+// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v2f32(ptr [[A]], <2 x float> [[B]], i32 4)
// CHECK-NEXT: ret void
//
void test_vst1_f32(float32_t * a, float32x2_t b) {
@@ -20621,9 +18815,7 @@ void test_vst1_p8(poly8_t * a, poly8x8_t b) {
// CHECK-LABEL: define void @test_vst1_p16(
// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v4i16(ptr [[A]], <4 x i16> [[TMP1]], i32 2)
+// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v4i16(ptr [[A]], <4 x i16> [[B]], i32 2)
// CHECK-NEXT: ret void
//
void test_vst1_p16(poly16_t * a, poly16x4_t b) {
@@ -20633,7 +18825,7 @@ void test_vst1_p16(poly16_t * a, poly16x4_t b) {
// CHECK-LABEL: define void @test_vst1q_lane_u8(
// CHECK-SAME: ptr noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = extractelement <16 x i8> [[B]], i32 15
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <16 x i8> [[B]], i64 15
// CHECK-NEXT: store i8 [[TMP0]], ptr [[A]], align 1
// CHECK-NEXT: ret void
//
@@ -20644,10 +18836,8 @@ void test_vst1q_lane_u8(uint8_t * a, uint8x16_t b) {
// CHECK-LABEL: define void @test_vst1q_lane_u16(
// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
-// CHECK-NEXT: store i16 [[TMP2]], ptr [[A]], align 2
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <8 x i16> [[B]], i64 7
+// CHECK-NEXT: store i16 [[TMP0]], ptr [[A]], align 2
// CHECK-NEXT: ret void
//
void test_vst1q_lane_u16(uint16_t * a, uint16x8_t b) {
@@ -20657,10 +18847,8 @@ void test_vst1q_lane_u16(uint16_t * a, uint16x8_t b) {
// CHECK-LABEL: define void @test_vst1q_lane_u32(
// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
-// CHECK-NEXT: store i32 [[TMP2]], ptr [[A]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x i32> [[B]], i64 3
+// CHECK-NEXT: store i32 [[TMP0]], ptr [[A]], align 4
// CHECK-NEXT: ret void
//
void test_vst1q_lane_u32(uint32_t * a, uint32x4_t b) {
@@ -20670,10 +18858,8 @@ void test_vst1q_lane_u32(uint32_t * a, uint32x4_t b) {
// CHECK-LABEL: define void @test_vst1q_lane_u64(
// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP1]], <1 x i32> <i32 1>
-// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v1i64(ptr [[A]], <1 x i64> [[TMP2]], i32 4)
+// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <2 x i64> [[B]], <2 x i64> poison, <1 x i32> <i32 1>
+// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v1i64(ptr [[A]], <1 x i64> [[TMP0]], i32 4)
// CHECK-NEXT: ret void
//
void test_vst1q_lane_u64(uint64_t * a, uint64x2_t b) {
@@ -20683,7 +18869,7 @@ void test_vst1q_lane_u64(uint64_t * a, uint64x2_t b) {
// CHECK-LABEL: define void @test_vst1q_lane_s8(
// CHECK-SAME: ptr noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = extractelement <16 x i8> [[B]], i32 15
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <16 x i8> [[B]], i64 15
// CHECK-NEXT: store i8 [[TMP0]], ptr [[A]], align 1
// CHECK-NEXT: ret void
//
@@ -20694,10 +18880,8 @@ void test_vst1q_lane_s8(int8_t * a, int8x16_t b) {
// CHECK-LABEL: define void @test_vst1q_lane_s16(
// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
-// CHECK-NEXT: store i16 [[TMP2]], ptr [[A]], align 2
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <8 x i16> [[B]], i64 7
+// CHECK-NEXT: store i16 [[TMP0]], ptr [[A]], align 2
// CHECK-NEXT: ret void
//
void test_vst1q_lane_s16(int16_t * a, int16x8_t b) {
@@ -20707,10 +18891,8 @@ void test_vst1q_lane_s16(int16_t * a, int16x8_t b) {
// CHECK-LABEL: define void @test_vst1q_lane_s32(
// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
-// CHECK-NEXT: store i32 [[TMP2]], ptr [[A]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x i32> [[B]], i64 3
+// CHECK-NEXT: store i32 [[TMP0]], ptr [[A]], align 4
// CHECK-NEXT: ret void
//
void test_vst1q_lane_s32(int32_t * a, int32x4_t b) {
@@ -20720,10 +18902,8 @@ void test_vst1q_lane_s32(int32_t * a, int32x4_t b) {
// CHECK-LABEL: define void @test_vst1q_lane_s64(
// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP1]], <1 x i32> <i32 1>
-// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v1i64(ptr [[A]], <1 x i64> [[TMP2]], i32 4)
+// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <2 x i64> [[B]], <2 x i64> poison, <1 x i32> <i32 1>
+// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v1i64(ptr [[A]], <1 x i64> [[TMP0]], i32 4)
// CHECK-NEXT: ret void
//
void test_vst1q_lane_s64(int64_t * a, int64x2_t b) {
@@ -20733,10 +18913,8 @@ void test_vst1q_lane_s64(int64_t * a, int64x2_t b) {
// CHECK-LABEL: define void @test_vst1q_lane_f16(
// CHECK-SAME: ptr noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
-// CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x half> [[TMP1]], i32 7
-// CHECK-NEXT: store half [[TMP2]], ptr [[A]], align 2
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <8 x half> [[B]], i64 7
+// CHECK-NEXT: store half [[TMP0]], ptr [[A]], align 2
// CHECK-NEXT: ret void
//
void test_vst1q_lane_f16(float16_t * a, float16x8_t b) {
@@ -20746,10 +18924,8 @@ void test_vst1q_lane_f16(float16_t * a, float16x8_t b) {
// CHECK-LABEL: define void @test_vst1q_lane_f32(
// CHECK-SAME: ptr noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[B]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
-// CHECK-NEXT: store float [[TMP2]], ptr [[A]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x float> [[B]], i64 3
+// CHECK-NEXT: store float [[TMP0]], ptr [[A]], align 4
// CHECK-NEXT: ret void
//
void test_vst1q_lane_f32(float32_t * a, float32x4_t b) {
@@ -20759,7 +18935,7 @@ void test_vst1q_lane_f32(float32_t * a, float32x4_t b) {
// CHECK-LABEL: define void @test_vst1q_lane_p8(
// CHECK-SAME: ptr noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = extractelement <16 x i8> [[B]], i32 15
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <16 x i8> [[B]], i64 15
// CHECK-NEXT: store i8 [[TMP0]], ptr [[A]], align 1
// CHECK-NEXT: ret void
//
@@ -20770,10 +18946,8 @@ void test_vst1q_lane_p8(poly8_t * a, poly8x16_t b) {
// CHECK-LABEL: define void @test_vst1q_lane_p16(
// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
-// CHECK-NEXT: store i16 [[TMP2]], ptr [[A]], align 2
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <8 x i16> [[B]], i64 7
+// CHECK-NEXT: store i16 [[TMP0]], ptr [[A]], align 2
// CHECK-NEXT: ret void
//
void test_vst1q_lane_p16(poly16_t * a, poly16x8_t b) {
@@ -20783,7 +18957,7 @@ void test_vst1q_lane_p16(poly16_t * a, poly16x8_t b) {
// CHECK-LABEL: define void @test_vst1_lane_u8(
// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = extractelement <8 x i8> [[B]], i32 7
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <8 x i8> [[B]], i64 7
// CHECK-NEXT: store i8 [[TMP0]], ptr [[A]], align 1
// CHECK-NEXT: ret void
//
@@ -20794,10 +18968,8 @@ void test_vst1_lane_u8(uint8_t * a, uint8x8_t b) {
// CHECK-LABEL: define void @test_vst1_lane_u16(
// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
-// CHECK-NEXT: store i16 [[TMP2]], ptr [[A]], align 2
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x i16> [[B]], i64 3
+// CHECK-NEXT: store i16 [[TMP0]], ptr [[A]], align 2
// CHECK-NEXT: ret void
//
void test_vst1_lane_u16(uint16_t * a, uint16x4_t b) {
@@ -20807,10 +18979,8 @@ void test_vst1_lane_u16(uint16_t * a, uint16x4_t b) {
// CHECK-LABEL: define void @test_vst1_lane_u32(
// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
-// CHECK-NEXT: store i32 [[TMP2]], ptr [[A]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i32> [[B]], i64 1
+// CHECK-NEXT: store i32 [[TMP0]], ptr [[A]], align 4
// CHECK-NEXT: ret void
//
void test_vst1_lane_u32(uint32_t * a, uint32x2_t b) {
@@ -20820,10 +18990,8 @@ void test_vst1_lane_u32(uint32_t * a, uint32x2_t b) {
// CHECK-LABEL: define void @test_vst1_lane_u64(
// CHECK-SAME: ptr noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK-NEXT: [[TMP2:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0
-// CHECK-NEXT: store i64 [[TMP2]], ptr [[A]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i64> [[B]], i64 0
+// CHECK-NEXT: store i64 [[TMP0]], ptr [[A]], align 4
// CHECK-NEXT: ret void
//
void test_vst1_lane_u64(uint64_t * a, uint64x1_t b) {
@@ -20833,7 +19001,7 @@ void test_vst1_lane_u64(uint64_t * a, uint64x1_t b) {
// CHECK-LABEL: define void @test_vst1_lane_s8(
// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = extractelement <8 x i8> [[B]], i32 7
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <8 x i8> [[B]], i64 7
// CHECK-NEXT: store i8 [[TMP0]], ptr [[A]], align 1
// CHECK-NEXT: ret void
//
@@ -20844,10 +19012,8 @@ void test_vst1_lane_s8(int8_t * a, int8x8_t b) {
// CHECK-LABEL: define void @test_vst1_lane_s16(
// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
-// CHECK-NEXT: store i16 [[TMP2]], ptr [[A]], align 2
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x i16> [[B]], i64 3
+// CHECK-NEXT: store i16 [[TMP0]], ptr [[A]], align 2
// CHECK-NEXT: ret void
//
void test_vst1_lane_s16(int16_t * a, int16x4_t b) {
@@ -20857,10 +19023,8 @@ void test_vst1_lane_s16(int16_t * a, int16x4_t b) {
// CHECK-LABEL: define void @test_vst1_lane_s32(
// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
-// CHECK-NEXT: store i32 [[TMP2]], ptr [[A]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i32> [[B]], i64 1
+// CHECK-NEXT: store i32 [[TMP0]], ptr [[A]], align 4
// CHECK-NEXT: ret void
//
void test_vst1_lane_s32(int32_t * a, int32x2_t b) {
@@ -20870,10 +19034,8 @@ void test_vst1_lane_s32(int32_t * a, int32x2_t b) {
// CHECK-LABEL: define void @test_vst1_lane_s64(
// CHECK-SAME: ptr noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK-NEXT: [[TMP2:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0
-// CHECK-NEXT: store i64 [[TMP2]], ptr [[A]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i64> [[B]], i64 0
+// CHECK-NEXT: store i64 [[TMP0]], ptr [[A]], align 4
// CHECK-NEXT: ret void
//
void test_vst1_lane_s64(int64_t * a, int64x1_t b) {
@@ -20883,10 +19045,8 @@ void test_vst1_lane_s64(int64_t * a, int64x1_t b) {
// CHECK-LABEL: define void @test_vst1_lane_f16(
// CHECK-SAME: ptr noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
-// CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x half> [[TMP1]], i32 3
-// CHECK-NEXT: store half [[TMP2]], ptr [[A]], align 2
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x half> [[B]], i64 3
+// CHECK-NEXT: store half [[TMP0]], ptr [[A]], align 2
// CHECK-NEXT: ret void
//
void test_vst1_lane_f16(float16_t * a, float16x4_t b) {
@@ -20896,10 +19056,8 @@ void test_vst1_lane_f16(float16_t * a, float16x4_t b) {
// CHECK-LABEL: define void @test_vst1_lane_f32(
// CHECK-SAME: ptr noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
-// CHECK-NEXT: store float [[TMP2]], ptr [[A]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x float> [[B]], i64 1
+// CHECK-NEXT: store float [[TMP0]], ptr [[A]], align 4
// CHECK-NEXT: ret void
//
void test_vst1_lane_f32(float32_t * a, float32x2_t b) {
@@ -20909,7 +19067,7 @@ void test_vst1_lane_f32(float32_t * a, float32x2_t b) {
// CHECK-LABEL: define void @test_vst1_lane_p8(
// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = extractelement <8 x i8> [[B]], i32 7
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <8 x i8> [[B]], i64 7
// CHECK-NEXT: store i8 [[TMP0]], ptr [[A]], align 1
// CHECK-NEXT: ret void
//
@@ -20920,10 +19078,8 @@ void test_vst1_lane_p8(poly8_t * a, poly8x8_t b) {
// CHECK-LABEL: define void @test_vst1_lane_p16(
// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
-// CHECK-NEXT: store i16 [[TMP2]], ptr [[A]], align 2
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x i16> [[B]], i64 3
+// CHECK-NEXT: store i16 [[TMP0]], ptr [[A]], align 2
// CHECK-NEXT: ret void
//
void test_vst1_lane_p16(poly16_t * a, poly16x4_t b) {
@@ -20933,17 +19089,16 @@ void test_vst1_lane_p16(poly16_t * a, poly16x4_t b) {
// CHECK-LABEL: define void @test_vst2q_u8(
// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT8X16X2_T:%.*]], align 16
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT8X16X2_T]], align 16
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X16X2_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X16X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X16X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8>
// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v16i8(ptr [[A]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 1)
// CHECK-NEXT: ret void
//
@@ -20954,22 +19109,17 @@ void test_vst2q_u8(uint8_t * a, uint8x16x2_t b) {
// CHECK-LABEL: define void @test_vst2q_u16(
// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT16X8X2_T:%.*]], align 16
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT16X8X2_T]], align 16
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X8X2_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X8X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X8X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x i16>
-// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v8i16(ptr [[A]], <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], i32 2)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v8i16(ptr [[A]], <8 x i16> [[TMP0]], <8 x i16> [[TMP1]], i32 2)
// CHECK-NEXT: ret void
//
void test_vst2q_u16(uint16_t * a, uint16x8x2_t b) {
@@ -20979,22 +19129,17 @@ void test_vst2q_u16(uint16_t * a, uint16x8x2_t b) {
// CHECK-LABEL: define void @test_vst2q_u32(
// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT32X4X2_T:%.*]], align 16
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT32X4X2_T]], align 16
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X4X2_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X4X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X4X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x i32>
-// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v4i32(ptr [[A]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], i32 4)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v4i32(ptr [[A]], <4 x i32> [[TMP0]], <4 x i32> [[TMP1]], i32 4)
// CHECK-NEXT: ret void
//
void test_vst2q_u32(uint32_t * a, uint32x4x2_t b) {
@@ -21004,17 +19149,16 @@ void test_vst2q_u32(uint32_t * a, uint32x4x2_t b) {
// CHECK-LABEL: define void @test_vst2q_s8(
// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT8X16X2_T:%.*]], align 16
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT8X16X2_T]], align 16
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X16X2_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X16X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X16X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8>
// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v16i8(ptr [[A]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 1)
// CHECK-NEXT: ret void
//
@@ -21025,22 +19169,17 @@ void test_vst2q_s8(int8_t * a, int8x16x2_t b) {
// CHECK-LABEL: define void @test_vst2q_s16(
// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT16X8X2_T:%.*]], align 16
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT16X8X2_T]], align 16
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X8X2_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X8X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X8X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x i16>
-// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v8i16(ptr [[A]], <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], i32 2)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v8i16(ptr [[A]], <8 x i16> [[TMP0]], <8 x i16> [[TMP1]], i32 2)
// CHECK-NEXT: ret void
//
void test_vst2q_s16(int16_t * a, int16x8x2_t b) {
@@ -21050,22 +19189,17 @@ void test_vst2q_s16(int16_t * a, int16x8x2_t b) {
// CHECK-LABEL: define void @test_vst2q_s32(
// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT32X4X2_T:%.*]], align 16
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT32X4X2_T]], align 16
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X4X2_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X4X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X4X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x i32>
-// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v4i32(ptr [[A]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], i32 4)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v4i32(ptr [[A]], <4 x i32> [[TMP0]], <4 x i32> [[TMP1]], i32 4)
// CHECK-NEXT: ret void
//
void test_vst2q_s32(int32_t * a, int32x4x2_t b) {
@@ -21075,22 +19209,17 @@ void test_vst2q_s32(int32_t * a, int32x4x2_t b) {
// CHECK-LABEL: define void @test_vst2q_f16(
// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT16X8X2_T:%.*]], align 16
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X8X2_T]], align 16
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X8X2_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X8X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[TMP0]] to <16 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X8X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x half> [[TMP2]] to <16 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half>
-// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v8f16(ptr [[A]], <8 x half> [[TMP4]], <8 x half> [[TMP5]], i32 2)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <8 x half>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <8 x half>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v8f16(ptr [[A]], <8 x half> [[TMP0]], <8 x half> [[TMP1]], i32 2)
// CHECK-NEXT: ret void
//
void test_vst2q_f16(float16_t * a, float16x8x2_t b) {
@@ -21100,22 +19229,17 @@ void test_vst2q_f16(float16_t * a, float16x8x2_t b) {
// CHECK-LABEL: define void @test_vst2q_f32(
// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT32X4X2_T:%.*]], align 16
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X4X2_T]], align 16
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X4X2_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X4X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[TMP0]] to <16 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X4X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[TMP2]] to <16 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float>
-// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v4f32(ptr [[A]], <4 x float> [[TMP4]], <4 x float> [[TMP5]], i32 4)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <4 x float>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v4f32(ptr [[A]], <4 x float> [[TMP0]], <4 x float> [[TMP1]], i32 4)
// CHECK-NEXT: ret void
//
void test_vst2q_f32(float32_t * a, float32x4x2_t b) {
@@ -21125,17 +19249,16 @@ void test_vst2q_f32(float32_t * a, float32x4x2_t b) {
// CHECK-LABEL: define void @test_vst2q_p8(
// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY8X16X2_T:%.*]], align 16
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY8X16X2_T]], align 16
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X16X2_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X16X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X16X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8>
// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v16i8(ptr [[A]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 1)
// CHECK-NEXT: ret void
//
@@ -21146,22 +19269,17 @@ void test_vst2q_p8(poly8_t * a, poly8x16x2_t b) {
// CHECK-LABEL: define void @test_vst2q_p16(
// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY16X8X2_T:%.*]], align 16
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY16X8X2_T]], align 16
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X8X2_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X8X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X8X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x i16>
-// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v8i16(ptr [[A]], <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], i32 2)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v8i16(ptr [[A]], <8 x i16> [[TMP0]], <8 x i16> [[TMP1]], i32 2)
// CHECK-NEXT: ret void
//
void test_vst2q_p16(poly16_t * a, poly16x8x2_t b) {
@@ -21171,17 +19289,10 @@ void test_vst2q_p16(poly16_t * a, poly16x8x2_t b) {
// CHECK-LABEL: define void @test_vst2_u8(
// CHECK-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT8X8X2_T:%.*]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT8X8X2_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X2_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [2 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v8i8(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], i32 1)
// CHECK-NEXT: ret void
//
@@ -21192,22 +19303,11 @@ void test_vst2_u8(uint8_t * a, uint8x8x2_t b) {
// CHECK-LABEL: define void @test_vst2_u16(
// CHECK-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT16X4X2_T:%.*]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT16X4X2_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X4X2_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [2 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X4X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X4X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
-// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v4i16(ptr [[A]], <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], i32 2)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v4i16(ptr [[A]], <4 x i16> [[TMP0]], <4 x i16> [[TMP1]], i32 2)
// CHECK-NEXT: ret void
//
void test_vst2_u16(uint16_t * a, uint16x4x2_t b) {
@@ -21217,22 +19317,11 @@ void test_vst2_u16(uint16_t * a, uint16x4x2_t b) {
// CHECK-LABEL: define void @test_vst2_u32(
// CHECK-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT32X2X2_T:%.*]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT32X2X2_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X2X2_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [2 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X2X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X2X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
-// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v2i32(ptr [[A]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], i32 4)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v2i32(ptr [[A]], <2 x i32> [[TMP0]], <2 x i32> [[TMP1]], i32 4)
// CHECK-NEXT: ret void
//
void test_vst2_u32(uint32_t * a, uint32x2x2_t b) {
@@ -21242,22 +19331,11 @@ void test_vst2_u32(uint32_t * a, uint32x2x2_t b) {
// CHECK-LABEL: define void @test_vst2_u64(
// CHECK-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT64X1X2_T:%.*]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT64X1X2_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT64X1X2_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [2 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT64X1X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[TMP0]] to <8 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT64X1X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[TMP2]] to <8 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64>
-// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v1i64(ptr [[A]], <1 x i64> [[TMP4]], <1 x i64> [[TMP5]], i32 4)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <1 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <1 x i64> poison, i64 [[B_COERCE_FCA_1_EXTRACT]], i64 0
+// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v1i64(ptr [[A]], <1 x i64> [[TMP0]], <1 x i64> [[TMP1]], i32 4)
// CHECK-NEXT: ret void
//
void test_vst2_u64(uint64_t * a, uint64x1x2_t b) {
@@ -21267,17 +19345,10 @@ void test_vst2_u64(uint64_t * a, uint64x1x2_t b) {
// CHECK-LABEL: define void @test_vst2_s8(
// CHECK-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT8X8X2_T:%.*]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT8X8X2_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X2_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [2 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v8i8(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], i32 1)
// CHECK-NEXT: ret void
//
@@ -21288,22 +19359,11 @@ void test_vst2_s8(int8_t * a, int8x8x2_t b) {
// CHECK-LABEL: define void @test_vst2_s16(
// CHECK-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT16X4X2_T:%.*]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT16X4X2_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X4X2_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [2 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X4X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X4X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
-// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v4i16(ptr [[A]], <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], i32 2)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v4i16(ptr [[A]], <4 x i16> [[TMP0]], <4 x i16> [[TMP1]], i32 2)
// CHECK-NEXT: ret void
//
void test_vst2_s16(int16_t * a, int16x4x2_t b) {
@@ -21313,22 +19373,11 @@ void test_vst2_s16(int16_t * a, int16x4x2_t b) {
// CHECK-LABEL: define void @test_vst2_s32(
// CHECK-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT32X2X2_T:%.*]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT32X2X2_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X2X2_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [2 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X2X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X2X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
-// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v2i32(ptr [[A]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], i32 4)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v2i32(ptr [[A]], <2 x i32> [[TMP0]], <2 x i32> [[TMP1]], i32 4)
// CHECK-NEXT: ret void
//
void test_vst2_s32(int32_t * a, int32x2x2_t b) {
@@ -21338,22 +19387,11 @@ void test_vst2_s32(int32_t * a, int32x2x2_t b) {
// CHECK-LABEL: define void @test_vst2_s64(
// CHECK-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT64X1X2_T:%.*]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT64X1X2_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT64X1X2_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [2 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT64X1X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[TMP0]] to <8 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT64X1X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[TMP2]] to <8 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64>
-// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v1i64(ptr [[A]], <1 x i64> [[TMP4]], <1 x i64> [[TMP5]], i32 4)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <1 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <1 x i64> poison, i64 [[B_COERCE_FCA_1_EXTRACT]], i64 0
+// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v1i64(ptr [[A]], <1 x i64> [[TMP0]], <1 x i64> [[TMP1]], i32 4)
// CHECK-NEXT: ret void
//
void test_vst2_s64(int64_t * a, int64x1x2_t b) {
@@ -21363,22 +19401,11 @@ void test_vst2_s64(int64_t * a, int64x1x2_t b) {
// CHECK-LABEL: define void @test_vst2_f16(
// CHECK-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT16X4X2_T:%.*]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X4X2_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X4X2_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [2 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X4X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[TMP0]] to <8 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X4X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x half> [[TMP2]] to <8 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half>
-// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v4f16(ptr [[A]], <4 x half> [[TMP4]], <4 x half> [[TMP5]], i32 2)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x half>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x half>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v4f16(ptr [[A]], <4 x half> [[TMP0]], <4 x half> [[TMP1]], i32 2)
// CHECK-NEXT: ret void
//
void test_vst2_f16(float16_t * a, float16x4x2_t b) {
@@ -21388,22 +19415,11 @@ void test_vst2_f16(float16_t * a, float16x4x2_t b) {
// CHECK-LABEL: define void @test_vst2_f32(
// CHECK-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT32X2X2_T:%.*]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X2X2_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X2X2_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [2 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X2X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[TMP0]] to <8 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X2X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x float>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[TMP2]] to <8 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float>
-// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v2f32(ptr [[A]], <2 x float> [[TMP4]], <2 x float> [[TMP5]], i32 4)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <2 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <2 x float>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v2f32(ptr [[A]], <2 x float> [[TMP0]], <2 x float> [[TMP1]], i32 4)
// CHECK-NEXT: ret void
//
void test_vst2_f32(float32_t * a, float32x2x2_t b) {
@@ -21413,17 +19429,10 @@ void test_vst2_f32(float32_t * a, float32x2x2_t b) {
// CHECK-LABEL: define void @test_vst2_p8(
// CHECK-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY8X8X2_T:%.*]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY8X8X2_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X2_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [2 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v8i8(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], i32 1)
// CHECK-NEXT: ret void
//
@@ -21434,22 +19443,11 @@ void test_vst2_p8(poly8_t * a, poly8x8x2_t b) {
// CHECK-LABEL: define void @test_vst2_p16(
// CHECK-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY16X4X2_T:%.*]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY16X4X2_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X4X2_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [2 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X4X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X4X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
-// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v4i16(ptr [[A]], <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], i32 2)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v4i16(ptr [[A]], <4 x i16> [[TMP0]], <4 x i16> [[TMP1]], i32 2)
// CHECK-NEXT: ret void
//
void test_vst2_p16(poly16_t * a, poly16x4x2_t b) {
@@ -21459,22 +19457,17 @@ void test_vst2_p16(poly16_t * a, poly16x4x2_t b) {
// CHECK-LABEL: define void @test_vst2q_lane_u16(
// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT16X8X2_T:%.*]], align 16
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT16X8X2_T]], align 16
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X8X2_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X8X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X8X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x i16>
-// CHECK-NEXT: call void @llvm.arm.neon.vst2lane.p0.v8i16(ptr [[A]], <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], i32 7, i32 2)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2lane.p0.v8i16(ptr [[A]], <8 x i16> [[TMP0]], <8 x i16> [[TMP1]], i32 7, i32 2)
// CHECK-NEXT: ret void
//
void test_vst2q_lane_u16(uint16_t * a, uint16x8x2_t b) {
@@ -21484,22 +19477,17 @@ void test_vst2q_lane_u16(uint16_t * a, uint16x8x2_t b) {
// CHECK-LABEL: define void @test_vst2q_lane_u32(
// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT32X4X2_T:%.*]], align 16
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT32X4X2_T]], align 16
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X4X2_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X4X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X4X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x i32>
-// CHECK-NEXT: call void @llvm.arm.neon.vst2lane.p0.v4i32(ptr [[A]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], i32 3, i32 4)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2lane.p0.v4i32(ptr [[A]], <4 x i32> [[TMP0]], <4 x i32> [[TMP1]], i32 3, i32 4)
// CHECK-NEXT: ret void
//
void test_vst2q_lane_u32(uint32_t * a, uint32x4x2_t b) {
@@ -21509,22 +19497,17 @@ void test_vst2q_lane_u32(uint32_t * a, uint32x4x2_t b) {
// CHECK-LABEL: define void @test_vst2q_lane_s16(
// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT16X8X2_T:%.*]], align 16
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT16X8X2_T]], align 16
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X8X2_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X8X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X8X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x i16>
-// CHECK-NEXT: call void @llvm.arm.neon.vst2lane.p0.v8i16(ptr [[A]], <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], i32 7, i32 2)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2lane.p0.v8i16(ptr [[A]], <8 x i16> [[TMP0]], <8 x i16> [[TMP1]], i32 7, i32 2)
// CHECK-NEXT: ret void
//
void test_vst2q_lane_s16(int16_t * a, int16x8x2_t b) {
@@ -21534,22 +19517,17 @@ void test_vst2q_lane_s16(int16_t * a, int16x8x2_t b) {
// CHECK-LABEL: define void @test_vst2q_lane_s32(
// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT32X4X2_T:%.*]], align 16
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT32X4X2_T]], align 16
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X4X2_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X4X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X4X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x i32>
-// CHECK-NEXT: call void @llvm.arm.neon.vst2lane.p0.v4i32(ptr [[A]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], i32 3, i32 4)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2lane.p0.v4i32(ptr [[A]], <4 x i32> [[TMP0]], <4 x i32> [[TMP1]], i32 3, i32 4)
// CHECK-NEXT: ret void
//
void test_vst2q_lane_s32(int32_t * a, int32x4x2_t b) {
@@ -21559,22 +19537,17 @@ void test_vst2q_lane_s32(int32_t * a, int32x4x2_t b) {
// CHECK-LABEL: define void @test_vst2q_lane_f16(
// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT16X8X2_T:%.*]], align 16
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X8X2_T]], align 16
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X8X2_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X8X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[TMP0]] to <16 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X8X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x half> [[TMP2]] to <16 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half>
-// CHECK-NEXT: call void @llvm.arm.neon.vst2lane.p0.v8f16(ptr [[A]], <8 x half> [[TMP4]], <8 x half> [[TMP5]], i32 7, i32 2)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <8 x half>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <8 x half>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2lane.p0.v8f16(ptr [[A]], <8 x half> [[TMP0]], <8 x half> [[TMP1]], i32 7, i32 2)
// CHECK-NEXT: ret void
//
void test_vst2q_lane_f16(float16_t * a, float16x8x2_t b) {
@@ -21584,22 +19557,17 @@ void test_vst2q_lane_f16(float16_t * a, float16x8x2_t b) {
// CHECK-LABEL: define void @test_vst2q_lane_f32(
// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT32X4X2_T:%.*]], align 16
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X4X2_T]], align 16
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X4X2_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X4X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[TMP0]] to <16 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X4X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[TMP2]] to <16 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float>
-// CHECK-NEXT: call void @llvm.arm.neon.vst2lane.p0.v4f32(ptr [[A]], <4 x float> [[TMP4]], <4 x float> [[TMP5]], i32 3, i32 4)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <4 x float>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2lane.p0.v4f32(ptr [[A]], <4 x float> [[TMP0]], <4 x float> [[TMP1]], i32 3, i32 4)
// CHECK-NEXT: ret void
//
void test_vst2q_lane_f32(float32_t * a, float32x4x2_t b) {
@@ -21609,22 +19577,17 @@ void test_vst2q_lane_f32(float32_t * a, float32x4x2_t b) {
// CHECK-LABEL: define void @test_vst2q_lane_p16(
// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY16X8X2_T:%.*]], align 16
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY16X8X2_T]], align 16
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X8X2_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X8X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X8X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x i16>
-// CHECK-NEXT: call void @llvm.arm.neon.vst2lane.p0.v8i16(ptr [[A]], <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], i32 7, i32 2)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2lane.p0.v8i16(ptr [[A]], <8 x i16> [[TMP0]], <8 x i16> [[TMP1]], i32 7, i32 2)
// CHECK-NEXT: ret void
//
void test_vst2q_lane_p16(poly16_t * a, poly16x8x2_t b) {
@@ -21634,17 +19597,10 @@ void test_vst2q_lane_p16(poly16_t * a, poly16x8x2_t b) {
// CHECK-LABEL: define void @test_vst2_lane_u8(
// CHECK-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT8X8X2_T:%.*]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT8X8X2_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X2_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [2 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
// CHECK-NEXT: call void @llvm.arm.neon.vst2lane.p0.v8i8(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], i32 7, i32 1)
// CHECK-NEXT: ret void
//
@@ -21655,22 +19611,11 @@ void test_vst2_lane_u8(uint8_t * a, uint8x8x2_t b) {
// CHECK-LABEL: define void @test_vst2_lane_u16(
// CHECK-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT16X4X2_T:%.*]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT16X4X2_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X4X2_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [2 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X4X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X4X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
-// CHECK-NEXT: call void @llvm.arm.neon.vst2lane.p0.v4i16(ptr [[A]], <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], i32 3, i32 2)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2lane.p0.v4i16(ptr [[A]], <4 x i16> [[TMP0]], <4 x i16> [[TMP1]], i32 3, i32 2)
// CHECK-NEXT: ret void
//
void test_vst2_lane_u16(uint16_t * a, uint16x4x2_t b) {
@@ -21680,22 +19625,11 @@ void test_vst2_lane_u16(uint16_t * a, uint16x4x2_t b) {
// CHECK-LABEL: define void @test_vst2_lane_u32(
// CHECK-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT32X2X2_T:%.*]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT32X2X2_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X2X2_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [2 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X2X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X2X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
-// CHECK-NEXT: call void @llvm.arm.neon.vst2lane.p0.v2i32(ptr [[A]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], i32 1, i32 4)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2lane.p0.v2i32(ptr [[A]], <2 x i32> [[TMP0]], <2 x i32> [[TMP1]], i32 1, i32 4)
// CHECK-NEXT: ret void
//
void test_vst2_lane_u32(uint32_t * a, uint32x2x2_t b) {
@@ -21705,17 +19639,10 @@ void test_vst2_lane_u32(uint32_t * a, uint32x2x2_t b) {
// CHECK-LABEL: define void @test_vst2_lane_s8(
// CHECK-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT8X8X2_T:%.*]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT8X8X2_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X2_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [2 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
// CHECK-NEXT: call void @llvm.arm.neon.vst2lane.p0.v8i8(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], i32 7, i32 1)
// CHECK-NEXT: ret void
//
@@ -21726,22 +19653,11 @@ void test_vst2_lane_s8(int8_t * a, int8x8x2_t b) {
// CHECK-LABEL: define void @test_vst2_lane_s16(
// CHECK-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT16X4X2_T:%.*]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT16X4X2_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X4X2_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [2 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X4X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X4X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
-// CHECK-NEXT: call void @llvm.arm.neon.vst2lane.p0.v4i16(ptr [[A]], <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], i32 3, i32 2)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2lane.p0.v4i16(ptr [[A]], <4 x i16> [[TMP0]], <4 x i16> [[TMP1]], i32 3, i32 2)
// CHECK-NEXT: ret void
//
void test_vst2_lane_s16(int16_t * a, int16x4x2_t b) {
@@ -21751,22 +19667,11 @@ void test_vst2_lane_s16(int16_t * a, int16x4x2_t b) {
// CHECK-LABEL: define void @test_vst2_lane_s32(
// CHECK-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT32X2X2_T:%.*]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT32X2X2_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X2X2_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [2 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X2X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X2X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
-// CHECK-NEXT: call void @llvm.arm.neon.vst2lane.p0.v2i32(ptr [[A]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], i32 1, i32 4)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2lane.p0.v2i32(ptr [[A]], <2 x i32> [[TMP0]], <2 x i32> [[TMP1]], i32 1, i32 4)
// CHECK-NEXT: ret void
//
void test_vst2_lane_s32(int32_t * a, int32x2x2_t b) {
@@ -21776,22 +19681,11 @@ void test_vst2_lane_s32(int32_t * a, int32x2x2_t b) {
// CHECK-LABEL: define void @test_vst2_lane_f16(
// CHECK-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT16X4X2_T:%.*]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X4X2_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X4X2_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [2 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X4X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[TMP0]] to <8 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X4X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x half> [[TMP2]] to <8 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half>
-// CHECK-NEXT: call void @llvm.arm.neon.vst2lane.p0.v4f16(ptr [[A]], <4 x half> [[TMP4]], <4 x half> [[TMP5]], i32 3, i32 2)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x half>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x half>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2lane.p0.v4f16(ptr [[A]], <4 x half> [[TMP0]], <4 x half> [[TMP1]], i32 3, i32 2)
// CHECK-NEXT: ret void
//
void test_vst2_lane_f16(float16_t * a, float16x4x2_t b) {
@@ -21801,22 +19695,11 @@ void test_vst2_lane_f16(float16_t * a, float16x4x2_t b) {
// CHECK-LABEL: define void @test_vst2_lane_f32(
// CHECK-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT32X2X2_T:%.*]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X2X2_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X2X2_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [2 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X2X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[TMP0]] to <8 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X2X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x float>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[TMP2]] to <8 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float>
-// CHECK-NEXT: call void @llvm.arm.neon.vst2lane.p0.v2f32(ptr [[A]], <2 x float> [[TMP4]], <2 x float> [[TMP5]], i32 1, i32 4)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <2 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <2 x float>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2lane.p0.v2f32(ptr [[A]], <2 x float> [[TMP0]], <2 x float> [[TMP1]], i32 1, i32 4)
// CHECK-NEXT: ret void
//
void test_vst2_lane_f32(float32_t * a, float32x2x2_t b) {
@@ -21826,17 +19709,10 @@ void test_vst2_lane_f32(float32_t * a, float32x2x2_t b) {
// CHECK-LABEL: define void @test_vst2_lane_p8(
// CHECK-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY8X8X2_T:%.*]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY8X8X2_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X2_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [2 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
// CHECK-NEXT: call void @llvm.arm.neon.vst2lane.p0.v8i8(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], i32 7, i32 1)
// CHECK-NEXT: ret void
//
@@ -21847,22 +19723,11 @@ void test_vst2_lane_p8(poly8_t * a, poly8x8x2_t b) {
// CHECK-LABEL: define void @test_vst2_lane_p16(
// CHECK-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY16X4X2_T:%.*]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY16X4X2_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X4X2_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [2 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X4X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X4X2_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
-// CHECK-NEXT: call void @llvm.arm.neon.vst2lane.p0.v4i16(ptr [[A]], <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], i32 3, i32 2)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2lane.p0.v4i16(ptr [[A]], <4 x i16> [[TMP0]], <4 x i16> [[TMP1]], i32 3, i32 2)
// CHECK-NEXT: ret void
//
void test_vst2_lane_p16(poly16_t * a, poly16x4x2_t b) {
@@ -21872,20 +19737,21 @@ void test_vst2_lane_p16(poly16_t * a, poly16x4x2_t b) {
// CHECK-LABEL: define void @test_vst3q_u8(
// CHECK-SAME: ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT8X16X3_T:%.*]], align 16
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT8X16X3_T]], align 16
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X16X3_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [6 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X16X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X16X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X16X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <16 x i8>
// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v16i8(ptr [[A]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], i32 1)
// CHECK-NEXT: ret void
//
@@ -21896,27 +19762,22 @@ void test_vst3q_u8(uint8_t * a, uint8x16x3_t b) {
// CHECK-LABEL: define void @test_vst3q_u16(
// CHECK-SAME: ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT16X8X3_T:%.*]], align 16
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT16X8X3_T]], align 16
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X8X3_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [6 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X8X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X8X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8>
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X8X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
-// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x i16>
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v8i16(ptr [[A]], <8 x i16> [[TMP6]], <8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i32 2)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v8i16(ptr [[A]], <8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], i32 2)
// CHECK-NEXT: ret void
//
void test_vst3q_u16(uint16_t * a, uint16x8x3_t b) {
@@ -21926,27 +19787,22 @@ void test_vst3q_u16(uint16_t * a, uint16x8x3_t b) {
// CHECK-LABEL: define void @test_vst3q_u32(
// CHECK-SAME: ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT32X4X3_T:%.*]], align 16
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT32X4X3_T]], align 16
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X4X3_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [6 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X4X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X4X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X4X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
-// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x i32>
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
-// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v4i32(ptr [[A]], <4 x i32> [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i32 4)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v4i32(ptr [[A]], <4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], i32 4)
// CHECK-NEXT: ret void
//
void test_vst3q_u32(uint32_t * a, uint32x4x3_t b) {
@@ -21956,20 +19812,21 @@ void test_vst3q_u32(uint32_t * a, uint32x4x3_t b) {
// CHECK-LABEL: define void @test_vst3q_s8(
// CHECK-SAME: ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT8X16X3_T:%.*]], align 16
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT8X16X3_T]], align 16
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X16X3_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [6 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X16X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X16X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X16X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <16 x i8>
// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v16i8(ptr [[A]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], i32 1)
// CHECK-NEXT: ret void
//
@@ -21980,27 +19837,22 @@ void test_vst3q_s8(int8_t * a, int8x16x3_t b) {
// CHECK-LABEL: define void @test_vst3q_s16(
// CHECK-SAME: ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT16X8X3_T:%.*]], align 16
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT16X8X3_T]], align 16
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X8X3_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [6 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X8X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X8X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8>
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X8X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
-// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x i16>
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v8i16(ptr [[A]], <8 x i16> [[TMP6]], <8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i32 2)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v8i16(ptr [[A]], <8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], i32 2)
// CHECK-NEXT: ret void
//
void test_vst3q_s16(int16_t * a, int16x8x3_t b) {
@@ -22010,27 +19862,22 @@ void test_vst3q_s16(int16_t * a, int16x8x3_t b) {
// CHECK-LABEL: define void @test_vst3q_s32(
// CHECK-SAME: ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT32X4X3_T:%.*]], align 16
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT32X4X3_T]], align 16
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X4X3_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [6 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X4X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X4X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X4X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
-// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x i32>
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
-// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v4i32(ptr [[A]], <4 x i32> [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i32 4)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v4i32(ptr [[A]], <4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], i32 4)
// CHECK-NEXT: ret void
//
void test_vst3q_s32(int32_t * a, int32x4x3_t b) {
@@ -22040,27 +19887,22 @@ void test_vst3q_s32(int32_t * a, int32x4x3_t b) {
// CHECK-LABEL: define void @test_vst3q_f16(
// CHECK-SAME: ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT16X8X3_T:%.*]], align 16
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X8X3_T]], align 16
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X8X3_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [6 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X8X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[TMP0]] to <16 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X8X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x half> [[TMP2]] to <16 x i8>
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X8X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP4:%.*]] = load <8 x half>, ptr [[ARRAYIDX4]], align 16
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
-// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half>
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
-// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v8f16(ptr [[A]], <8 x half> [[TMP6]], <8 x half> [[TMP7]], <8 x half> [[TMP8]], i32 2)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <8 x half>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <8 x half>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <8 x half>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v8f16(ptr [[A]], <8 x half> [[TMP0]], <8 x half> [[TMP1]], <8 x half> [[TMP2]], i32 2)
// CHECK-NEXT: ret void
//
void test_vst3q_f16(float16_t * a, float16x8x3_t b) {
@@ -22070,27 +19912,22 @@ void test_vst3q_f16(float16_t * a, float16x8x3_t b) {
// CHECK-LABEL: define void @test_vst3q_f32(
// CHECK-SAME: ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT32X4X3_T:%.*]], align 16
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X4X3_T]], align 16
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X4X3_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [6 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X4X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[TMP0]] to <16 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X4X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[TMP2]] to <16 x i8>
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X4X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP4:%.*]] = load <4 x float>, ptr [[ARRAYIDX4]], align 16
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
-// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float>
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
-// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v4f32(ptr [[A]], <4 x float> [[TMP6]], <4 x float> [[TMP7]], <4 x float> [[TMP8]], i32 4)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <4 x float>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v4f32(ptr [[A]], <4 x float> [[TMP0]], <4 x float> [[TMP1]], <4 x float> [[TMP2]], i32 4)
// CHECK-NEXT: ret void
//
void test_vst3q_f32(float32_t * a, float32x4x3_t b) {
@@ -22100,20 +19937,21 @@ void test_vst3q_f32(float32_t * a, float32x4x3_t b) {
// CHECK-LABEL: define void @test_vst3q_p8(
// CHECK-SAME: ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY8X16X3_T:%.*]], align 16
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY8X16X3_T]], align 16
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X16X3_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [6 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X16X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X16X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X16X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <16 x i8>
// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v16i8(ptr [[A]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], i32 1)
// CHECK-NEXT: ret void
//
@@ -22124,27 +19962,22 @@ void test_vst3q_p8(poly8_t * a, poly8x16x3_t b) {
// CHECK-LABEL: define void @test_vst3q_p16(
// CHECK-SAME: ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY16X8X3_T:%.*]], align 16
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY16X8X3_T]], align 16
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X8X3_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [6 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X8X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X8X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8>
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X8X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
-// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x i16>
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v8i16(ptr [[A]], <8 x i16> [[TMP6]], <8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i32 2)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v8i16(ptr [[A]], <8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], i32 2)
// CHECK-NEXT: ret void
//
void test_vst3q_p16(poly16_t * a, poly16x8x3_t b) {
@@ -22154,20 +19987,12 @@ void test_vst3q_p16(poly16_t * a, poly16x8x3_t b) {
// CHECK-LABEL: define void @test_vst3_u8(
// CHECK-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT8X8X3_T:%.*]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT8X8X3_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X3_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [3 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8>
// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v8i8(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], i32 1)
// CHECK-NEXT: ret void
//
@@ -22178,27 +20003,13 @@ void test_vst3_u8(uint8_t * a, uint8x8x3_t b) {
// CHECK-LABEL: define void @test_vst3_u16(
// CHECK-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT16X4X3_T:%.*]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT16X4X3_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X4X3_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [3 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X4X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X4X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X4X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
-// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
-// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v4i16(ptr [[A]], <4 x i16> [[TMP6]], <4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i32 2)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v4i16(ptr [[A]], <4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]], i32 2)
// CHECK-NEXT: ret void
//
void test_vst3_u16(uint16_t * a, uint16x4x3_t b) {
@@ -22208,27 +20019,13 @@ void test_vst3_u16(uint16_t * a, uint16x4x3_t b) {
// CHECK-LABEL: define void @test_vst3_u32(
// CHECK-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT32X2X3_T:%.*]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT32X2X3_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X2X3_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [3 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X2X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X2X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X2X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
-// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
-// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v2i32(ptr [[A]], <2 x i32> [[TMP6]], <2 x i32> [[TMP7]], <2 x i32> [[TMP8]], i32 4)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v2i32(ptr [[A]], <2 x i32> [[TMP0]], <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], i32 4)
// CHECK-NEXT: ret void
//
void test_vst3_u32(uint32_t * a, uint32x2x3_t b) {
@@ -22238,27 +20035,13 @@ void test_vst3_u32(uint32_t * a, uint32x2x3_t b) {
// CHECK-LABEL: define void @test_vst3_u64(
// CHECK-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT64X1X3_T:%.*]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT64X1X3_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT64X1X3_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [3 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT64X1X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[TMP0]] to <8 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT64X1X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[TMP2]] to <8 x i8>
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT64X1X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP4:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
-// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64>
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
-// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v1i64(ptr [[A]], <1 x i64> [[TMP6]], <1 x i64> [[TMP7]], <1 x i64> [[TMP8]], i32 4)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <1 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <1 x i64> poison, i64 [[B_COERCE_FCA_1_EXTRACT]], i64 0
+// CHECK-NEXT: [[TMP2:%.*]] = insertelement <1 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v1i64(ptr [[A]], <1 x i64> [[TMP0]], <1 x i64> [[TMP1]], <1 x i64> [[TMP2]], i32 4)
// CHECK-NEXT: ret void
//
void test_vst3_u64(uint64_t * a, uint64x1x3_t b) {
@@ -22268,20 +20051,12 @@ void test_vst3_u64(uint64_t * a, uint64x1x3_t b) {
// CHECK-LABEL: define void @test_vst3_s8(
// CHECK-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT8X8X3_T:%.*]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT8X8X3_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X3_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [3 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8>
// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v8i8(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], i32 1)
// CHECK-NEXT: ret void
//
@@ -22292,27 +20067,13 @@ void test_vst3_s8(int8_t * a, int8x8x3_t b) {
// CHECK-LABEL: define void @test_vst3_s16(
// CHECK-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT16X4X3_T:%.*]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT16X4X3_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X4X3_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [3 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X4X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X4X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X4X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
-// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
-// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v4i16(ptr [[A]], <4 x i16> [[TMP6]], <4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i32 2)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v4i16(ptr [[A]], <4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]], i32 2)
// CHECK-NEXT: ret void
//
void test_vst3_s16(int16_t * a, int16x4x3_t b) {
@@ -22322,27 +20083,13 @@ void test_vst3_s16(int16_t * a, int16x4x3_t b) {
// CHECK-LABEL: define void @test_vst3_s32(
// CHECK-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT32X2X3_T:%.*]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT32X2X3_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X2X3_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [3 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X2X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X2X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X2X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
-// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
-// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v2i32(ptr [[A]], <2 x i32> [[TMP6]], <2 x i32> [[TMP7]], <2 x i32> [[TMP8]], i32 4)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v2i32(ptr [[A]], <2 x i32> [[TMP0]], <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], i32 4)
// CHECK-NEXT: ret void
//
void test_vst3_s32(int32_t * a, int32x2x3_t b) {
@@ -22352,27 +20099,13 @@ void test_vst3_s32(int32_t * a, int32x2x3_t b) {
// CHECK-LABEL: define void @test_vst3_s64(
// CHECK-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT64X1X3_T:%.*]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT64X1X3_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT64X1X3_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [3 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT64X1X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[TMP0]] to <8 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT64X1X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[TMP2]] to <8 x i8>
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_INT64X1X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP4:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
-// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64>
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
-// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v1i64(ptr [[A]], <1 x i64> [[TMP6]], <1 x i64> [[TMP7]], <1 x i64> [[TMP8]], i32 4)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <1 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <1 x i64> poison, i64 [[B_COERCE_FCA_1_EXTRACT]], i64 0
+// CHECK-NEXT: [[TMP2:%.*]] = insertelement <1 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v1i64(ptr [[A]], <1 x i64> [[TMP0]], <1 x i64> [[TMP1]], <1 x i64> [[TMP2]], i32 4)
// CHECK-NEXT: ret void
//
void test_vst3_s64(int64_t * a, int64x1x3_t b) {
@@ -22382,27 +20115,13 @@ void test_vst3_s64(int64_t * a, int64x1x3_t b) {
// CHECK-LABEL: define void @test_vst3_f16(
// CHECK-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT16X4X3_T:%.*]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X4X3_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X4X3_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [3 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X4X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[TMP0]] to <8 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X4X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x half> [[TMP2]] to <8 x i8>
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X4X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP4:%.*]] = load <4 x half>, ptr [[ARRAYIDX4]], align 8
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
-// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half>
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
-// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v4f16(ptr [[A]], <4 x half> [[TMP6]], <4 x half> [[TMP7]], <4 x half> [[TMP8]], i32 2)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x half>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x half>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <4 x half>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v4f16(ptr [[A]], <4 x half> [[TMP0]], <4 x half> [[TMP1]], <4 x half> [[TMP2]], i32 2)
// CHECK-NEXT: ret void
//
void test_vst3_f16(float16_t * a, float16x4x3_t b) {
@@ -22412,27 +20131,13 @@ void test_vst3_f16(float16_t * a, float16x4x3_t b) {
// CHECK-LABEL: define void @test_vst3_f32(
// CHECK-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT32X2X3_T:%.*]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X2X3_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X2X3_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [3 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X2X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[TMP0]] to <8 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X2X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[TMP2]] to <8 x i8>
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X2X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 8
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
-// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float>
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
-// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v2f32(ptr [[A]], <2 x float> [[TMP6]], <2 x float> [[TMP7]], <2 x float> [[TMP8]], i32 4)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <2 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <2 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <2 x float>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v2f32(ptr [[A]], <2 x float> [[TMP0]], <2 x float> [[TMP1]], <2 x float> [[TMP2]], i32 4)
// CHECK-NEXT: ret void
//
void test_vst3_f32(float32_t * a, float32x2x3_t b) {
@@ -22442,20 +20147,12 @@ void test_vst3_f32(float32_t * a, float32x2x3_t b) {
// CHECK-LABEL: define void @test_vst3_p8(
// CHECK-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY8X8X3_T:%.*]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY8X8X3_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X3_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [3 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8>
// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v8i8(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], i32 1)
// CHECK-NEXT: ret void
//
@@ -22466,27 +20163,13 @@ void test_vst3_p8(poly8_t * a, poly8x8x3_t b) {
// CHECK-LABEL: define void @test_vst3_p16(
// CHECK-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY16X4X3_T:%.*]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY16X4X3_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X4X3_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [3 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X4X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X4X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X4X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
-// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
-// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v4i16(ptr [[A]], <4 x i16> [[TMP6]], <4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i32 2)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v4i16(ptr [[A]], <4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]], i32 2)
// CHECK-NEXT: ret void
//
void test_vst3_p16(poly16_t * a, poly16x4x3_t b) {
@@ -22496,27 +20179,22 @@ void test_vst3_p16(poly16_t * a, poly16x4x3_t b) {
// CHECK-LABEL: define void @test_vst3q_lane_u16(
// CHECK-SAME: ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT16X8X3_T:%.*]], align 16
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT16X8X3_T]], align 16
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X8X3_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [6 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X8X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X8X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8>
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X8X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
-// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x i16>
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK-NEXT: call void @llvm.arm.neon.vst3lane.p0.v8i16(ptr [[A]], <8 x i16> [[TMP6]], <8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i32 7, i32 2)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3lane.p0.v8i16(ptr [[A]], <8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], i32 7, i32 2)
// CHECK-NEXT: ret void
//
void test_vst3q_lane_u16(uint16_t * a, uint16x8x3_t b) {
@@ -22526,27 +20204,22 @@ void test_vst3q_lane_u16(uint16_t * a, uint16x8x3_t b) {
// CHECK-LABEL: define void @test_vst3q_lane_u32(
// CHECK-SAME: ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT32X4X3_T:%.*]], align 16
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT32X4X3_T]], align 16
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X4X3_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [6 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X4X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X4X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X4X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
-// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x i32>
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
-// CHECK-NEXT: call void @llvm.arm.neon.vst3lane.p0.v4i32(ptr [[A]], <4 x i32> [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i32 3, i32 4)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3lane.p0.v4i32(ptr [[A]], <4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], i32 3, i32 4)
// CHECK-NEXT: ret void
//
void test_vst3q_lane_u32(uint32_t * a, uint32x4x3_t b) {
@@ -22556,27 +20229,22 @@ void test_vst3q_lane_u32(uint32_t * a, uint32x4x3_t b) {
// CHECK-LABEL: define void @test_vst3q_lane_s16(
// CHECK-SAME: ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT16X8X3_T:%.*]], align 16
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT16X8X3_T]], align 16
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X8X3_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [6 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X8X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X8X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8>
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X8X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
-// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x i16>
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK-NEXT: call void @llvm.arm.neon.vst3lane.p0.v8i16(ptr [[A]], <8 x i16> [[TMP6]], <8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i32 7, i32 2)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3lane.p0.v8i16(ptr [[A]], <8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], i32 7, i32 2)
// CHECK-NEXT: ret void
//
void test_vst3q_lane_s16(int16_t * a, int16x8x3_t b) {
@@ -22586,27 +20254,22 @@ void test_vst3q_lane_s16(int16_t * a, int16x8x3_t b) {
// CHECK-LABEL: define void @test_vst3q_lane_s32(
// CHECK-SAME: ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT32X4X3_T:%.*]], align 16
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT32X4X3_T]], align 16
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X4X3_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [6 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X4X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X4X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X4X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
-// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x i32>
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
-// CHECK-NEXT: call void @llvm.arm.neon.vst3lane.p0.v4i32(ptr [[A]], <4 x i32> [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i32 3, i32 4)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3lane.p0.v4i32(ptr [[A]], <4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], i32 3, i32 4)
// CHECK-NEXT: ret void
//
void test_vst3q_lane_s32(int32_t * a, int32x4x3_t b) {
@@ -22616,27 +20279,22 @@ void test_vst3q_lane_s32(int32_t * a, int32x4x3_t b) {
// CHECK-LABEL: define void @test_vst3q_lane_f16(
// CHECK-SAME: ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT16X8X3_T:%.*]], align 16
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X8X3_T]], align 16
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X8X3_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [6 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X8X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[TMP0]] to <16 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X8X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x half> [[TMP2]] to <16 x i8>
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X8X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP4:%.*]] = load <8 x half>, ptr [[ARRAYIDX4]], align 16
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
-// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half>
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
-// CHECK-NEXT: call void @llvm.arm.neon.vst3lane.p0.v8f16(ptr [[A]], <8 x half> [[TMP6]], <8 x half> [[TMP7]], <8 x half> [[TMP8]], i32 7, i32 2)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <8 x half>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <8 x half>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <8 x half>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3lane.p0.v8f16(ptr [[A]], <8 x half> [[TMP0]], <8 x half> [[TMP1]], <8 x half> [[TMP2]], i32 7, i32 2)
// CHECK-NEXT: ret void
//
void test_vst3q_lane_f16(float16_t * a, float16x8x3_t b) {
@@ -22646,27 +20304,22 @@ void test_vst3q_lane_f16(float16_t * a, float16x8x3_t b) {
// CHECK-LABEL: define void @test_vst3q_lane_f32(
// CHECK-SAME: ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT32X4X3_T:%.*]], align 16
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X4X3_T]], align 16
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X4X3_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [6 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X4X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[TMP0]] to <16 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X4X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[TMP2]] to <16 x i8>
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X4X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP4:%.*]] = load <4 x float>, ptr [[ARRAYIDX4]], align 16
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
-// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float>
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
-// CHECK-NEXT: call void @llvm.arm.neon.vst3lane.p0.v4f32(ptr [[A]], <4 x float> [[TMP6]], <4 x float> [[TMP7]], <4 x float> [[TMP8]], i32 3, i32 4)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <4 x float>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3lane.p0.v4f32(ptr [[A]], <4 x float> [[TMP0]], <4 x float> [[TMP1]], <4 x float> [[TMP2]], i32 3, i32 4)
// CHECK-NEXT: ret void
//
void test_vst3q_lane_f32(float32_t * a, float32x4x3_t b) {
@@ -22676,27 +20329,22 @@ void test_vst3q_lane_f32(float32_t * a, float32x4x3_t b) {
// CHECK-LABEL: define void @test_vst3q_lane_p16(
// CHECK-SAME: ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY16X8X3_T:%.*]], align 16
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY16X8X3_T]], align 16
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X8X3_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [6 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X8X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X8X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8>
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X8X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
-// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x i16>
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK-NEXT: call void @llvm.arm.neon.vst3lane.p0.v8i16(ptr [[A]], <8 x i16> [[TMP6]], <8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i32 7, i32 2)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3lane.p0.v8i16(ptr [[A]], <8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], i32 7, i32 2)
// CHECK-NEXT: ret void
//
void test_vst3q_lane_p16(poly16_t * a, poly16x8x3_t b) {
@@ -22706,20 +20354,12 @@ void test_vst3q_lane_p16(poly16_t * a, poly16x8x3_t b) {
// CHECK-LABEL: define void @test_vst3_lane_u8(
// CHECK-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT8X8X3_T:%.*]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT8X8X3_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X3_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [3 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8>
// CHECK-NEXT: call void @llvm.arm.neon.vst3lane.p0.v8i8(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], i32 7, i32 1)
// CHECK-NEXT: ret void
//
@@ -22730,27 +20370,13 @@ void test_vst3_lane_u8(uint8_t * a, uint8x8x3_t b) {
// CHECK-LABEL: define void @test_vst3_lane_u16(
// CHECK-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT16X4X3_T:%.*]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT16X4X3_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X4X3_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [3 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X4X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X4X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X4X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
-// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
-// CHECK-NEXT: call void @llvm.arm.neon.vst3lane.p0.v4i16(ptr [[A]], <4 x i16> [[TMP6]], <4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i32 3, i32 2)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3lane.p0.v4i16(ptr [[A]], <4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]], i32 3, i32 2)
// CHECK-NEXT: ret void
//
void test_vst3_lane_u16(uint16_t * a, uint16x4x3_t b) {
@@ -22760,27 +20386,13 @@ void test_vst3_lane_u16(uint16_t * a, uint16x4x3_t b) {
// CHECK-LABEL: define void @test_vst3_lane_u32(
// CHECK-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT32X2X3_T:%.*]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT32X2X3_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X2X3_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [3 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X2X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X2X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X2X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
-// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
-// CHECK-NEXT: call void @llvm.arm.neon.vst3lane.p0.v2i32(ptr [[A]], <2 x i32> [[TMP6]], <2 x i32> [[TMP7]], <2 x i32> [[TMP8]], i32 1, i32 4)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3lane.p0.v2i32(ptr [[A]], <2 x i32> [[TMP0]], <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], i32 1, i32 4)
// CHECK-NEXT: ret void
//
void test_vst3_lane_u32(uint32_t * a, uint32x2x3_t b) {
@@ -22790,20 +20402,12 @@ void test_vst3_lane_u32(uint32_t * a, uint32x2x3_t b) {
// CHECK-LABEL: define void @test_vst3_lane_s8(
// CHECK-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT8X8X3_T:%.*]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT8X8X3_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X3_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [3 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8>
// CHECK-NEXT: call void @llvm.arm.neon.vst3lane.p0.v8i8(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], i32 7, i32 1)
// CHECK-NEXT: ret void
//
@@ -22814,27 +20418,13 @@ void test_vst3_lane_s8(int8_t * a, int8x8x3_t b) {
// CHECK-LABEL: define void @test_vst3_lane_s16(
// CHECK-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT16X4X3_T:%.*]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT16X4X3_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X4X3_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [3 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X4X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X4X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X4X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
-// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
-// CHECK-NEXT: call void @llvm.arm.neon.vst3lane.p0.v4i16(ptr [[A]], <4 x i16> [[TMP6]], <4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i32 3, i32 2)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3lane.p0.v4i16(ptr [[A]], <4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]], i32 3, i32 2)
// CHECK-NEXT: ret void
//
void test_vst3_lane_s16(int16_t * a, int16x4x3_t b) {
@@ -22844,27 +20434,13 @@ void test_vst3_lane_s16(int16_t * a, int16x4x3_t b) {
// CHECK-LABEL: define void @test_vst3_lane_s32(
// CHECK-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT32X2X3_T:%.*]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT32X2X3_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X2X3_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [3 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X2X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X2X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X2X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
-// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
-// CHECK-NEXT: call void @llvm.arm.neon.vst3lane.p0.v2i32(ptr [[A]], <2 x i32> [[TMP6]], <2 x i32> [[TMP7]], <2 x i32> [[TMP8]], i32 1, i32 4)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3lane.p0.v2i32(ptr [[A]], <2 x i32> [[TMP0]], <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], i32 1, i32 4)
// CHECK-NEXT: ret void
//
void test_vst3_lane_s32(int32_t * a, int32x2x3_t b) {
@@ -22874,27 +20450,13 @@ void test_vst3_lane_s32(int32_t * a, int32x2x3_t b) {
// CHECK-LABEL: define void @test_vst3_lane_f16(
// CHECK-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT16X4X3_T:%.*]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X4X3_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X4X3_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [3 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X4X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[TMP0]] to <8 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X4X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x half> [[TMP2]] to <8 x i8>
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X4X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP4:%.*]] = load <4 x half>, ptr [[ARRAYIDX4]], align 8
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
-// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half>
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
-// CHECK-NEXT: call void @llvm.arm.neon.vst3lane.p0.v4f16(ptr [[A]], <4 x half> [[TMP6]], <4 x half> [[TMP7]], <4 x half> [[TMP8]], i32 3, i32 2)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x half>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x half>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <4 x half>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3lane.p0.v4f16(ptr [[A]], <4 x half> [[TMP0]], <4 x half> [[TMP1]], <4 x half> [[TMP2]], i32 3, i32 2)
// CHECK-NEXT: ret void
//
void test_vst3_lane_f16(float16_t * a, float16x4x3_t b) {
@@ -22904,27 +20466,13 @@ void test_vst3_lane_f16(float16_t * a, float16x4x3_t b) {
// CHECK-LABEL: define void @test_vst3_lane_f32(
// CHECK-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT32X2X3_T:%.*]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X2X3_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X2X3_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [3 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X2X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[TMP0]] to <8 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X2X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[TMP2]] to <8 x i8>
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X2X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 8
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
-// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float>
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
-// CHECK-NEXT: call void @llvm.arm.neon.vst3lane.p0.v2f32(ptr [[A]], <2 x float> [[TMP6]], <2 x float> [[TMP7]], <2 x float> [[TMP8]], i32 1, i32 4)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <2 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <2 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <2 x float>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3lane.p0.v2f32(ptr [[A]], <2 x float> [[TMP0]], <2 x float> [[TMP1]], <2 x float> [[TMP2]], i32 1, i32 4)
// CHECK-NEXT: ret void
//
void test_vst3_lane_f32(float32_t * a, float32x2x3_t b) {
@@ -22934,20 +20482,12 @@ void test_vst3_lane_f32(float32_t * a, float32x2x3_t b) {
// CHECK-LABEL: define void @test_vst3_lane_p8(
// CHECK-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY8X8X3_T:%.*]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY8X8X3_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X3_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [3 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8>
// CHECK-NEXT: call void @llvm.arm.neon.vst3lane.p0.v8i8(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], i32 7, i32 1)
// CHECK-NEXT: ret void
//
@@ -22958,27 +20498,13 @@ void test_vst3_lane_p8(poly8_t * a, poly8x8x3_t b) {
// CHECK-LABEL: define void @test_vst3_lane_p16(
// CHECK-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY16X4X3_T:%.*]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY16X4X3_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X4X3_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [3 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X4X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X4X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X4X3_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
-// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
-// CHECK-NEXT: call void @llvm.arm.neon.vst3lane.p0.v4i16(ptr [[A]], <4 x i16> [[TMP6]], <4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i32 3, i32 2)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3lane.p0.v4i16(ptr [[A]], <4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]], i32 3, i32 2)
// CHECK-NEXT: ret void
//
void test_vst3_lane_p16(poly16_t * a, poly16x4x3_t b) {
@@ -22988,23 +20514,26 @@ void test_vst3_lane_p16(poly16_t * a, poly16x4x3_t b) {
// CHECK-LABEL: define void @test_vst4q_u8(
// CHECK-SAME: ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT8X16X4_T:%.*]], align 16
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT8X16X4_T]], align 16
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X16X4_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [8 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X16X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X16X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X16X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
-// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X16X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL5]], i32 0, i32 3
-// CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_6_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 6
+// CHECK-NEXT: [[B_SROA_9_48_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_6_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_7_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 7
+// CHECK-NEXT: [[B_SROA_9_56_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_9_48_VEC_INSERT]], i64 [[B_COERCE_FCA_7_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_SROA_9_56_VEC_INSERT]] to <16 x i8>
// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v16i8(ptr [[A]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i32 1)
// CHECK-NEXT: ret void
//
@@ -23015,32 +20544,27 @@ void test_vst4q_u8(uint8_t * a, uint8x16x4_t b) {
// CHECK-LABEL: define void @test_vst4q_u16(
// CHECK-SAME: ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT16X8X4_T:%.*]], align 16
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT16X8X4_T]], align 16
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X8X4_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [8 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8>
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
-// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i32 0, i32 3
-// CHECK-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x i16>
-// CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
-// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v8i16(ptr [[A]], <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i32 2)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_6_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 6
+// CHECK-NEXT: [[B_SROA_9_48_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_6_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_7_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 7
+// CHECK-NEXT: [[B_SROA_9_56_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_9_48_VEC_INSERT]], i64 [[B_COERCE_FCA_7_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_SROA_9_56_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v8i16(ptr [[A]], <8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], i32 2)
// CHECK-NEXT: ret void
//
void test_vst4q_u16(uint16_t * a, uint16x8x4_t b) {
@@ -23050,32 +20574,27 @@ void test_vst4q_u16(uint16_t * a, uint16x8x4_t b) {
// CHECK-LABEL: define void @test_vst4q_u32(
// CHECK-SAME: ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT32X4X4_T:%.*]], align 16
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT32X4X4_T]], align 16
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X4X4_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [8 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
-// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL5]], i32 0, i32 3
-// CHECK-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX6]], align 16
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x i32>
-// CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
-// CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
-// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v4i32(ptr [[A]], <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], i32 4)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_6_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 6
+// CHECK-NEXT: [[B_SROA_9_48_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_6_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_7_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 7
+// CHECK-NEXT: [[B_SROA_9_56_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_9_48_VEC_INSERT]], i64 [[B_COERCE_FCA_7_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_SROA_9_56_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v4i32(ptr [[A]], <4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], i32 4)
// CHECK-NEXT: ret void
//
void test_vst4q_u32(uint32_t * a, uint32x4x4_t b) {
@@ -23085,23 +20604,26 @@ void test_vst4q_u32(uint32_t * a, uint32x4x4_t b) {
// CHECK-LABEL: define void @test_vst4q_s8(
// CHECK-SAME: ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT8X16X4_T:%.*]], align 16
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT8X16X4_T]], align 16
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X16X4_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [8 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X16X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X16X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X16X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
-// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X16X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL5]], i32 0, i32 3
-// CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_6_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 6
+// CHECK-NEXT: [[B_SROA_9_48_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_6_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_7_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 7
+// CHECK-NEXT: [[B_SROA_9_56_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_9_48_VEC_INSERT]], i64 [[B_COERCE_FCA_7_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_SROA_9_56_VEC_INSERT]] to <16 x i8>
// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v16i8(ptr [[A]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i32 1)
// CHECK-NEXT: ret void
//
@@ -23112,32 +20634,27 @@ void test_vst4q_s8(int8_t * a, int8x16x4_t b) {
// CHECK-LABEL: define void @test_vst4q_s16(
// CHECK-SAME: ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT16X8X4_T:%.*]], align 16
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT16X8X4_T]], align 16
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X8X4_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [8 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8>
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
-// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i32 0, i32 3
-// CHECK-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x i16>
-// CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
-// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v8i16(ptr [[A]], <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i32 2)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_6_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 6
+// CHECK-NEXT: [[B_SROA_9_48_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_6_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_7_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 7
+// CHECK-NEXT: [[B_SROA_9_56_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_9_48_VEC_INSERT]], i64 [[B_COERCE_FCA_7_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_SROA_9_56_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v8i16(ptr [[A]], <8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], i32 2)
// CHECK-NEXT: ret void
//
void test_vst4q_s16(int16_t * a, int16x8x4_t b) {
@@ -23147,32 +20664,27 @@ void test_vst4q_s16(int16_t * a, int16x8x4_t b) {
// CHECK-LABEL: define void @test_vst4q_s32(
// CHECK-SAME: ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT32X4X4_T:%.*]], align 16
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT32X4X4_T]], align 16
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X4X4_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [8 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
-// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL5]], i32 0, i32 3
-// CHECK-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX6]], align 16
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x i32>
-// CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
-// CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
-// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v4i32(ptr [[A]], <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], i32 4)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_6_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 6
+// CHECK-NEXT: [[B_SROA_9_48_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_6_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_7_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 7
+// CHECK-NEXT: [[B_SROA_9_56_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_9_48_VEC_INSERT]], i64 [[B_COERCE_FCA_7_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_SROA_9_56_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v4i32(ptr [[A]], <4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], i32 4)
// CHECK-NEXT: ret void
//
void test_vst4q_s32(int32_t * a, int32x4x4_t b) {
@@ -23182,32 +20694,27 @@ void test_vst4q_s32(int32_t * a, int32x4x4_t b) {
// CHECK-LABEL: define void @test_vst4q_f16(
// CHECK-SAME: ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT16X8X4_T:%.*]], align 16
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X8X4_T]], align 16
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X8X4_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [8 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[TMP0]] to <16 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x half> [[TMP2]] to <16 x i8>
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP4:%.*]] = load <8 x half>, ptr [[ARRAYIDX4]], align 16
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
-// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL5]], i32 0, i32 3
-// CHECK-NEXT: [[TMP6:%.*]] = load <8 x half>, ptr [[ARRAYIDX6]], align 16
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
-// CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half>
-// CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
-// CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x half>
-// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v8f16(ptr [[A]], <8 x half> [[TMP8]], <8 x half> [[TMP9]], <8 x half> [[TMP10]], <8 x half> [[TMP11]], i32 2)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_6_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 6
+// CHECK-NEXT: [[B_SROA_9_48_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_6_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_7_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 7
+// CHECK-NEXT: [[B_SROA_9_56_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_9_48_VEC_INSERT]], i64 [[B_COERCE_FCA_7_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <8 x half>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <8 x half>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <8 x half>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_SROA_9_56_VEC_INSERT]] to <8 x half>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v8f16(ptr [[A]], <8 x half> [[TMP0]], <8 x half> [[TMP1]], <8 x half> [[TMP2]], <8 x half> [[TMP3]], i32 2)
// CHECK-NEXT: ret void
//
void test_vst4q_f16(float16_t * a, float16x8x4_t b) {
@@ -23217,32 +20724,27 @@ void test_vst4q_f16(float16_t * a, float16x8x4_t b) {
// CHECK-LABEL: define void @test_vst4q_f32(
// CHECK-SAME: ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT32X4X4_T:%.*]], align 16
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X4X4_T]], align 16
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X4X4_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [8 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[TMP0]] to <16 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[TMP2]] to <16 x i8>
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP4:%.*]] = load <4 x float>, ptr [[ARRAYIDX4]], align 16
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
-// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL5]], i32 0, i32 3
-// CHECK-NEXT: [[TMP6:%.*]] = load <4 x float>, ptr [[ARRAYIDX6]], align 16
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
-// CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float>
-// CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
-// CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
-// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v4f32(ptr [[A]], <4 x float> [[TMP8]], <4 x float> [[TMP9]], <4 x float> [[TMP10]], <4 x float> [[TMP11]], i32 4)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_6_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 6
+// CHECK-NEXT: [[B_SROA_9_48_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_6_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_7_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 7
+// CHECK-NEXT: [[B_SROA_9_56_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_9_48_VEC_INSERT]], i64 [[B_COERCE_FCA_7_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <4 x float>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_SROA_9_56_VEC_INSERT]] to <4 x float>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v4f32(ptr [[A]], <4 x float> [[TMP0]], <4 x float> [[TMP1]], <4 x float> [[TMP2]], <4 x float> [[TMP3]], i32 4)
// CHECK-NEXT: ret void
//
void test_vst4q_f32(float32_t * a, float32x4x4_t b) {
@@ -23252,23 +20754,26 @@ void test_vst4q_f32(float32_t * a, float32x4x4_t b) {
// CHECK-LABEL: define void @test_vst4q_p8(
// CHECK-SAME: ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY8X16X4_T:%.*]], align 16
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY8X16X4_T]], align 16
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X16X4_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [8 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X16X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X16X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X16X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
-// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X16X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL5]], i32 0, i32 3
-// CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_6_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 6
+// CHECK-NEXT: [[B_SROA_9_48_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_6_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_7_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 7
+// CHECK-NEXT: [[B_SROA_9_56_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_9_48_VEC_INSERT]], i64 [[B_COERCE_FCA_7_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_SROA_9_56_VEC_INSERT]] to <16 x i8>
// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v16i8(ptr [[A]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i32 1)
// CHECK-NEXT: ret void
//
@@ -23279,32 +20784,27 @@ void test_vst4q_p8(poly8_t * a, poly8x16x4_t b) {
// CHECK-LABEL: define void @test_vst4q_p16(
// CHECK-SAME: ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY16X8X4_T:%.*]], align 16
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY16X8X4_T]], align 16
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X8X4_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [8 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8>
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
-// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i32 0, i32 3
-// CHECK-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x i16>
-// CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
-// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v8i16(ptr [[A]], <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i32 2)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_6_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 6
+// CHECK-NEXT: [[B_SROA_9_48_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_6_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_7_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 7
+// CHECK-NEXT: [[B_SROA_9_56_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_9_48_VEC_INSERT]], i64 [[B_COERCE_FCA_7_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_SROA_9_56_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v8i16(ptr [[A]], <8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], i32 2)
// CHECK-NEXT: ret void
//
void test_vst4q_p16(poly16_t * a, poly16x8x4_t b) {
@@ -23314,23 +20814,14 @@ void test_vst4q_p16(poly16_t * a, poly16x8x4_t b) {
// CHECK-LABEL: define void @test_vst4_u8(
// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT8X8X4_T:%.*]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT8X8X4_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X4_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i32 0, i32 3
-// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <8 x i8>
// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v8i8(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i32 1)
// CHECK-NEXT: ret void
//
@@ -23341,32 +20832,15 @@ void test_vst4_u8(uint8_t * a, uint8x8x4_t b) {
// CHECK-LABEL: define void @test_vst4_u16(
// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT16X4X4_T:%.*]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT16X4X4_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X4X4_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
-// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i32 0, i32 3
-// CHECK-NEXT: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
-// CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
-// CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
-// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v4i16(ptr [[A]], <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i32 2)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v4i16(ptr [[A]], <4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], i32 2)
// CHECK-NEXT: ret void
//
void test_vst4_u16(uint16_t * a, uint16x4x4_t b) {
@@ -23376,32 +20850,15 @@ void test_vst4_u16(uint16_t * a, uint16x4x4_t b) {
// CHECK-LABEL: define void @test_vst4_u32(
// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT32X2X4_T:%.*]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT32X2X4_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X2X4_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
-// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL5]], i32 0, i32 3
-// CHECK-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 8
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
-// CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
-// CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
-// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v2i32(ptr [[A]], <2 x i32> [[TMP8]], <2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], i32 4)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v2i32(ptr [[A]], <2 x i32> [[TMP0]], <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], i32 4)
// CHECK-NEXT: ret void
//
void test_vst4_u32(uint32_t * a, uint32x2x4_t b) {
@@ -23411,32 +20868,15 @@ void test_vst4_u32(uint32_t * a, uint32x2x4_t b) {
// CHECK-LABEL: define void @test_vst4_u64(
// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT64X1X4_T:%.*]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT64X1X4_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT64X1X4_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT64X1X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[TMP0]] to <8 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT64X1X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[TMP2]] to <8 x i8>
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT64X1X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP4:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
-// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT64X1X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL5]], i32 0, i32 3
-// CHECK-NEXT: [[TMP6:%.*]] = load <1 x i64>, ptr [[ARRAYIDX6]], align 8
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64>
-// CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
-// CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
-// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v1i64(ptr [[A]], <1 x i64> [[TMP8]], <1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], i32 4)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <1 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <1 x i64> poison, i64 [[B_COERCE_FCA_1_EXTRACT]], i64 0
+// CHECK-NEXT: [[TMP2:%.*]] = insertelement <1 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[TMP3:%.*]] = insertelement <1 x i64> poison, i64 [[B_COERCE_FCA_3_EXTRACT]], i64 0
+// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v1i64(ptr [[A]], <1 x i64> [[TMP0]], <1 x i64> [[TMP1]], <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], i32 4)
// CHECK-NEXT: ret void
//
void test_vst4_u64(uint64_t * a, uint64x1x4_t b) {
@@ -23446,23 +20886,14 @@ void test_vst4_u64(uint64_t * a, uint64x1x4_t b) {
// CHECK-LABEL: define void @test_vst4_s8(
// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT8X8X4_T:%.*]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT8X8X4_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X4_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i32 0, i32 3
-// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <8 x i8>
// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v8i8(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i32 1)
// CHECK-NEXT: ret void
//
@@ -23473,32 +20904,15 @@ void test_vst4_s8(int8_t * a, int8x8x4_t b) {
// CHECK-LABEL: define void @test_vst4_s16(
// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT16X4X4_T:%.*]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT16X4X4_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X4X4_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
-// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i32 0, i32 3
-// CHECK-NEXT: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
-// CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
-// CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
-// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v4i16(ptr [[A]], <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i32 2)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v4i16(ptr [[A]], <4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], i32 2)
// CHECK-NEXT: ret void
//
void test_vst4_s16(int16_t * a, int16x4x4_t b) {
@@ -23508,32 +20922,15 @@ void test_vst4_s16(int16_t * a, int16x4x4_t b) {
// CHECK-LABEL: define void @test_vst4_s32(
// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT32X2X4_T:%.*]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT32X2X4_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X2X4_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
-// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL5]], i32 0, i32 3
-// CHECK-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 8
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
-// CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
-// CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
-// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v2i32(ptr [[A]], <2 x i32> [[TMP8]], <2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], i32 4)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v2i32(ptr [[A]], <2 x i32> [[TMP0]], <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], i32 4)
// CHECK-NEXT: ret void
//
void test_vst4_s32(int32_t * a, int32x2x4_t b) {
@@ -23543,32 +20940,15 @@ void test_vst4_s32(int32_t * a, int32x2x4_t b) {
// CHECK-LABEL: define void @test_vst4_s64(
// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT64X1X4_T:%.*]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT64X1X4_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT64X1X4_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT64X1X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[TMP0]] to <8 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT64X1X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[TMP2]] to <8 x i8>
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_INT64X1X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP4:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
-// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_INT64X1X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL5]], i32 0, i32 3
-// CHECK-NEXT: [[TMP6:%.*]] = load <1 x i64>, ptr [[ARRAYIDX6]], align 8
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64>
-// CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
-// CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
-// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v1i64(ptr [[A]], <1 x i64> [[TMP8]], <1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], i32 4)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <1 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <1 x i64> poison, i64 [[B_COERCE_FCA_1_EXTRACT]], i64 0
+// CHECK-NEXT: [[TMP2:%.*]] = insertelement <1 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[TMP3:%.*]] = insertelement <1 x i64> poison, i64 [[B_COERCE_FCA_3_EXTRACT]], i64 0
+// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v1i64(ptr [[A]], <1 x i64> [[TMP0]], <1 x i64> [[TMP1]], <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], i32 4)
// CHECK-NEXT: ret void
//
void test_vst4_s64(int64_t * a, int64x1x4_t b) {
@@ -23578,32 +20958,15 @@ void test_vst4_s64(int64_t * a, int64x1x4_t b) {
// CHECK-LABEL: define void @test_vst4_f16(
// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT16X4X4_T:%.*]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X4X4_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X4X4_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[TMP0]] to <8 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x half> [[TMP2]] to <8 x i8>
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP4:%.*]] = load <4 x half>, ptr [[ARRAYIDX4]], align 8
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
-// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL5]], i32 0, i32 3
-// CHECK-NEXT: [[TMP6:%.*]] = load <4 x half>, ptr [[ARRAYIDX6]], align 8
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
-// CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half>
-// CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
-// CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half>
-// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v4f16(ptr [[A]], <4 x half> [[TMP8]], <4 x half> [[TMP9]], <4 x half> [[TMP10]], <4 x half> [[TMP11]], i32 2)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x half>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x half>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <4 x half>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <4 x half>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v4f16(ptr [[A]], <4 x half> [[TMP0]], <4 x half> [[TMP1]], <4 x half> [[TMP2]], <4 x half> [[TMP3]], i32 2)
// CHECK-NEXT: ret void
//
void test_vst4_f16(float16_t * a, float16x4x4_t b) {
@@ -23613,32 +20976,15 @@ void test_vst4_f16(float16_t * a, float16x4x4_t b) {
// CHECK-LABEL: define void @test_vst4_f32(
// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT32X2X4_T:%.*]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X2X4_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X2X4_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[TMP0]] to <8 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[TMP2]] to <8 x i8>
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 8
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
-// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL5]], i32 0, i32 3
-// CHECK-NEXT: [[TMP6:%.*]] = load <2 x float>, ptr [[ARRAYIDX6]], align 8
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
-// CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float>
-// CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
-// CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
-// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v2f32(ptr [[A]], <2 x float> [[TMP8]], <2 x float> [[TMP9]], <2 x float> [[TMP10]], <2 x float> [[TMP11]], i32 4)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <2 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <2 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <2 x float>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <2 x float>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v2f32(ptr [[A]], <2 x float> [[TMP0]], <2 x float> [[TMP1]], <2 x float> [[TMP2]], <2 x float> [[TMP3]], i32 4)
// CHECK-NEXT: ret void
//
void test_vst4_f32(float32_t * a, float32x2x4_t b) {
@@ -23648,23 +20994,14 @@ void test_vst4_f32(float32_t * a, float32x2x4_t b) {
// CHECK-LABEL: define void @test_vst4_p8(
// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY8X8X4_T:%.*]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY8X8X4_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X4_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i32 0, i32 3
-// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <8 x i8>
// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v8i8(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i32 1)
// CHECK-NEXT: ret void
//
@@ -23675,32 +21012,15 @@ void test_vst4_p8(poly8_t * a, poly8x8x4_t b) {
// CHECK-LABEL: define void @test_vst4_p16(
// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY16X4X4_T:%.*]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY16X4X4_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X4X4_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
-// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i32 0, i32 3
-// CHECK-NEXT: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
-// CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
-// CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
-// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v4i16(ptr [[A]], <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i32 2)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v4i16(ptr [[A]], <4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], i32 2)
// CHECK-NEXT: ret void
//
void test_vst4_p16(poly16_t * a, poly16x4x4_t b) {
@@ -23710,32 +21030,27 @@ void test_vst4_p16(poly16_t * a, poly16x4x4_t b) {
// CHECK-LABEL: define void @test_vst4q_lane_u16(
// CHECK-SAME: ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT16X8X4_T:%.*]], align 16
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT16X8X4_T]], align 16
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X8X4_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [8 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8>
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
-// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i32 0, i32 3
-// CHECK-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x i16>
-// CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
-// CHECK-NEXT: call void @llvm.arm.neon.vst4lane.p0.v8i16(ptr [[A]], <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i32 7, i32 2)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_6_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 6
+// CHECK-NEXT: [[B_SROA_9_48_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_6_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_7_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 7
+// CHECK-NEXT: [[B_SROA_9_56_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_9_48_VEC_INSERT]], i64 [[B_COERCE_FCA_7_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_SROA_9_56_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4lane.p0.v8i16(ptr [[A]], <8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], i32 7, i32 2)
// CHECK-NEXT: ret void
//
void test_vst4q_lane_u16(uint16_t * a, uint16x8x4_t b) {
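For the Q-register lane variants, the 64-byte struct is coerced to [8 x i64], so SROA reassembles consecutive i64 pairs into <2 x i64> values (the B_SROA_*_VEC_INSERT names above) before bitcasting each pair to the 128-bit element type. A sketch of the call being checked; the lane index 7 matches the i32 7 operand of the vst4lane intrinsic call, and only the example_ wrapper name is invented:

#include <arm_neon.h>

/* Stores lane 7 of each of the four uint16x8_t vectors in b, interleaved,
   to a. The lane must be a constant in [0, 7] for 8 x 16-bit lanes. */
void example_vst4q_lane_u16(uint16_t *a, uint16x8x4_t b) {
  vst4q_lane_u16(a, b, 7);
}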
@@ -23745,32 +21060,27 @@ void test_vst4q_lane_u16(uint16_t * a, uint16x8x4_t b) {
// CHECK-LABEL: define void @test_vst4q_lane_u32(
// CHECK-SAME: ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT32X4X4_T:%.*]], align 16
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT32X4X4_T]], align 16
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X4X4_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [8 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
-// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL5]], i32 0, i32 3
-// CHECK-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX6]], align 16
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x i32>
-// CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
-// CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
-// CHECK-NEXT: call void @llvm.arm.neon.vst4lane.p0.v4i32(ptr [[A]], <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], i32 3, i32 4)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_6_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 6
+// CHECK-NEXT: [[B_SROA_9_48_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_6_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_7_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 7
+// CHECK-NEXT: [[B_SROA_9_56_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_9_48_VEC_INSERT]], i64 [[B_COERCE_FCA_7_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_SROA_9_56_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4lane.p0.v4i32(ptr [[A]], <4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], i32 3, i32 4)
// CHECK-NEXT: ret void
//
void test_vst4q_lane_u32(uint32_t * a, uint32x4x4_t b) {
@@ -23780,32 +21090,27 @@ void test_vst4q_lane_u32(uint32_t * a, uint32x4x4_t b) {
// CHECK-LABEL: define void @test_vst4q_lane_s16(
// CHECK-SAME: ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT16X8X4_T:%.*]], align 16
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT16X8X4_T]], align 16
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X8X4_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [8 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8>
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
-// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i32 0, i32 3
-// CHECK-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x i16>
-// CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
-// CHECK-NEXT: call void @llvm.arm.neon.vst4lane.p0.v8i16(ptr [[A]], <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i32 7, i32 2)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_6_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 6
+// CHECK-NEXT: [[B_SROA_9_48_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_6_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_7_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 7
+// CHECK-NEXT: [[B_SROA_9_56_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_9_48_VEC_INSERT]], i64 [[B_COERCE_FCA_7_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_SROA_9_56_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4lane.p0.v8i16(ptr [[A]], <8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], i32 7, i32 2)
// CHECK-NEXT: ret void
//
void test_vst4q_lane_s16(int16_t * a, int16x8x4_t b) {
@@ -23815,32 +21120,27 @@ void test_vst4q_lane_s16(int16_t * a, int16x8x4_t b) {
// CHECK-LABEL: define void @test_vst4q_lane_s32(
// CHECK-SAME: ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT32X4X4_T:%.*]], align 16
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT32X4X4_T]], align 16
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X4X4_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [8 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
-// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL5]], i32 0, i32 3
-// CHECK-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX6]], align 16
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x i32>
-// CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
-// CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
-// CHECK-NEXT: call void @llvm.arm.neon.vst4lane.p0.v4i32(ptr [[A]], <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], i32 3, i32 4)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_6_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 6
+// CHECK-NEXT: [[B_SROA_9_48_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_6_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_7_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 7
+// CHECK-NEXT: [[B_SROA_9_56_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_9_48_VEC_INSERT]], i64 [[B_COERCE_FCA_7_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_SROA_9_56_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4lane.p0.v4i32(ptr [[A]], <4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], i32 3, i32 4)
// CHECK-NEXT: ret void
//
void test_vst4q_lane_s32(int32_t * a, int32x4x4_t b) {
@@ -23850,32 +21150,27 @@ void test_vst4q_lane_s32(int32_t * a, int32x4x4_t b) {
// CHECK-LABEL: define void @test_vst4q_lane_f16(
// CHECK-SAME: ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT16X8X4_T:%.*]], align 16
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X8X4_T]], align 16
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X8X4_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [8 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[TMP0]] to <16 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x half> [[TMP2]] to <16 x i8>
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP4:%.*]] = load <8 x half>, ptr [[ARRAYIDX4]], align 16
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
-// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL5]], i32 0, i32 3
-// CHECK-NEXT: [[TMP6:%.*]] = load <8 x half>, ptr [[ARRAYIDX6]], align 16
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
-// CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half>
-// CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
-// CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x half>
-// CHECK-NEXT: call void @llvm.arm.neon.vst4lane.p0.v8f16(ptr [[A]], <8 x half> [[TMP8]], <8 x half> [[TMP9]], <8 x half> [[TMP10]], <8 x half> [[TMP11]], i32 7, i32 2)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_6_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 6
+// CHECK-NEXT: [[B_SROA_9_48_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_6_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_7_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 7
+// CHECK-NEXT: [[B_SROA_9_56_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_9_48_VEC_INSERT]], i64 [[B_COERCE_FCA_7_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <8 x half>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <8 x half>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <8 x half>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_SROA_9_56_VEC_INSERT]] to <8 x half>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4lane.p0.v8f16(ptr [[A]], <8 x half> [[TMP0]], <8 x half> [[TMP1]], <8 x half> [[TMP2]], <8 x half> [[TMP3]], i32 7, i32 2)
// CHECK-NEXT: ret void
//
void test_vst4q_lane_f16(float16_t * a, float16x8x4_t b) {
@@ -23885,32 +21180,27 @@ void test_vst4q_lane_f16(float16_t * a, float16x8x4_t b) {
// CHECK-LABEL: define void @test_vst4q_lane_f32(
// CHECK-SAME: ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT32X4X4_T:%.*]], align 16
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X4X4_T]], align 16
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X4X4_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [8 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[TMP0]] to <16 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[TMP2]] to <16 x i8>
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP4:%.*]] = load <4 x float>, ptr [[ARRAYIDX4]], align 16
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
-// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL5]], i32 0, i32 3
-// CHECK-NEXT: [[TMP6:%.*]] = load <4 x float>, ptr [[ARRAYIDX6]], align 16
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
-// CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float>
-// CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
-// CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
-// CHECK-NEXT: call void @llvm.arm.neon.vst4lane.p0.v4f32(ptr [[A]], <4 x float> [[TMP8]], <4 x float> [[TMP9]], <4 x float> [[TMP10]], <4 x float> [[TMP11]], i32 3, i32 4)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_6_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 6
+// CHECK-NEXT: [[B_SROA_9_48_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_6_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_7_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 7
+// CHECK-NEXT: [[B_SROA_9_56_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_9_48_VEC_INSERT]], i64 [[B_COERCE_FCA_7_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <4 x float>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_SROA_9_56_VEC_INSERT]] to <4 x float>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4lane.p0.v4f32(ptr [[A]], <4 x float> [[TMP0]], <4 x float> [[TMP1]], <4 x float> [[TMP2]], <4 x float> [[TMP3]], i32 3, i32 4)
// CHECK-NEXT: ret void
//
void test_vst4q_lane_f32(float32_t * a, float32x4x4_t b) {
@@ -23920,32 +21210,27 @@ void test_vst4q_lane_f32(float32_t * a, float32x4x4_t b) {
// CHECK-LABEL: define void @test_vst4q_lane_p16(
// CHECK-SAME: ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY16X8X4_T:%.*]], align 16
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY16X8X4_T]], align 16
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X8X4_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [8 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 16
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8>
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
-// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i32 0, i32 3
-// CHECK-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x i16>
-// CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
-// CHECK-NEXT: call void @llvm.arm.neon.vst4lane.p0.v8i16(ptr [[A]], <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i32 7, i32 2)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_6_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 6
+// CHECK-NEXT: [[B_SROA_9_48_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_6_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_7_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 7
+// CHECK-NEXT: [[B_SROA_9_56_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_9_48_VEC_INSERT]], i64 [[B_COERCE_FCA_7_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_SROA_9_56_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4lane.p0.v8i16(ptr [[A]], <8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], i32 7, i32 2)
// CHECK-NEXT: ret void
//
void test_vst4q_lane_p16(poly16_t * a, poly16x8x4_t b) {
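
An aside on what the updated checks encode: for the q-register case the [8 x i64]-coerced
argument is no longer spilled to a stack slot; each 128-bit vector is rebuilt from two
coercion words with insertelement and then bitcast to <8 x i16> before the vst4lane call.
The store itself has single-lane ST4 semantics: one lane from each of the four source
vectors goes to four consecutive elements. A minimal scalar sketch of that store
(hypothetical helper, not part of the patch):

  #include <stdint.h>

  /* Scalar model of a single-lane st4: lane `lane` of each of the four
     8-element sources is written to four consecutive 16-bit slots.
     The test above uses lane 7. */
  static void st4q_lane_u16_model(uint16_t *dst, const uint16_t src[4][8],
                                  int lane) {
    for (int i = 0; i < 4; ++i)
      dst[i] = src[i][lane];
  }
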
@@ -23955,23 +21240,14 @@ void test_vst4q_lane_p16(poly16_t * a, poly16x8x4_t b) {
// CHECK-LABEL: define void @test_vst4_lane_u8(
// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT8X8X4_T:%.*]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT8X8X4_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X4_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i32 0, i32 3
-// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <8 x i8>
// CHECK-NEXT: call void @llvm.arm.neon.vst4lane.p0.v8i8(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i32 7, i32 1)
// CHECK-NEXT: ret void
//
@@ -23982,32 +21258,15 @@ void test_vst4_lane_u8(uint8_t * a, uint8x8x4_t b) {
// CHECK-LABEL: define void @test_vst4_lane_u16(
// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT16X4X4_T:%.*]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT16X4X4_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X4X4_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
-// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i32 0, i32 3
-// CHECK-NEXT: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
-// CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
-// CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
-// CHECK-NEXT: call void @llvm.arm.neon.vst4lane.p0.v4i16(ptr [[A]], <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i32 3, i32 2)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4lane.p0.v4i16(ptr [[A]], <4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], i32 3, i32 2)
// CHECK-NEXT: ret void
//
void test_vst4_lane_u16(uint16_t * a, uint16x4x4_t b) {
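
For the d-register variants the coercion is simpler still: each i64 word of the
[4 x i64] argument is bitcast straight to a 64-bit vector, with no alloca/memcpy
round trip. The bitcast is a pure reinterpretation of the 64 bits; a sketch of the
equivalent in C (hypothetical helper, little-endian lane order assumed):

  #include <stdint.h>
  #include <string.h>

  /* Reinterpret one 64-bit coercion word as four 16-bit lanes, as the
     `bitcast i64 ... to <4 x i16>` lines above do. On a little-endian
     target, lane 0 is the low 16 bits of the word. */
  static void i64_to_4xu16(uint64_t w, uint16_t lanes[4]) {
    memcpy(lanes, &w, sizeof w);
  }
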
@@ -24017,32 +21276,15 @@ void test_vst4_lane_u16(uint16_t * a, uint16x4x4_t b) {
// CHECK-LABEL: define void @test_vst4_lane_u32(
// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT32X2X4_T:%.*]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT32X2X4_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X2X4_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
-// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL5]], i32 0, i32 3
-// CHECK-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 8
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
-// CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
-// CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
-// CHECK-NEXT: call void @llvm.arm.neon.vst4lane.p0.v2i32(ptr [[A]], <2 x i32> [[TMP8]], <2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], i32 1, i32 4)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4lane.p0.v2i32(ptr [[A]], <2 x i32> [[TMP0]], <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], i32 1, i32 4)
// CHECK-NEXT: ret void
//
void test_vst4_lane_u32(uint32_t * a, uint32x2x4_t b) {
@@ -24052,23 +21294,14 @@ void test_vst4_lane_u32(uint32_t * a, uint32x2x4_t b) {
// CHECK-LABEL: define void @test_vst4_lane_s8(
// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT8X8X4_T:%.*]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT8X8X4_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X4_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i32 0, i32 3
-// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <8 x i8>
// CHECK-NEXT: call void @llvm.arm.neon.vst4lane.p0.v8i8(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i32 7, i32 1)
// CHECK-NEXT: ret void
//
@@ -24079,32 +21312,15 @@ void test_vst4_lane_s8(int8_t * a, int8x8x4_t b) {
// CHECK-LABEL: define void @test_vst4_lane_s16(
// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT16X4X4_T:%.*]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT16X4X4_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X4X4_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
-// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_INT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i32 0, i32 3
-// CHECK-NEXT: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
-// CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
-// CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
-// CHECK-NEXT: call void @llvm.arm.neon.vst4lane.p0.v4i16(ptr [[A]], <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i32 3, i32 2)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4lane.p0.v4i16(ptr [[A]], <4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], i32 3, i32 2)
// CHECK-NEXT: ret void
//
void test_vst4_lane_s16(int16_t * a, int16x4x4_t b) {
@@ -24114,32 +21330,15 @@ void test_vst4_lane_s16(int16_t * a, int16x4x4_t b) {
// CHECK-LABEL: define void @test_vst4_lane_s32(
// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT32X2X4_T:%.*]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT32X2X4_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X2X4_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
-// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_INT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL5]], i32 0, i32 3
-// CHECK-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 8
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
-// CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
-// CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
-// CHECK-NEXT: call void @llvm.arm.neon.vst4lane.p0.v2i32(ptr [[A]], <2 x i32> [[TMP8]], <2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], i32 1, i32 4)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4lane.p0.v2i32(ptr [[A]], <2 x i32> [[TMP0]], <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], i32 1, i32 4)
// CHECK-NEXT: ret void
//
void test_vst4_lane_s32(int32_t * a, int32x2x4_t b) {
@@ -24149,32 +21348,15 @@ void test_vst4_lane_s32(int32_t * a, int32x2x4_t b) {
// CHECK-LABEL: define void @test_vst4_lane_f16(
// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT16X4X4_T:%.*]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X4X4_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X4X4_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[TMP0]] to <8 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x half> [[TMP2]] to <8 x i8>
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP4:%.*]] = load <4 x half>, ptr [[ARRAYIDX4]], align 8
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
-// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL5]], i32 0, i32 3
-// CHECK-NEXT: [[TMP6:%.*]] = load <4 x half>, ptr [[ARRAYIDX6]], align 8
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
-// CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half>
-// CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
-// CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half>
-// CHECK-NEXT: call void @llvm.arm.neon.vst4lane.p0.v4f16(ptr [[A]], <4 x half> [[TMP8]], <4 x half> [[TMP9]], <4 x half> [[TMP10]], <4 x half> [[TMP11]], i32 3, i32 2)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x half>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x half>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <4 x half>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <4 x half>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4lane.p0.v4f16(ptr [[A]], <4 x half> [[TMP0]], <4 x half> [[TMP1]], <4 x half> [[TMP2]], <4 x half> [[TMP3]], i32 3, i32 2)
// CHECK-NEXT: ret void
//
void test_vst4_lane_f16(float16_t * a, float16x4x4_t b) {
@@ -24184,32 +21366,15 @@ void test_vst4_lane_f16(float16_t * a, float16x4x4_t b) {
// CHECK-LABEL: define void @test_vst4_lane_f32(
// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT32X2X4_T:%.*]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X2X4_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X2X4_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[TMP0]] to <8 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[TMP2]] to <8 x i8>
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 8
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
-// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT32X2X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL5]], i32 0, i32 3
-// CHECK-NEXT: [[TMP6:%.*]] = load <2 x float>, ptr [[ARRAYIDX6]], align 8
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
-// CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float>
-// CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
-// CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
-// CHECK-NEXT: call void @llvm.arm.neon.vst4lane.p0.v2f32(ptr [[A]], <2 x float> [[TMP8]], <2 x float> [[TMP9]], <2 x float> [[TMP10]], <2 x float> [[TMP11]], i32 1, i32 4)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <2 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <2 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <2 x float>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <2 x float>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4lane.p0.v2f32(ptr [[A]], <2 x float> [[TMP0]], <2 x float> [[TMP1]], <2 x float> [[TMP2]], <2 x float> [[TMP3]], i32 1, i32 4)
// CHECK-NEXT: ret void
//
void test_vst4_lane_f32(float32_t * a, float32x2x4_t b) {
@@ -24219,23 +21384,14 @@ void test_vst4_lane_f32(float32_t * a, float32x2x4_t b) {
// CHECK-LABEL: define void @test_vst4_lane_p8(
// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY8X8X4_T:%.*]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY8X8X4_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X4_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i32 0, i32 3
-// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <8 x i8>
// CHECK-NEXT: call void @llvm.arm.neon.vst4lane.p0.v8i8(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i32 7, i32 1)
// CHECK-NEXT: ret void
//
@@ -24246,32 +21402,15 @@ void test_vst4_lane_p8(poly8_t * a, poly8x8x4_t b) {
// CHECK-LABEL: define void @test_vst4_lane_p16(
// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY16X4X4_T:%.*]], align 8
-// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY16X4X4_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X4X4_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK-NEXT: [[VAL1:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
-// CHECK-NEXT: [[VAL3:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
-// CHECK-NEXT: [[VAL5:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY16X4X4_T]], ptr [[__S1]], i32 0, i32 0
-// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i32 0, i32 3
-// CHECK-NEXT: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
-// CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
-// CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
-// CHECK-NEXT: call void @llvm.arm.neon.vst4lane.p0.v4i16(ptr [[A]], <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i32 3, i32 2)
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4lane.p0.v4i16(ptr [[A]], <4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], i32 3, i32 2)
// CHECK-NEXT: ret void
//
void test_vst4_lane_p16(poly16_t * a, poly16x4x4_t b) {
@@ -24461,11 +21600,9 @@ uint64x2_t test_vsubq_u64(uint64x2_t a, uint64x2_t b) {
// CHECK-LABEL: define <8 x i8> @test_vsubhn_s16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
// CHECK-NEXT: [[VSUBHN_I:%.*]] = sub <8 x i16> [[A]], [[B]]
// CHECK-NEXT: [[VSUBHN1_I:%.*]] = lshr <8 x i16> [[VSUBHN_I]], splat (i16 8)
-// CHECK-NEXT: [[VSUBHN2_I:%.*]] = trunc <8 x i16> [[VSUBHN1_I]] to <8 x i8>
+// CHECK-NEXT: [[VSUBHN2_I:%.*]] = trunc nuw <8 x i16> [[VSUBHN1_I]] to <8 x i8>
// CHECK-NEXT: ret <8 x i8> [[VSUBHN2_I]]
//
int8x8_t test_vsubhn_s16(int16x8_t a, int16x8_t b) {
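
The vsubhn change here is the added `nuw` on the truncation: vsubhn subtracts, shifts
each element right by half its width, and keeps the low half of the shifted value.
Since the lshr already cleared the top 8 bits of each lane, the trunc provably drops
only zero bits, which is exactly what `nuw` asserts. A one-lane scalar sketch
(hypothetical helper):

  #include <stdint.h>

  /* One lane of vsubhn_s16: wrapping 16-bit subtract, then the high byte.
     After the shift the value fits in 8 bits, so the narrowing loses
     nothing -- the IR's `trunc nuw`. */
  static uint8_t subhn_s16_lane(int16_t a, int16_t b) {
    uint16_t d = (uint16_t)(a - b);  /* wraps like the vector sub */
    return (uint8_t)(d >> 8);        /* bit pattern of the result lane */
  }
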
@@ -24475,11 +21612,9 @@ int8x8_t test_vsubhn_s16(int16x8_t a, int16x8_t b) {
// CHECK-LABEL: define <4 x i16> @test_vsubhn_s32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
// CHECK-NEXT: [[VSUBHN_I:%.*]] = sub <4 x i32> [[A]], [[B]]
// CHECK-NEXT: [[VSUBHN1_I:%.*]] = lshr <4 x i32> [[VSUBHN_I]], splat (i32 16)
-// CHECK-NEXT: [[VSUBHN2_I:%.*]] = trunc <4 x i32> [[VSUBHN1_I]] to <4 x i16>
+// CHECK-NEXT: [[VSUBHN2_I:%.*]] = trunc nuw <4 x i32> [[VSUBHN1_I]] to <4 x i16>
// CHECK-NEXT: ret <4 x i16> [[VSUBHN2_I]]
//
int16x4_t test_vsubhn_s32(int32x4_t a, int32x4_t b) {
@@ -24489,11 +21624,9 @@ int16x4_t test_vsubhn_s32(int32x4_t a, int32x4_t b) {
// CHECK-LABEL: define <2 x i32> @test_vsubhn_s64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
// CHECK-NEXT: [[VSUBHN_I:%.*]] = sub <2 x i64> [[A]], [[B]]
// CHECK-NEXT: [[VSUBHN1_I:%.*]] = lshr <2 x i64> [[VSUBHN_I]], splat (i64 32)
-// CHECK-NEXT: [[VSUBHN2_I:%.*]] = trunc <2 x i64> [[VSUBHN1_I]] to <2 x i32>
+// CHECK-NEXT: [[VSUBHN2_I:%.*]] = trunc nuw <2 x i64> [[VSUBHN1_I]] to <2 x i32>
// CHECK-NEXT: ret <2 x i32> [[VSUBHN2_I]]
//
int32x2_t test_vsubhn_s64(int64x2_t a, int64x2_t b) {
@@ -24503,11 +21636,9 @@ int32x2_t test_vsubhn_s64(int64x2_t a, int64x2_t b) {
// CHECK-LABEL: define <8 x i8> @test_vsubhn_u16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
// CHECK-NEXT: [[VSUBHN_I:%.*]] = sub <8 x i16> [[A]], [[B]]
// CHECK-NEXT: [[VSUBHN1_I:%.*]] = lshr <8 x i16> [[VSUBHN_I]], splat (i16 8)
-// CHECK-NEXT: [[VSUBHN2_I:%.*]] = trunc <8 x i16> [[VSUBHN1_I]] to <8 x i8>
+// CHECK-NEXT: [[VSUBHN2_I:%.*]] = trunc nuw <8 x i16> [[VSUBHN1_I]] to <8 x i8>
// CHECK-NEXT: ret <8 x i8> [[VSUBHN2_I]]
//
uint8x8_t test_vsubhn_u16(uint16x8_t a, uint16x8_t b) {
@@ -24517,11 +21648,9 @@ uint8x8_t test_vsubhn_u16(uint16x8_t a, uint16x8_t b) {
// CHECK-LABEL: define <4 x i16> @test_vsubhn_u32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
// CHECK-NEXT: [[VSUBHN_I:%.*]] = sub <4 x i32> [[A]], [[B]]
// CHECK-NEXT: [[VSUBHN1_I:%.*]] = lshr <4 x i32> [[VSUBHN_I]], splat (i32 16)
-// CHECK-NEXT: [[VSUBHN2_I:%.*]] = trunc <4 x i32> [[VSUBHN1_I]] to <4 x i16>
+// CHECK-NEXT: [[VSUBHN2_I:%.*]] = trunc nuw <4 x i32> [[VSUBHN1_I]] to <4 x i16>
// CHECK-NEXT: ret <4 x i16> [[VSUBHN2_I]]
//
uint16x4_t test_vsubhn_u32(uint32x4_t a, uint32x4_t b) {
@@ -24531,11 +21660,9 @@ uint16x4_t test_vsubhn_u32(uint32x4_t a, uint32x4_t b) {
// CHECK-LABEL: define <2 x i32> @test_vsubhn_u64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
// CHECK-NEXT: [[VSUBHN_I:%.*]] = sub <2 x i64> [[A]], [[B]]
// CHECK-NEXT: [[VSUBHN1_I:%.*]] = lshr <2 x i64> [[VSUBHN_I]], splat (i64 32)
-// CHECK-NEXT: [[VSUBHN2_I:%.*]] = trunc <2 x i64> [[VSUBHN1_I]] to <2 x i32>
+// CHECK-NEXT: [[VSUBHN2_I:%.*]] = trunc nuw <2 x i64> [[VSUBHN1_I]] to <2 x i32>
// CHECK-NEXT: ret <2 x i32> [[VSUBHN2_I]]
//
uint32x2_t test_vsubhn_u64(uint64x2_t a, uint64x2_t b) {
@@ -24547,7 +21674,7 @@ uint32x2_t test_vsubhn_u64(uint64x2_t a, uint64x2_t b) {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[VMOVL_I4_I:%.*]] = sext <8 x i8> [[A]] to <8 x i16>
// CHECK-NEXT: [[VMOVL_I_I:%.*]] = sext <8 x i8> [[B]] to <8 x i16>
-// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> [[VMOVL_I4_I]], [[VMOVL_I_I]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub nsw <8 x i16> [[VMOVL_I4_I]], [[VMOVL_I_I]]
// CHECK-NEXT: ret <8 x i16> [[SUB_I]]
//
int16x8_t test_vsubl_s8(int8x8_t a, int8x8_t b) {
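
Similarly, the widening subtracts gain `nsw`: both operands of vsubl_s8 are
sign-extended from 8 to 16 bits, so each lies in [-128, 127] and the difference lies
in [-255, 255], which cannot overflow a signed 16-bit lane. A one-lane sketch
(hypothetical helper):

  #include <stdint.h>

  /* One lane of vsubl_s8: widen, then subtract. The result range
     [-255, 255] fits int16_t, so signed overflow is impossible (nsw). */
  static int16_t subl_s8_lane(int8_t a, int8_t b) {
    return (int16_t)((int16_t)a - (int16_t)b);
  }
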
@@ -24557,11 +21684,9 @@ int16x8_t test_vsubl_s8(int8x8_t a, int8x8_t b) {
// CHECK-LABEL: define <4 x i32> @test_vsubl_s16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
// CHECK-NEXT: [[VMOVL_I4_I:%.*]] = sext <4 x i16> [[A]] to <4 x i32>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
// CHECK-NEXT: [[VMOVL_I_I:%.*]] = sext <4 x i16> [[B]] to <4 x i32>
-// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[VMOVL_I4_I]], [[VMOVL_I_I]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub nsw <4 x i32> [[VMOVL_I4_I]], [[VMOVL_I_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
//
int32x4_t test_vsubl_s16(int16x4_t a, int16x4_t b) {
@@ -24571,11 +21696,9 @@ int32x4_t test_vsubl_s16(int16x4_t a, int16x4_t b) {
// CHECK-LABEL: define <2 x i64> @test_vsubl_s32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
// CHECK-NEXT: [[VMOVL_I4_I:%.*]] = sext <2 x i32> [[A]] to <2 x i64>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
// CHECK-NEXT: [[VMOVL_I_I:%.*]] = sext <2 x i32> [[B]] to <2 x i64>
-// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[VMOVL_I4_I]], [[VMOVL_I_I]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub nsw <2 x i64> [[VMOVL_I4_I]], [[VMOVL_I_I]]
// CHECK-NEXT: ret <2 x i64> [[SUB_I]]
//
int64x2_t test_vsubl_s32(int32x2_t a, int32x2_t b) {
@@ -24587,7 +21710,7 @@ int64x2_t test_vsubl_s32(int32x2_t a, int32x2_t b) {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[VMOVL_I4_I:%.*]] = zext <8 x i8> [[A]] to <8 x i16>
// CHECK-NEXT: [[VMOVL_I_I:%.*]] = zext <8 x i8> [[B]] to <8 x i16>
-// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> [[VMOVL_I4_I]], [[VMOVL_I_I]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub nsw <8 x i16> [[VMOVL_I4_I]], [[VMOVL_I_I]]
// CHECK-NEXT: ret <8 x i16> [[SUB_I]]
//
uint16x8_t test_vsubl_u8(uint8x8_t a, uint8x8_t b) {
@@ -24597,11 +21720,9 @@ uint16x8_t test_vsubl_u8(uint8x8_t a, uint8x8_t b) {
// CHECK-LABEL: define <4 x i32> @test_vsubl_u16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
// CHECK-NEXT: [[VMOVL_I4_I:%.*]] = zext <4 x i16> [[A]] to <4 x i32>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
// CHECK-NEXT: [[VMOVL_I_I:%.*]] = zext <4 x i16> [[B]] to <4 x i32>
-// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[VMOVL_I4_I]], [[VMOVL_I_I]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub nsw <4 x i32> [[VMOVL_I4_I]], [[VMOVL_I_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
//
uint32x4_t test_vsubl_u16(uint16x4_t a, uint16x4_t b) {
@@ -24611,11 +21732,9 @@ uint32x4_t test_vsubl_u16(uint16x4_t a, uint16x4_t b) {
// CHECK-LABEL: define <2 x i64> @test_vsubl_u32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
// CHECK-NEXT: [[VMOVL_I4_I:%.*]] = zext <2 x i32> [[A]] to <2 x i64>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
// CHECK-NEXT: [[VMOVL_I_I:%.*]] = zext <2 x i32> [[B]] to <2 x i64>
-// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[VMOVL_I4_I]], [[VMOVL_I_I]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub nsw <2 x i64> [[VMOVL_I4_I]], [[VMOVL_I_I]]
// CHECK-NEXT: ret <2 x i64> [[SUB_I]]
//
uint64x2_t test_vsubl_u32(uint32x2_t a, uint32x2_t b) {
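
The unsigned variants get `nsw` too, which is less obvious but still sound:
zero-extended 32-bit values lie in [0, 2^32 - 1], so their difference lies strictly
inside the signed 64-bit range. The wrap-around for b > a comes from the modular
`sub`, not from signed overflow. A one-lane sketch (hypothetical helper):

  #include <stdint.h>

  /* One lane of vsubl_u32: the subtraction wraps modulo 2^64, matching
     the IR `sub`; interpreted as signed i64 it never overflows (nsw). */
  static uint64_t subl_u32_lane(uint32_t a, uint32_t b) {
    return (uint64_t)a - (uint64_t)b;
  }
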
@@ -24636,7 +21755,6 @@ int16x8_t test_vsubw_s8(int16x8_t a, int8x8_t b) {
// CHECK-LABEL: define <4 x i32> @test_vsubw_s16(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
// CHECK-NEXT: [[VMOVL_I_I:%.*]] = sext <4 x i16> [[B]] to <4 x i32>
// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A]], [[VMOVL_I_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
@@ -24648,7 +21766,6 @@ int32x4_t test_vsubw_s16(int32x4_t a, int16x4_t b) {
// CHECK-LABEL: define <2 x i64> @test_vsubw_s32(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
// CHECK-NEXT: [[VMOVL_I_I:%.*]] = sext <2 x i32> [[B]] to <2 x i64>
// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[A]], [[VMOVL_I_I]]
// CHECK-NEXT: ret <2 x i64> [[SUB_I]]
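
By contrast, vsubw keeps a plain `sub`: only one operand is widened, so the wide
operand can take any value and the difference genuinely may overflow; all that
disappears here is the dead bitcast of the narrow operand. A one-lane sketch
(hypothetical helper, written with unsigned arithmetic to model wrapping):

  #include <stdint.h>

  /* One lane of vsubw_s32: `a` is already 64-bit and unconstrained, so
     no overflow guarantee exists and the IR sub carries no nsw flag. */
  static int64_t subw_s32_lane(int64_t a, int32_t b) {
    return (int64_t)((uint64_t)a - (uint64_t)(int64_t)b);
  }
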
@@ -24671,7 +21788,6 @@ uint16x8_t test_vsubw_u8(uint16x8_t a, uint8x8_t b) {
// CHECK-LABEL: define <4 x i32> @test_vsubw_u16(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
// CHECK-NEXT: [[VMOVL_I_I:%.*]] = zext <4 x i16> [[B]] to <4 x i32>
// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A]], [[VMOVL_I_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
@@ -24683,7 +21799,6 @@ uint32x4_t test_vsubw_u16(uint32x4_t a, uint16x4_t b) {
// CHECK-LABEL: define <2 x i64> @test_vsubw_u32(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
// CHECK-NEXT: [[VMOVL_I_I:%.*]] = zext <2 x i32> [[B]] to <2 x i64>
// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[A]], [[VMOVL_I_I]]
// CHECK-NEXT: ret <2 x i64> [[SUB_I]]
@@ -24725,17 +21840,11 @@ poly8x8_t test_vtbl1_p8(poly8x8_t a, uint8x8_t b) {
// CHECK-LABEL: define <8 x i8> @test_vtbl2_u8(
// CHECK-SAME: [2 x i64] [[A_COERCE:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__P0_I:%.*]] = alloca [[STRUCT_UINT8X8X2_T:%.*]], align 8
-// CHECK-NEXT: [[A:%.*]] = alloca [[STRUCT_UINT8X8X2_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X2_T]], ptr [[A]], i32 0, i32 0
-// CHECK-NEXT: store [2 x i64] [[A_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X2_T]], ptr [[A]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load [2 x i64], ptr [[COERCE_DIVE1]], align 8
-// CHECK-NEXT: store [2 x i64] [[TMP0]], ptr [[__P0_I]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[__P0_I]], align 8
-// CHECK-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
-// CHECK-NEXT: [[VTBL2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[B]])
+// CHECK-NEXT: [[DOTFCA_1_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[A_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[DOTFCA_1_INSERT_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[DOTFCA_1_INSERT_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[A_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[DOTFCA_1_INSERT_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[VTBL2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[B]])
// CHECK-NEXT: ret <8 x i8> [[VTBL2_I]]
//
uint8x8_t test_vtbl2_u8(uint8x8x2_t a, uint8x8_t b) {
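
The vtbl tests show the same alloca/memcpy elimination: the [2 x i64]-coerced table
argument is now unpacked with extractvalue and bitcast directly into the two
<8 x i8> table registers. The lookup itself treats the pair as one 16-byte table,
with out-of-range indices yielding zero. A scalar sketch (hypothetical helper):

  #include <stdint.h>

  /* Scalar model of vtbl2_u8: a 16-byte table built from two d-registers;
     indices >= 16 read as zero, per the VTBL definition. */
  static void tbl2_u8_model(const uint8_t tab[16], const uint8_t idx[8],
                            uint8_t out[8]) {
    for (int i = 0; i < 8; ++i)
      out[i] = (idx[i] < 16) ? tab[idx[i]] : 0;
  }
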
@@ -24745,17 +21854,11 @@ uint8x8_t test_vtbl2_u8(uint8x8x2_t a, uint8x8_t b) {
// CHECK-LABEL: define <8 x i8> @test_vtbl2_s8(
// CHECK-SAME: [2 x i64] [[A_COERCE:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__P0_I:%.*]] = alloca [[STRUCT_INT8X8X2_T:%.*]], align 8
-// CHECK-NEXT: [[A:%.*]] = alloca [[STRUCT_INT8X8X2_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X2_T]], ptr [[A]], i32 0, i32 0
-// CHECK-NEXT: store [2 x i64] [[A_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X2_T]], ptr [[A]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load [2 x i64], ptr [[COERCE_DIVE1]], align 8
-// CHECK-NEXT: store [2 x i64] [[TMP0]], ptr [[__P0_I]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[__P0_I]], align 8
-// CHECK-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
-// CHECK-NEXT: [[VTBL2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[B]])
+// CHECK-NEXT: [[DOTFCA_1_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[A_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[DOTFCA_1_INSERT_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[DOTFCA_1_INSERT_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[A_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[DOTFCA_1_INSERT_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[VTBL2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[B]])
// CHECK-NEXT: ret <8 x i8> [[VTBL2_I]]
//
int8x8_t test_vtbl2_s8(int8x8x2_t a, int8x8_t b) {
@@ -24765,17 +21868,11 @@ int8x8_t test_vtbl2_s8(int8x8x2_t a, int8x8_t b) {
// CHECK-LABEL: define <8 x i8> @test_vtbl2_p8(
// CHECK-SAME: [2 x i64] [[A_COERCE:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__P0_I:%.*]] = alloca [[STRUCT_POLY8X8X2_T:%.*]], align 8
-// CHECK-NEXT: [[A:%.*]] = alloca [[STRUCT_POLY8X8X2_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X2_T]], ptr [[A]], i32 0, i32 0
-// CHECK-NEXT: store [2 x i64] [[A_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X2_T]], ptr [[A]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load [2 x i64], ptr [[COERCE_DIVE1]], align 8
-// CHECK-NEXT: store [2 x i64] [[TMP0]], ptr [[__P0_I]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[__P0_I]], align 8
-// CHECK-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
-// CHECK-NEXT: [[VTBL2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[B]])
+// CHECK-NEXT: [[DOTFCA_1_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[A_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[DOTFCA_1_INSERT_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[DOTFCA_1_INSERT_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[A_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[DOTFCA_1_INSERT_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[VTBL2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[B]])
// CHECK-NEXT: ret <8 x i8> [[VTBL2_I]]
//
poly8x8_t test_vtbl2_p8(poly8x8x2_t a, uint8x8_t b) {
@@ -24785,19 +21882,13 @@ poly8x8_t test_vtbl2_p8(poly8x8x2_t a, uint8x8_t b) {
// CHECK-LABEL: define <8 x i8> @test_vtbl3_u8(
// CHECK-SAME: [3 x i64] [[A_COERCE:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__P0_I:%.*]] = alloca [[STRUCT_UINT8X8X3_T:%.*]], align 8
-// CHECK-NEXT: [[A:%.*]] = alloca [[STRUCT_UINT8X8X3_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X3_T]], ptr [[A]], i32 0, i32 0
-// CHECK-NEXT: store [3 x i64] [[A_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X3_T]], ptr [[A]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load [3 x i64], ptr [[COERCE_DIVE1]], align 8
-// CHECK-NEXT: store [3 x i64] [[TMP0]], ptr [[__P0_I]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[__P0_I]], align 8
-// CHECK-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
-// CHECK-NEXT: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 2
-// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4_I]], align 8
-// CHECK-NEXT: [[VTBL3_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl3(<8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[B]])
+// CHECK-NEXT: [[DOTFCA_2_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[A_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[DOTFCA_2_INSERT_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[DOTFCA_2_INSERT_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[A_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[DOTFCA_2_INSERT_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[DOTFCA_2_INSERT_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[A_COERCE]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[DOTFCA_2_INSERT_FCA_2_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[VTBL3_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl3(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[B]])
// CHECK-NEXT: ret <8 x i8> [[VTBL3_I]]
//
uint8x8_t test_vtbl3_u8(uint8x8x3_t a, uint8x8_t b) {
@@ -24807,19 +21898,13 @@ uint8x8_t test_vtbl3_u8(uint8x8x3_t a, uint8x8_t b) {
// CHECK-LABEL: define <8 x i8> @test_vtbl3_s8(
// CHECK-SAME: [3 x i64] [[A_COERCE:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__P0_I:%.*]] = alloca [[STRUCT_INT8X8X3_T:%.*]], align 8
-// CHECK-NEXT: [[A:%.*]] = alloca [[STRUCT_INT8X8X3_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X3_T]], ptr [[A]], i32 0, i32 0
-// CHECK-NEXT: store [3 x i64] [[A_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X3_T]], ptr [[A]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load [3 x i64], ptr [[COERCE_DIVE1]], align 8
-// CHECK-NEXT: store [3 x i64] [[TMP0]], ptr [[__P0_I]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[__P0_I]], align 8
-// CHECK-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
-// CHECK-NEXT: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 2
-// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4_I]], align 8
-// CHECK-NEXT: [[VTBL3_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl3(<8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[B]])
+// CHECK-NEXT: [[DOTFCA_2_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[A_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[DOTFCA_2_INSERT_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[DOTFCA_2_INSERT_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[A_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[DOTFCA_2_INSERT_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[DOTFCA_2_INSERT_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[A_COERCE]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[DOTFCA_2_INSERT_FCA_2_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[VTBL3_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl3(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[B]])
// CHECK-NEXT: ret <8 x i8> [[VTBL3_I]]
//
int8x8_t test_vtbl3_s8(int8x8x3_t a, int8x8_t b) {
@@ -24829,19 +21914,13 @@ int8x8_t test_vtbl3_s8(int8x8x3_t a, int8x8_t b) {
// CHECK-LABEL: define <8 x i8> @test_vtbl3_p8(
// CHECK-SAME: [3 x i64] [[A_COERCE:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__P0_I:%.*]] = alloca [[STRUCT_POLY8X8X3_T:%.*]], align 8
-// CHECK-NEXT: [[A:%.*]] = alloca [[STRUCT_POLY8X8X3_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X3_T]], ptr [[A]], i32 0, i32 0
-// CHECK-NEXT: store [3 x i64] [[A_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X3_T]], ptr [[A]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load [3 x i64], ptr [[COERCE_DIVE1]], align 8
-// CHECK-NEXT: store [3 x i64] [[TMP0]], ptr [[__P0_I]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[__P0_I]], align 8
-// CHECK-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
-// CHECK-NEXT: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 2
-// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4_I]], align 8
-// CHECK-NEXT: [[VTBL3_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl3(<8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[B]])
+// CHECK-NEXT: [[DOTFCA_2_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[A_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[DOTFCA_2_INSERT_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[DOTFCA_2_INSERT_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[A_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[DOTFCA_2_INSERT_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[DOTFCA_2_INSERT_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[A_COERCE]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[DOTFCA_2_INSERT_FCA_2_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[VTBL3_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl3(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[B]])
// CHECK-NEXT: ret <8 x i8> [[VTBL3_I]]
//
poly8x8_t test_vtbl3_p8(poly8x8x3_t a, uint8x8_t b) {
@@ -24851,21 +21930,15 @@ poly8x8_t test_vtbl3_p8(poly8x8x3_t a, uint8x8_t b) {
// CHECK-LABEL: define <8 x i8> @test_vtbl4_u8(
// CHECK-SAME: [4 x i64] [[A_COERCE:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__P0_I:%.*]] = alloca [[STRUCT_UINT8X8X4_T:%.*]], align 8
-// CHECK-NEXT: [[A:%.*]] = alloca [[STRUCT_UINT8X8X4_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X4_T]], ptr [[A]], i32 0, i32 0
-// CHECK-NEXT: store [4 x i64] [[A_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X4_T]], ptr [[A]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load [4 x i64], ptr [[COERCE_DIVE1]], align 8
-// CHECK-NEXT: store [4 x i64] [[TMP0]], ptr [[__P0_I]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[__P0_I]], align 8
-// CHECK-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
-// CHECK-NEXT: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 2
-// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4_I]], align 8
-// CHECK-NEXT: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 3
-// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6_I]], align 8
-// CHECK-NEXT: [[VTBL4_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[B]])
+// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[A_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[DOTFCA_3_INSERT_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[A_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[DOTFCA_3_INSERT_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[A_COERCE]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[DOTFCA_3_INSERT_FCA_2_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[A_COERCE]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[DOTFCA_3_INSERT_FCA_3_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[VTBL4_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[B]])
// CHECK-NEXT: ret <8 x i8> [[VTBL4_I]]
//
uint8x8_t test_vtbl4_u8(uint8x8x4_t a, uint8x8_t b) {
@@ -24875,21 +21948,15 @@ uint8x8_t test_vtbl4_u8(uint8x8x4_t a, uint8x8_t b) {
// CHECK-LABEL: define <8 x i8> @test_vtbl4_s8(
// CHECK-SAME: [4 x i64] [[A_COERCE:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__P0_I:%.*]] = alloca [[STRUCT_INT8X8X4_T:%.*]], align 8
-// CHECK-NEXT: [[A:%.*]] = alloca [[STRUCT_INT8X8X4_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X4_T]], ptr [[A]], i32 0, i32 0
-// CHECK-NEXT: store [4 x i64] [[A_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X4_T]], ptr [[A]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load [4 x i64], ptr [[COERCE_DIVE1]], align 8
-// CHECK-NEXT: store [4 x i64] [[TMP0]], ptr [[__P0_I]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[__P0_I]], align 8
-// CHECK-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
-// CHECK-NEXT: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 2
-// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4_I]], align 8
-// CHECK-NEXT: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 3
-// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6_I]], align 8
-// CHECK-NEXT: [[VTBL4_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[B]])
+// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[A_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[DOTFCA_3_INSERT_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[A_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[DOTFCA_3_INSERT_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[A_COERCE]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[DOTFCA_3_INSERT_FCA_2_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[A_COERCE]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[DOTFCA_3_INSERT_FCA_3_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[VTBL4_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[B]])
// CHECK-NEXT: ret <8 x i8> [[VTBL4_I]]
//
int8x8_t test_vtbl4_s8(int8x8x4_t a, int8x8_t b) {
@@ -24899,21 +21966,15 @@ int8x8_t test_vtbl4_s8(int8x8x4_t a, int8x8_t b) {
// CHECK-LABEL: define <8 x i8> @test_vtbl4_p8(
// CHECK-SAME: [4 x i64] [[A_COERCE:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__P0_I:%.*]] = alloca [[STRUCT_POLY8X8X4_T:%.*]], align 8
-// CHECK-NEXT: [[A:%.*]] = alloca [[STRUCT_POLY8X8X4_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X4_T]], ptr [[A]], i32 0, i32 0
-// CHECK-NEXT: store [4 x i64] [[A_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X4_T]], ptr [[A]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load [4 x i64], ptr [[COERCE_DIVE1]], align 8
-// CHECK-NEXT: store [4 x i64] [[TMP0]], ptr [[__P0_I]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[__P0_I]], align 8
-// CHECK-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
-// CHECK-NEXT: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 2
-// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4_I]], align 8
-// CHECK-NEXT: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 3
-// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6_I]], align 8
-// CHECK-NEXT: [[VTBL4_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[B]])
+// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[A_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[DOTFCA_3_INSERT_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[A_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[DOTFCA_3_INSERT_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[A_COERCE]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[DOTFCA_3_INSERT_FCA_2_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[A_COERCE]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[DOTFCA_3_INSERT_FCA_3_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[VTBL4_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[B]])
// CHECK-NEXT: ret <8 x i8> [[VTBL4_I]]
//
poly8x8_t test_vtbl4_p8(poly8x8x4_t a, uint8x8_t b) {
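For context on what these checks exercise: vtbl performs a byte-wise table lookup across the concatenated table registers, and vtbx is the same lookup with a fallback operand. A minimal usage sketch (the helper names `lookup3` and `lookup3_or` are placeholders, not part of the patch):

#include <arm_neon.h>

// vtbl3: each index byte selects one of the 24 table bytes in t.val[0..2];
// an out-of-range index produces 0 in that lane.
int8x8_t lookup3(int8x8x3_t t, int8x8_t idx) {
  return vtbl3_s8(t, idx);
}

// vtbx3: same lookup, but an out-of-range index keeps the corresponding
// byte of the first operand instead of zeroing the lane.
int8x8_t lookup3_or(int8x8_t dflt, int8x8x3_t t, int8x8_t idx) {
  return vtbx3_s8(dflt, t, idx);
}
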
@@ -24953,17 +22014,11 @@ poly8x8_t test_vtbx1_p8(poly8x8_t a, poly8x8_t b, uint8x8_t c) {
// CHECK-LABEL: define <8 x i8> @test_vtbx2_u8(
// CHECK-SAME: <8 x i8> noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__P1_I:%.*]] = alloca [[STRUCT_UINT8X8X2_T:%.*]], align 8
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT8X8X2_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X2_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [2 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X2_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load [2 x i64], ptr [[COERCE_DIVE1]], align 8
-// CHECK-NEXT: store [2 x i64] [[TMP0]], ptr [[__P1_I]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[__P1_I]], align 8
-// CHECK-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
-// CHECK-NEXT: [[VTBX2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx2(<8 x i8> [[A]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[C]])
+// CHECK-NEXT: [[DOTFCA_1_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[DOTFCA_1_INSERT_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[DOTFCA_1_INSERT_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[DOTFCA_1_INSERT_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[VTBX2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx2(<8 x i8> [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[C]])
// CHECK-NEXT: ret <8 x i8> [[VTBX2_I]]
//
uint8x8_t test_vtbx2_u8(uint8x8_t a, uint8x8x2_t b, uint8x8_t c) {
@@ -24973,17 +22028,11 @@ uint8x8_t test_vtbx2_u8(uint8x8_t a, uint8x8x2_t b, uint8x8_t c) {
// CHECK-LABEL: define <8 x i8> @test_vtbx2_s8(
// CHECK-SAME: <8 x i8> noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__P1_I:%.*]] = alloca [[STRUCT_INT8X8X2_T:%.*]], align 8
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT8X8X2_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X2_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [2 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X2_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load [2 x i64], ptr [[COERCE_DIVE1]], align 8
-// CHECK-NEXT: store [2 x i64] [[TMP0]], ptr [[__P1_I]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[__P1_I]], align 8
-// CHECK-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
-// CHECK-NEXT: [[VTBX2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx2(<8 x i8> [[A]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[C]])
+// CHECK-NEXT: [[DOTFCA_1_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[DOTFCA_1_INSERT_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[DOTFCA_1_INSERT_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[DOTFCA_1_INSERT_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[VTBX2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx2(<8 x i8> [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[C]])
// CHECK-NEXT: ret <8 x i8> [[VTBX2_I]]
//
int8x8_t test_vtbx2_s8(int8x8_t a, int8x8x2_t b, int8x8_t c) {
@@ -24993,17 +22042,11 @@ int8x8_t test_vtbx2_s8(int8x8_t a, int8x8x2_t b, int8x8_t c) {
// CHECK-LABEL: define <8 x i8> @test_vtbx2_p8(
// CHECK-SAME: <8 x i8> noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__P1_I:%.*]] = alloca [[STRUCT_POLY8X8X2_T:%.*]], align 8
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY8X8X2_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X2_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [2 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X2_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load [2 x i64], ptr [[COERCE_DIVE1]], align 8
-// CHECK-NEXT: store [2 x i64] [[TMP0]], ptr [[__P1_I]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[__P1_I]], align 8
-// CHECK-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
-// CHECK-NEXT: [[VTBX2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx2(<8 x i8> [[A]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[C]])
+// CHECK-NEXT: [[DOTFCA_1_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[DOTFCA_1_INSERT_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[DOTFCA_1_INSERT_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[DOTFCA_1_INSERT_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[VTBX2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx2(<8 x i8> [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[C]])
// CHECK-NEXT: ret <8 x i8> [[VTBX2_I]]
//
poly8x8_t test_vtbx2_p8(poly8x8_t a, poly8x8x2_t b, uint8x8_t c) {
@@ -25013,19 +22056,13 @@ poly8x8_t test_vtbx2_p8(poly8x8_t a, poly8x8x2_t b, uint8x8_t c) {
// CHECK-LABEL: define <8 x i8> @test_vtbx3_u8(
// CHECK-SAME: <8 x i8> noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__P1_I:%.*]] = alloca [[STRUCT_UINT8X8X3_T:%.*]], align 8
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT8X8X3_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X3_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [3 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X3_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load [3 x i64], ptr [[COERCE_DIVE1]], align 8
-// CHECK-NEXT: store [3 x i64] [[TMP0]], ptr [[__P1_I]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[__P1_I]], align 8
-// CHECK-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
-// CHECK-NEXT: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 2
-// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4_I]], align 8
-// CHECK-NEXT: [[VTBX3_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx3(<8 x i8> [[A]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[C]])
+// CHECK-NEXT: [[DOTFCA_2_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[DOTFCA_2_INSERT_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[DOTFCA_2_INSERT_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[DOTFCA_2_INSERT_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[DOTFCA_2_INSERT_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[DOTFCA_2_INSERT_FCA_2_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[VTBX3_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx3(<8 x i8> [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[C]])
// CHECK-NEXT: ret <8 x i8> [[VTBX3_I]]
//
uint8x8_t test_vtbx3_u8(uint8x8_t a, uint8x8x3_t b, uint8x8_t c) {
@@ -25035,19 +22072,13 @@ uint8x8_t test_vtbx3_u8(uint8x8_t a, uint8x8x3_t b, uint8x8_t c) {
// CHECK-LABEL: define <8 x i8> @test_vtbx3_s8(
// CHECK-SAME: <8 x i8> noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__P1_I:%.*]] = alloca [[STRUCT_INT8X8X3_T:%.*]], align 8
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT8X8X3_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X3_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [3 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X3_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load [3 x i64], ptr [[COERCE_DIVE1]], align 8
-// CHECK-NEXT: store [3 x i64] [[TMP0]], ptr [[__P1_I]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[__P1_I]], align 8
-// CHECK-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
-// CHECK-NEXT: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 2
-// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4_I]], align 8
-// CHECK-NEXT: [[VTBX3_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx3(<8 x i8> [[A]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[C]])
+// CHECK-NEXT: [[DOTFCA_2_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[DOTFCA_2_INSERT_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[DOTFCA_2_INSERT_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[DOTFCA_2_INSERT_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[DOTFCA_2_INSERT_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[DOTFCA_2_INSERT_FCA_2_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[VTBX3_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx3(<8 x i8> [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[C]])
// CHECK-NEXT: ret <8 x i8> [[VTBX3_I]]
//
int8x8_t test_vtbx3_s8(int8x8_t a, int8x8x3_t b, int8x8_t c) {
@@ -25057,19 +22088,13 @@ int8x8_t test_vtbx3_s8(int8x8_t a, int8x8x3_t b, int8x8_t c) {
// CHECK-LABEL: define <8 x i8> @test_vtbx3_p8(
// CHECK-SAME: <8 x i8> noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__P1_I:%.*]] = alloca [[STRUCT_POLY8X8X3_T:%.*]], align 8
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY8X8X3_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X3_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [3 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X3_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load [3 x i64], ptr [[COERCE_DIVE1]], align 8
-// CHECK-NEXT: store [3 x i64] [[TMP0]], ptr [[__P1_I]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[__P1_I]], align 8
-// CHECK-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
-// CHECK-NEXT: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 2
-// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4_I]], align 8
-// CHECK-NEXT: [[VTBX3_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx3(<8 x i8> [[A]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[C]])
+// CHECK-NEXT: [[DOTFCA_2_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[DOTFCA_2_INSERT_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[DOTFCA_2_INSERT_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[DOTFCA_2_INSERT_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[DOTFCA_2_INSERT_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[DOTFCA_2_INSERT_FCA_2_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[VTBX3_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx3(<8 x i8> [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[C]])
// CHECK-NEXT: ret <8 x i8> [[VTBX3_I]]
//
poly8x8_t test_vtbx3_p8(poly8x8_t a, poly8x8x3_t b, uint8x8_t c) {
@@ -25079,21 +22104,15 @@ poly8x8_t test_vtbx3_p8(poly8x8_t a, poly8x8x3_t b, uint8x8_t c) {
// CHECK-LABEL: define <8 x i8> @test_vtbx4_u8(
// CHECK-SAME: <8 x i8> noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__P1_I:%.*]] = alloca [[STRUCT_UINT8X8X4_T:%.*]], align 8
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT8X8X4_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X4_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_UINT8X8X4_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load [4 x i64], ptr [[COERCE_DIVE1]], align 8
-// CHECK-NEXT: store [4 x i64] [[TMP0]], ptr [[__P1_I]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[__P1_I]], align 8
-// CHECK-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
-// CHECK-NEXT: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 2
-// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4_I]], align 8
-// CHECK-NEXT: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 3
-// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6_I]], align 8
-// CHECK-NEXT: [[VTBX4_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx4(<8 x i8> [[A]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[C]])
+// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[DOTFCA_3_INSERT_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[DOTFCA_3_INSERT_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[DOTFCA_3_INSERT_FCA_2_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[DOTFCA_3_INSERT_FCA_3_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[VTBX4_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx4(<8 x i8> [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[C]])
// CHECK-NEXT: ret <8 x i8> [[VTBX4_I]]
//
uint8x8_t test_vtbx4_u8(uint8x8_t a, uint8x8x4_t b, uint8x8_t c) {
@@ -25103,21 +22122,15 @@ uint8x8_t test_vtbx4_u8(uint8x8_t a, uint8x8x4_t b, uint8x8_t c) {
// CHECK-LABEL: define <8 x i8> @test_vtbx4_s8(
// CHECK-SAME: <8 x i8> noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__P1_I:%.*]] = alloca [[STRUCT_INT8X8X4_T:%.*]], align 8
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT8X8X4_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X4_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_INT8X8X4_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load [4 x i64], ptr [[COERCE_DIVE1]], align 8
-// CHECK-NEXT: store [4 x i64] [[TMP0]], ptr [[__P1_I]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[__P1_I]], align 8
-// CHECK-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
-// CHECK-NEXT: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 2
-// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4_I]], align 8
-// CHECK-NEXT: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 3
-// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6_I]], align 8
-// CHECK-NEXT: [[VTBX4_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx4(<8 x i8> [[A]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[C]])
+// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[DOTFCA_3_INSERT_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[DOTFCA_3_INSERT_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[DOTFCA_3_INSERT_FCA_2_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[DOTFCA_3_INSERT_FCA_3_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[VTBX4_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx4(<8 x i8> [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[C]])
// CHECK-NEXT: ret <8 x i8> [[VTBX4_I]]
//
int8x8_t test_vtbx4_s8(int8x8_t a, int8x8x4_t b, int8x8_t c) {
@@ -25127,21 +22140,15 @@ int8x8_t test_vtbx4_s8(int8x8_t a, int8x8x4_t b, int8x8_t c) {
// CHECK-LABEL: define <8 x i8> @test_vtbx4_p8(
// CHECK-SAME: <8 x i8> noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__P1_I:%.*]] = alloca [[STRUCT_POLY8X8X4_T:%.*]], align 8
-// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY8X8X4_T]], align 8
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X4_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: store [4 x i64] [[B_COERCE]], ptr [[COERCE_DIVE]], align 8
-// CHECK-NEXT: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_POLY8X8X4_T]], ptr [[B]], i32 0, i32 0
-// CHECK-NEXT: [[TMP0:%.*]] = load [4 x i64], ptr [[COERCE_DIVE1]], align 8
-// CHECK-NEXT: store [4 x i64] [[TMP0]], ptr [[__P1_I]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[__P1_I]], align 8
-// CHECK-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
-// CHECK-NEXT: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 2
-// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4_I]], align 8
-// CHECK-NEXT: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 3
-// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6_I]], align 8
-// CHECK-NEXT: [[VTBX4_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx4(<8 x i8> [[A]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[C]])
+// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[DOTFCA_3_INSERT_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[DOTFCA_3_INSERT_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[DOTFCA_3_INSERT_FCA_2_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[DOTFCA_3_INSERT_FCA_3_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[VTBX4_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx4(<8 x i8> [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[C]])
// CHECK-NEXT: ret <8 x i8> [[VTBX4_I]]
//
poly8x8_t test_vtbx4_p8(poly8x8_t a, poly8x8x4_t b, uint8x8_t c) {
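The vtrn tests below cover the transpose intrinsics, which lower to a pair of shufflevectors. For a = {a0,a1,a2,a3} and b = {b0,b1,b2,b3}, vtrn_s16 yields val[0] = {a0,b0,a2,b2} and val[1] = {a1,b1,a3,b3}, i.e. exactly the masks <0,4,2,6> and <1,5,3,7> that appear in the checks. A minimal sketch (the helper name `transpose4h` is a placeholder):

#include <arm_neon.h>

// Interleave the even lanes of a and b into val[0] and the odd lanes
// into val[1].
int16x4x2_t transpose4h(int16x4_t a, int16x4_t b) {
  return vtrn_s16(a, b);
}
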
@@ -25151,10 +22158,9 @@ poly8x8_t test_vtbx4_p8(poly8x8_t a, poly8x8x4_t b, uint8x8_t c) {
// CHECK-LABEL: define void @test_vtrn_s8(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT8X8X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META3:![0-9]+]])
// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-// CHECK-NEXT: store <8 x i8> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META3]]
-// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <8 x i8>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: store <8 x i8> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META3:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
// CHECK-NEXT: store <8 x i8> [[VTRN1_I]], ptr [[TMP0]], align 4, !alias.scope [[META3]]
// CHECK-NEXT: ret void
@@ -25166,14 +22172,11 @@ int8x8x2_t test_vtrn_s8(int8x8_t a, int8x8_t b) {
// CHECK-LABEL: define void @test_vtrn_s16(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT16X4X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META6:![0-9]+]])
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-// CHECK-NEXT: store <4 x i16> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META6]]
-// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <4 x i16>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: store <4 x i16> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META6:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-// CHECK-NEXT: store <4 x i16> [[VTRN1_I]], ptr [[TMP2]], align 4, !alias.scope [[META6]]
+// CHECK-NEXT: store <4 x i16> [[VTRN1_I]], ptr [[TMP0]], align 4, !alias.scope [[META6]]
// CHECK-NEXT: ret void
//
int16x4x2_t test_vtrn_s16(int16x4_t a, int16x4_t b) {
@@ -25183,14 +22186,11 @@ int16x4x2_t test_vtrn_s16(int16x4_t a, int16x4_t b) {
// CHECK-LABEL: define void @test_vtrn_s32(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT32X2X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META9:![0-9]+]])
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> <i32 0, i32 2>
-// CHECK-NEXT: store <2 x i32> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META9]]
-// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <2 x i32>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: store <2 x i32> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META9:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> <i32 1, i32 3>
-// CHECK-NEXT: store <2 x i32> [[VTRN1_I]], ptr [[TMP2]], align 4, !alias.scope [[META9]]
+// CHECK-NEXT: store <2 x i32> [[VTRN1_I]], ptr [[TMP0]], align 4, !alias.scope [[META9]]
// CHECK-NEXT: ret void
//
int32x2x2_t test_vtrn_s32(int32x2_t a, int32x2_t b) {
@@ -25200,10 +22200,9 @@ int32x2x2_t test_vtrn_s32(int32x2_t a, int32x2_t b) {
// CHECK-LABEL: define void @test_vtrn_u8(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT8X8X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META12:![0-9]+]])
// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-// CHECK-NEXT: store <8 x i8> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META12]]
-// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <8 x i8>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: store <8 x i8> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META12:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
// CHECK-NEXT: store <8 x i8> [[VTRN1_I]], ptr [[TMP0]], align 4, !alias.scope [[META12]]
// CHECK-NEXT: ret void
@@ -25215,14 +22214,11 @@ uint8x8x2_t test_vtrn_u8(uint8x8_t a, uint8x8_t b) {
// CHECK-LABEL: define void @test_vtrn_u16(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT16X4X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META15:![0-9]+]])
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-// CHECK-NEXT: store <4 x i16> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META15]]
-// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <4 x i16>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: store <4 x i16> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META15:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-// CHECK-NEXT: store <4 x i16> [[VTRN1_I]], ptr [[TMP2]], align 4, !alias.scope [[META15]]
+// CHECK-NEXT: store <4 x i16> [[VTRN1_I]], ptr [[TMP0]], align 4, !alias.scope [[META15]]
// CHECK-NEXT: ret void
//
uint16x4x2_t test_vtrn_u16(uint16x4_t a, uint16x4_t b) {
@@ -25232,14 +22228,11 @@ uint16x4x2_t test_vtrn_u16(uint16x4_t a, uint16x4_t b) {
// CHECK-LABEL: define void @test_vtrn_u32(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT32X2X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META18:![0-9]+]])
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> <i32 0, i32 2>
-// CHECK-NEXT: store <2 x i32> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META18]]
-// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <2 x i32>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: store <2 x i32> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META18:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> <i32 1, i32 3>
-// CHECK-NEXT: store <2 x i32> [[VTRN1_I]], ptr [[TMP2]], align 4, !alias.scope [[META18]]
+// CHECK-NEXT: store <2 x i32> [[VTRN1_I]], ptr [[TMP0]], align 4, !alias.scope [[META18]]
// CHECK-NEXT: ret void
//
uint32x2x2_t test_vtrn_u32(uint32x2_t a, uint32x2_t b) {
@@ -25249,14 +22242,11 @@ uint32x2x2_t test_vtrn_u32(uint32x2_t a, uint32x2_t b) {
// CHECK-LABEL: define void @test_vtrn_f32(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT32X2X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META21:![0-9]+]])
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <8 x i8>
// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[B]], <2 x i32> <i32 0, i32 2>
-// CHECK-NEXT: store <2 x float> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META21]]
-// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <2 x float>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: store <2 x float> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META21:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[B]], <2 x i32> <i32 1, i32 3>
-// CHECK-NEXT: store <2 x float> [[VTRN1_I]], ptr [[TMP2]], align 4, !alias.scope [[META21]]
+// CHECK-NEXT: store <2 x float> [[VTRN1_I]], ptr [[TMP0]], align 4, !alias.scope [[META21]]
// CHECK-NEXT: ret void
//
float32x2x2_t test_vtrn_f32(float32x2_t a, float32x2_t b) {
@@ -25266,10 +22256,9 @@ float32x2x2_t test_vtrn_f32(float32x2_t a, float32x2_t b) {
// CHECK-LABEL: define void @test_vtrn_p8(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY8X8X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META24:![0-9]+]])
// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-// CHECK-NEXT: store <8 x i8> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META24]]
-// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <8 x i8>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: store <8 x i8> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META24:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
// CHECK-NEXT: store <8 x i8> [[VTRN1_I]], ptr [[TMP0]], align 4, !alias.scope [[META24]]
// CHECK-NEXT: ret void
@@ -25281,14 +22270,11 @@ poly8x8x2_t test_vtrn_p8(poly8x8_t a, poly8x8_t b) {
// CHECK-LABEL: define void @test_vtrn_p16(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY16X4X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META27:![0-9]+]])
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-// CHECK-NEXT: store <4 x i16> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META27]]
-// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <4 x i16>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: store <4 x i16> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META27:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-// CHECK-NEXT: store <4 x i16> [[VTRN1_I]], ptr [[TMP2]], align 4, !alias.scope [[META27]]
+// CHECK-NEXT: store <4 x i16> [[VTRN1_I]], ptr [[TMP0]], align 4, !alias.scope [[META27]]
// CHECK-NEXT: ret void
//
poly16x4x2_t test_vtrn_p16(poly16x4_t a, poly16x4_t b) {
@@ -25298,10 +22284,9 @@ poly16x4x2_t test_vtrn_p16(poly16x4_t a, poly16x4_t b) {
// CHECK-LABEL: define void @test_vtrnq_s8(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT8X16X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META30:![0-9]+]])
// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
-// CHECK-NEXT: store <16 x i8> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META30]]
-// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <16 x i8>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: store <16 x i8> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META30:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
// CHECK-NEXT: store <16 x i8> [[VTRN1_I]], ptr [[TMP0]], align 4, !alias.scope [[META30]]
// CHECK-NEXT: ret void
@@ -25313,14 +22298,11 @@ int8x16x2_t test_vtrnq_s8(int8x16_t a, int8x16_t b) {
// CHECK-LABEL: define void @test_vtrnq_s16(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT16X8X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META33:![0-9]+]])
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-// CHECK-NEXT: store <8 x i16> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META33]]
-// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <8 x i16>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: store <8 x i16> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META33:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-// CHECK-NEXT: store <8 x i16> [[VTRN1_I]], ptr [[TMP2]], align 4, !alias.scope [[META33]]
+// CHECK-NEXT: store <8 x i16> [[VTRN1_I]], ptr [[TMP0]], align 4, !alias.scope [[META33]]
// CHECK-NEXT: ret void
//
int16x8x2_t test_vtrnq_s16(int16x8_t a, int16x8_t b) {
@@ -25330,14 +22312,11 @@ int16x8x2_t test_vtrnq_s16(int16x8_t a, int16x8_t b) {
// CHECK-LABEL: define void @test_vtrnq_s32(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT32X4X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META36:![0-9]+]])
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-// CHECK-NEXT: store <4 x i32> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META36]]
-// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <4 x i32>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: store <4 x i32> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META36:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-// CHECK-NEXT: store <4 x i32> [[VTRN1_I]], ptr [[TMP2]], align 4, !alias.scope [[META36]]
+// CHECK-NEXT: store <4 x i32> [[VTRN1_I]], ptr [[TMP0]], align 4, !alias.scope [[META36]]
// CHECK-NEXT: ret void
//
int32x4x2_t test_vtrnq_s32(int32x4_t a, int32x4_t b) {
@@ -25347,10 +22326,9 @@ int32x4x2_t test_vtrnq_s32(int32x4_t a, int32x4_t b) {
// CHECK-LABEL: define void @test_vtrnq_u8(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT8X16X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META39:![0-9]+]])
// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
-// CHECK-NEXT: store <16 x i8> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META39]]
-// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <16 x i8>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: store <16 x i8> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META39:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
// CHECK-NEXT: store <16 x i8> [[VTRN1_I]], ptr [[TMP0]], align 4, !alias.scope [[META39]]
// CHECK-NEXT: ret void
@@ -25362,14 +22340,11 @@ uint8x16x2_t test_vtrnq_u8(uint8x16_t a, uint8x16_t b) {
// CHECK-LABEL: define void @test_vtrnq_u16(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT16X8X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META42:![0-9]+]])
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-// CHECK-NEXT: store <8 x i16> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META42]]
-// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <8 x i16>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: store <8 x i16> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META42:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-// CHECK-NEXT: store <8 x i16> [[VTRN1_I]], ptr [[TMP2]], align 4, !alias.scope [[META42]]
+// CHECK-NEXT: store <8 x i16> [[VTRN1_I]], ptr [[TMP0]], align 4, !alias.scope [[META42]]
// CHECK-NEXT: ret void
//
uint16x8x2_t test_vtrnq_u16(uint16x8_t a, uint16x8_t b) {
@@ -25379,14 +22354,11 @@ uint16x8x2_t test_vtrnq_u16(uint16x8_t a, uint16x8_t b) {
// CHECK-LABEL: define void @test_vtrnq_u32(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT32X4X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META45:![0-9]+]])
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-// CHECK-NEXT: store <4 x i32> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META45]]
-// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <4 x i32>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: store <4 x i32> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META45:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-// CHECK-NEXT: store <4 x i32> [[VTRN1_I]], ptr [[TMP2]], align 4, !alias.scope [[META45]]
+// CHECK-NEXT: store <4 x i32> [[VTRN1_I]], ptr [[TMP0]], align 4, !alias.scope [[META45]]
// CHECK-NEXT: ret void
//
uint32x4x2_t test_vtrnq_u32(uint32x4_t a, uint32x4_t b) {
@@ -25396,14 +22368,11 @@ uint32x4x2_t test_vtrnq_u32(uint32x4_t a, uint32x4_t b) {
// CHECK-LABEL: define void @test_vtrnq_f32(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT32X4X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META48:![0-9]+]])
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B]] to <16 x i8>
// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-// CHECK-NEXT: store <4 x float> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META48]]
-// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <4 x float>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: store <4 x float> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META48:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-// CHECK-NEXT: store <4 x float> [[VTRN1_I]], ptr [[TMP2]], align 4, !alias.scope [[META48]]
+// CHECK-NEXT: store <4 x float> [[VTRN1_I]], ptr [[TMP0]], align 4, !alias.scope [[META48]]
// CHECK-NEXT: ret void
//
float32x4x2_t test_vtrnq_f32(float32x4_t a, float32x4_t b) {
@@ -25413,10 +22382,9 @@ float32x4x2_t test_vtrnq_f32(float32x4_t a, float32x4_t b) {
// CHECK-LABEL: define void @test_vtrnq_p8(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY8X16X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META51:![0-9]+]])
// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
-// CHECK-NEXT: store <16 x i8> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META51]]
-// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <16 x i8>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: store <16 x i8> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META51:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
// CHECK-NEXT: store <16 x i8> [[VTRN1_I]], ptr [[TMP0]], align 4, !alias.scope [[META51]]
// CHECK-NEXT: ret void
@@ -25428,14 +22396,11 @@ poly8x16x2_t test_vtrnq_p8(poly8x16_t a, poly8x16_t b) {
// CHECK-LABEL: define void @test_vtrnq_p16(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY16X8X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META54:![0-9]+]])
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-// CHECK-NEXT: store <8 x i16> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META54]]
-// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <8 x i16>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: store <8 x i16> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META54:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-// CHECK-NEXT: store <8 x i16> [[VTRN1_I]], ptr [[TMP2]], align 4, !alias.scope [[META54]]
+// CHECK-NEXT: store <8 x i16> [[VTRN1_I]], ptr [[TMP0]], align 4, !alias.scope [[META54]]
// CHECK-NEXT: ret void
//
poly16x8x2_t test_vtrnq_p16(poly16x8_t a, poly16x8_t b) {
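The vtst tests that follow cover the "test bits" intrinsics: each lane becomes all-ones when (a & b) is non-zero and all-zeros otherwise, which is exactly the and, icmp ne, sext sequence the updated checks expect. A minimal sketch (the helper name `any_bits_set` is a placeholder):

#include <arm_neon.h>

// Per lane: 0xFFFF if (a & b) != 0, else 0.
uint16x4_t any_bits_set(int16x4_t a, int16x4_t b) {
  return vtst_s16(a, b);
}
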
@@ -25457,11 +22422,9 @@ uint8x8_t test_vtst_s8(int8x8_t a, int8x8_t b) {
// CHECK-LABEL: define <4 x i16> @test_vtst_s16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = and <4 x i16> [[A]], [[B]]
-// CHECK-NEXT: [[TMP3:%.*]] = icmp ne <4 x i16> [[TMP2]], zeroinitializer
-// CHECK-NEXT: [[VTST_I:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i16>
+// CHECK-NEXT: [[TMP0:%.*]] = and <4 x i16> [[A]], [[B]]
+// CHECK-NEXT: [[TMP1:%.*]] = icmp ne <4 x i16> [[TMP0]], zeroinitializer
+// CHECK-NEXT: [[VTST_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16>
// CHECK-NEXT: ret <4 x i16> [[VTST_I]]
//
uint16x4_t test_vtst_s16(int16x4_t a, int16x4_t b) {
@@ -25471,11 +22434,9 @@ uint16x4_t test_vtst_s16(int16x4_t a, int16x4_t b) {
// CHECK-LABEL: define <2 x i32> @test_vtst_s32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = and <2 x i32> [[A]], [[B]]
-// CHECK-NEXT: [[TMP3:%.*]] = icmp ne <2 x i32> [[TMP2]], zeroinitializer
-// CHECK-NEXT: [[VTST_I:%.*]] = sext <2 x i1> [[TMP3]] to <2 x i32>
+// CHECK-NEXT: [[TMP0:%.*]] = and <2 x i32> [[A]], [[B]]
+// CHECK-NEXT: [[TMP1:%.*]] = icmp ne <2 x i32> [[TMP0]], zeroinitializer
+// CHECK-NEXT: [[VTST_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i32>
// CHECK-NEXT: ret <2 x i32> [[VTST_I]]
//
uint32x2_t test_vtst_s32(int32x2_t a, int32x2_t b) {
@@ -25497,11 +22458,9 @@ uint8x8_t test_vtst_u8(uint8x8_t a, uint8x8_t b) {
// CHECK-LABEL: define <4 x i16> @test_vtst_u16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = and <4 x i16> [[A]], [[B]]
-// CHECK-NEXT: [[TMP3:%.*]] = icmp ne <4 x i16> [[TMP2]], zeroinitializer
-// CHECK-NEXT: [[VTST_I:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i16>
+// CHECK-NEXT: [[TMP0:%.*]] = and <4 x i16> [[A]], [[B]]
+// CHECK-NEXT: [[TMP1:%.*]] = icmp ne <4 x i16> [[TMP0]], zeroinitializer
+// CHECK-NEXT: [[VTST_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16>
// CHECK-NEXT: ret <4 x i16> [[VTST_I]]
//
uint16x4_t test_vtst_u16(uint16x4_t a, uint16x4_t b) {
@@ -25511,11 +22470,9 @@ uint16x4_t test_vtst_u16(uint16x4_t a, uint16x4_t b) {
// CHECK-LABEL: define <2 x i32> @test_vtst_u32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = and <2 x i32> [[A]], [[B]]
-// CHECK-NEXT: [[TMP3:%.*]] = icmp ne <2 x i32> [[TMP2]], zeroinitializer
-// CHECK-NEXT: [[VTST_I:%.*]] = sext <2 x i1> [[TMP3]] to <2 x i32>
+// CHECK-NEXT: [[TMP0:%.*]] = and <2 x i32> [[A]], [[B]]
+// CHECK-NEXT: [[TMP1:%.*]] = icmp ne <2 x i32> [[TMP0]], zeroinitializer
+// CHECK-NEXT: [[VTST_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i32>
// CHECK-NEXT: ret <2 x i32> [[VTST_I]]
//
uint32x2_t test_vtst_u32(uint32x2_t a, uint32x2_t b) {
@@ -25537,11 +22494,9 @@ uint8x8_t test_vtst_p8(poly8x8_t a, poly8x8_t b) {
// CHECK-LABEL: define <4 x i16> @test_vtst_p16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = and <4 x i16> [[A]], [[B]]
-// CHECK-NEXT: [[TMP3:%.*]] = icmp ne <4 x i16> [[TMP2]], zeroinitializer
-// CHECK-NEXT: [[VTST_I:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i16>
+// CHECK-NEXT: [[TMP0:%.*]] = and <4 x i16> [[A]], [[B]]
+// CHECK-NEXT: [[TMP1:%.*]] = icmp ne <4 x i16> [[TMP0]], zeroinitializer
+// CHECK-NEXT: [[VTST_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16>
// CHECK-NEXT: ret <4 x i16> [[VTST_I]]
//
uint16x4_t test_vtst_p16(poly16x4_t a, poly16x4_t b) {
@@ -25563,11 +22518,9 @@ uint8x16_t test_vtstq_s8(int8x16_t a, int8x16_t b) {
// CHECK-LABEL: define <8 x i16> @test_vtstq_s16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = and <8 x i16> [[A]], [[B]]
-// CHECK-NEXT: [[TMP3:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer
-// CHECK-NEXT: [[VTST_I:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i16>
+// CHECK-NEXT: [[TMP0:%.*]] = and <8 x i16> [[A]], [[B]]
+// CHECK-NEXT: [[TMP1:%.*]] = icmp ne <8 x i16> [[TMP0]], zeroinitializer
+// CHECK-NEXT: [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i16>
// CHECK-NEXT: ret <8 x i16> [[VTST_I]]
//
uint16x8_t test_vtstq_s16(int16x8_t a, int16x8_t b) {
@@ -25577,11 +22530,9 @@ uint16x8_t test_vtstq_s16(int16x8_t a, int16x8_t b) {
// CHECK-LABEL: define <4 x i32> @test_vtstq_s32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = and <4 x i32> [[A]], [[B]]
-// CHECK-NEXT: [[TMP3:%.*]] = icmp ne <4 x i32> [[TMP2]], zeroinitializer
-// CHECK-NEXT: [[VTST_I:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i32>
+// CHECK-NEXT: [[TMP0:%.*]] = and <4 x i32> [[A]], [[B]]
+// CHECK-NEXT: [[TMP1:%.*]] = icmp ne <4 x i32> [[TMP0]], zeroinitializer
+// CHECK-NEXT: [[VTST_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[VTST_I]]
//
uint32x4_t test_vtstq_s32(int32x4_t a, int32x4_t b) {
@@ -25603,11 +22554,9 @@ uint8x16_t test_vtstq_u8(uint8x16_t a, uint8x16_t b) {
// CHECK-LABEL: define <8 x i16> @test_vtstq_u16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = and <8 x i16> [[A]], [[B]]
-// CHECK-NEXT: [[TMP3:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer
-// CHECK-NEXT: [[VTST_I:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i16>
+// CHECK-NEXT: [[TMP0:%.*]] = and <8 x i16> [[A]], [[B]]
+// CHECK-NEXT: [[TMP1:%.*]] = icmp ne <8 x i16> [[TMP0]], zeroinitializer
+// CHECK-NEXT: [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i16>
// CHECK-NEXT: ret <8 x i16> [[VTST_I]]
//
uint16x8_t test_vtstq_u16(uint16x8_t a, uint16x8_t b) {
@@ -25617,11 +22566,9 @@ uint16x8_t test_vtstq_u16(uint16x8_t a, uint16x8_t b) {
// CHECK-LABEL: define <4 x i32> @test_vtstq_u32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = and <4 x i32> [[A]], [[B]]
-// CHECK-NEXT: [[TMP3:%.*]] = icmp ne <4 x i32> [[TMP2]], zeroinitializer
-// CHECK-NEXT: [[VTST_I:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i32>
+// CHECK-NEXT: [[TMP0:%.*]] = and <4 x i32> [[A]], [[B]]
+// CHECK-NEXT: [[TMP1:%.*]] = icmp ne <4 x i32> [[TMP0]], zeroinitializer
+// CHECK-NEXT: [[VTST_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[VTST_I]]
//
uint32x4_t test_vtstq_u32(uint32x4_t a, uint32x4_t b) {
@@ -25643,11 +22590,9 @@ uint8x16_t test_vtstq_p8(poly8x16_t a, poly8x16_t b) {
// CHECK-LABEL: define <8 x i16> @test_vtstq_p16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = and <8 x i16> [[A]], [[B]]
-// CHECK-NEXT: [[TMP3:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer
-// CHECK-NEXT: [[VTST_I:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i16>
+// CHECK-NEXT: [[TMP0:%.*]] = and <8 x i16> [[A]], [[B]]
+// CHECK-NEXT: [[TMP1:%.*]] = icmp ne <8 x i16> [[TMP0]], zeroinitializer
+// CHECK-NEXT: [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i16>
// CHECK-NEXT: ret <8 x i16> [[VTST_I]]
//
uint16x8_t test_vtstq_p16(poly16x8_t a, poly16x8_t b) {
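(The vtst hunks above only drop the dead bitcast temporaries and renumber the remaining values; the semantics being checked are unchanged. As a reference, here is a minimal scalar model of one vtst lane, matching the and / icmp-ne / sext sequence in the new CHECK lines. The helper name is illustrative, not part of the patch.)

#include <stdint.h>

/* One lane of vtst: all-ones when (a & b) has any bit set, else zero.
   Mirrors the IR sequence: and -> icmp ne 0 -> sext. */
static uint16_t vtst_lane16(uint16_t a, uint16_t b) {
  return (a & b) != 0 ? 0xFFFFu : 0x0000u;
}
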
@@ -25657,10 +22602,9 @@ uint16x8_t test_vtstq_p16(poly16x8_t a, poly16x8_t b) {
// CHECK-LABEL: define void @test_vuzp_s8(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT8X8X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META57:![0-9]+]])
// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-// CHECK-NEXT: store <8 x i8> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META57]]
-// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <8 x i8>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: store <8 x i8> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META57:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
// CHECK-NEXT: store <8 x i8> [[VUZP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META57]]
// CHECK-NEXT: ret void
@@ -25672,14 +22616,11 @@ int8x8x2_t test_vuzp_s8(int8x8_t a, int8x8_t b) {
// CHECK-LABEL: define void @test_vuzp_s16(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT16X4X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META60:![0-9]+]])
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-// CHECK-NEXT: store <4 x i16> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META60]]
-// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <4 x i16>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: store <4 x i16> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META60:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-// CHECK-NEXT: store <4 x i16> [[VUZP1_I]], ptr [[TMP2]], align 4, !alias.scope [[META60]]
+// CHECK-NEXT: store <4 x i16> [[VUZP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META60]]
// CHECK-NEXT: ret void
//
int16x4x2_t test_vuzp_s16(int16x4_t a, int16x4_t b) {
@@ -25689,14 +22630,11 @@ int16x4x2_t test_vuzp_s16(int16x4_t a, int16x4_t b) {
// CHECK-LABEL: define void @test_vuzp_s32(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT32X2X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META63:![0-9]+]])
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> <i32 0, i32 2>
-// CHECK-NEXT: store <2 x i32> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META63]]
-// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <2 x i32>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: store <2 x i32> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META63:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> <i32 1, i32 3>
-// CHECK-NEXT: store <2 x i32> [[VUZP1_I]], ptr [[TMP2]], align 4, !alias.scope [[META63]]
+// CHECK-NEXT: store <2 x i32> [[VUZP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META63]]
// CHECK-NEXT: ret void
//
int32x2x2_t test_vuzp_s32(int32x2_t a, int32x2_t b) {
@@ -25706,10 +22644,9 @@ int32x2x2_t test_vuzp_s32(int32x2_t a, int32x2_t b) {
// CHECK-LABEL: define void @test_vuzp_u8(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT8X8X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META66:![0-9]+]])
// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-// CHECK-NEXT: store <8 x i8> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META66]]
-// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <8 x i8>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: store <8 x i8> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META66:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
// CHECK-NEXT: store <8 x i8> [[VUZP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META66]]
// CHECK-NEXT: ret void
@@ -25721,14 +22658,11 @@ uint8x8x2_t test_vuzp_u8(uint8x8_t a, uint8x8_t b) {
// CHECK-LABEL: define void @test_vuzp_u16(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT16X4X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META69:![0-9]+]])
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-// CHECK-NEXT: store <4 x i16> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META69]]
-// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <4 x i16>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: store <4 x i16> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META69:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-// CHECK-NEXT: store <4 x i16> [[VUZP1_I]], ptr [[TMP2]], align 4, !alias.scope [[META69]]
+// CHECK-NEXT: store <4 x i16> [[VUZP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META69]]
// CHECK-NEXT: ret void
//
uint16x4x2_t test_vuzp_u16(uint16x4_t a, uint16x4_t b) {
@@ -25738,14 +22672,11 @@ uint16x4x2_t test_vuzp_u16(uint16x4_t a, uint16x4_t b) {
// CHECK-LABEL: define void @test_vuzp_u32(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT32X2X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META72:![0-9]+]])
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> <i32 0, i32 2>
-// CHECK-NEXT: store <2 x i32> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META72]]
-// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <2 x i32>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: store <2 x i32> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META72:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> <i32 1, i32 3>
-// CHECK-NEXT: store <2 x i32> [[VUZP1_I]], ptr [[TMP2]], align 4, !alias.scope [[META72]]
+// CHECK-NEXT: store <2 x i32> [[VUZP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META72]]
// CHECK-NEXT: ret void
//
uint32x2x2_t test_vuzp_u32(uint32x2_t a, uint32x2_t b) {
@@ -25755,14 +22686,11 @@ uint32x2x2_t test_vuzp_u32(uint32x2_t a, uint32x2_t b) {
// CHECK-LABEL: define void @test_vuzp_f32(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT32X2X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META75:![0-9]+]])
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <8 x i8>
// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[B]], <2 x i32> <i32 0, i32 2>
-// CHECK-NEXT: store <2 x float> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META75]]
-// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <2 x float>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: store <2 x float> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META75:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[B]], <2 x i32> <i32 1, i32 3>
-// CHECK-NEXT: store <2 x float> [[VUZP1_I]], ptr [[TMP2]], align 4, !alias.scope [[META75]]
+// CHECK-NEXT: store <2 x float> [[VUZP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META75]]
// CHECK-NEXT: ret void
//
float32x2x2_t test_vuzp_f32(float32x2_t a, float32x2_t b) {
@@ -25772,10 +22700,9 @@ float32x2x2_t test_vuzp_f32(float32x2_t a, float32x2_t b) {
// CHECK-LABEL: define void @test_vuzp_p8(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY8X8X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META78:![0-9]+]])
// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-// CHECK-NEXT: store <8 x i8> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META78]]
-// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <8 x i8>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: store <8 x i8> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META78:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
// CHECK-NEXT: store <8 x i8> [[VUZP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META78]]
// CHECK-NEXT: ret void
@@ -25787,14 +22714,11 @@ poly8x8x2_t test_vuzp_p8(poly8x8_t a, poly8x8_t b) {
// CHECK-LABEL: define void @test_vuzp_p16(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY16X4X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META81:![0-9]+]])
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-// CHECK-NEXT: store <4 x i16> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META81]]
-// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <4 x i16>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: store <4 x i16> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META81:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-// CHECK-NEXT: store <4 x i16> [[VUZP1_I]], ptr [[TMP2]], align 4, !alias.scope [[META81]]
+// CHECK-NEXT: store <4 x i16> [[VUZP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META81]]
// CHECK-NEXT: ret void
//
poly16x4x2_t test_vuzp_p16(poly16x4_t a, poly16x4_t b) {
@@ -25804,10 +22728,9 @@ poly16x4x2_t test_vuzp_p16(poly16x4_t a, poly16x4_t b) {
// CHECK-LABEL: define void @test_vuzpq_s8(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT8X16X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META84:![0-9]+]])
// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
-// CHECK-NEXT: store <16 x i8> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META84]]
-// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <16 x i8>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: store <16 x i8> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META84:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
// CHECK-NEXT: store <16 x i8> [[VUZP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META84]]
// CHECK-NEXT: ret void
@@ -25819,14 +22742,11 @@ int8x16x2_t test_vuzpq_s8(int8x16_t a, int8x16_t b) {
// CHECK-LABEL: define void @test_vuzpq_s16(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT16X8X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META87:![0-9]+]])
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-// CHECK-NEXT: store <8 x i16> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META87]]
-// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <8 x i16>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: store <8 x i16> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META87:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-// CHECK-NEXT: store <8 x i16> [[VUZP1_I]], ptr [[TMP2]], align 4, !alias.scope [[META87]]
+// CHECK-NEXT: store <8 x i16> [[VUZP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META87]]
// CHECK-NEXT: ret void
//
int16x8x2_t test_vuzpq_s16(int16x8_t a, int16x8_t b) {
@@ -25836,14 +22756,11 @@ int16x8x2_t test_vuzpq_s16(int16x8_t a, int16x8_t b) {
// CHECK-LABEL: define void @test_vuzpq_s32(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT32X4X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META90:![0-9]+]])
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-// CHECK-NEXT: store <4 x i32> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META90]]
-// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <4 x i32>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: store <4 x i32> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META90:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-// CHECK-NEXT: store <4 x i32> [[VUZP1_I]], ptr [[TMP2]], align 4, !alias.scope [[META90]]
+// CHECK-NEXT: store <4 x i32> [[VUZP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META90]]
// CHECK-NEXT: ret void
//
int32x4x2_t test_vuzpq_s32(int32x4_t a, int32x4_t b) {
@@ -25853,10 +22770,9 @@ int32x4x2_t test_vuzpq_s32(int32x4_t a, int32x4_t b) {
// CHECK-LABEL: define void @test_vuzpq_u8(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT8X16X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META93:![0-9]+]])
// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
-// CHECK-NEXT: store <16 x i8> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META93]]
-// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <16 x i8>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: store <16 x i8> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META93:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
// CHECK-NEXT: store <16 x i8> [[VUZP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META93]]
// CHECK-NEXT: ret void
@@ -25868,14 +22784,11 @@ uint8x16x2_t test_vuzpq_u8(uint8x16_t a, uint8x16_t b) {
// CHECK-LABEL: define void @test_vuzpq_u16(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT16X8X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META96:![0-9]+]])
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-// CHECK-NEXT: store <8 x i16> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META96]]
-// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <8 x i16>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: store <8 x i16> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META96:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-// CHECK-NEXT: store <8 x i16> [[VUZP1_I]], ptr [[TMP2]], align 4, !alias.scope [[META96]]
+// CHECK-NEXT: store <8 x i16> [[VUZP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META96]]
// CHECK-NEXT: ret void
//
uint16x8x2_t test_vuzpq_u16(uint16x8_t a, uint16x8_t b) {
@@ -25885,14 +22798,11 @@ uint16x8x2_t test_vuzpq_u16(uint16x8_t a, uint16x8_t b) {
// CHECK-LABEL: define void @test_vuzpq_u32(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT32X4X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META99:![0-9]+]])
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-// CHECK-NEXT: store <4 x i32> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META99]]
-// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <4 x i32>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: store <4 x i32> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META99:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-// CHECK-NEXT: store <4 x i32> [[VUZP1_I]], ptr [[TMP2]], align 4, !alias.scope [[META99]]
+// CHECK-NEXT: store <4 x i32> [[VUZP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META99]]
// CHECK-NEXT: ret void
//
uint32x4x2_t test_vuzpq_u32(uint32x4_t a, uint32x4_t b) {
@@ -25902,14 +22812,11 @@ uint32x4x2_t test_vuzpq_u32(uint32x4_t a, uint32x4_t b) {
// CHECK-LABEL: define void @test_vuzpq_f32(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT32X4X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META102:![0-9]+]])
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B]] to <16 x i8>
// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-// CHECK-NEXT: store <4 x float> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META102]]
-// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <4 x float>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: store <4 x float> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META102:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-// CHECK-NEXT: store <4 x float> [[VUZP1_I]], ptr [[TMP2]], align 4, !alias.scope [[META102]]
+// CHECK-NEXT: store <4 x float> [[VUZP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META102]]
// CHECK-NEXT: ret void
//
float32x4x2_t test_vuzpq_f32(float32x4_t a, float32x4_t b) {
@@ -25919,10 +22826,9 @@ float32x4x2_t test_vuzpq_f32(float32x4_t a, float32x4_t b) {
// CHECK-LABEL: define void @test_vuzpq_p8(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY8X16X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META105:![0-9]+]])
// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
-// CHECK-NEXT: store <16 x i8> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META105]]
-// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <16 x i8>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: store <16 x i8> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META105:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
// CHECK-NEXT: store <16 x i8> [[VUZP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META105]]
// CHECK-NEXT: ret void
@@ -25934,14 +22840,11 @@ poly8x16x2_t test_vuzpq_p8(poly8x16_t a, poly8x16_t b) {
// CHECK-LABEL: define void @test_vuzpq_p16(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY16X8X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META108:![0-9]+]])
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-// CHECK-NEXT: store <8 x i16> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META108]]
-// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <8 x i16>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: store <8 x i16> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META108:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-// CHECK-NEXT: store <8 x i16> [[VUZP1_I]], ptr [[TMP2]], align 4, !alias.scope [[META108]]
+// CHECK-NEXT: store <8 x i16> [[VUZP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META108]]
// CHECK-NEXT: ret void
//
poly16x8x2_t test_vuzpq_p16(poly16x8_t a, poly16x8_t b) {
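(The vuzp updates follow the same pattern: the dead bitcasts disappear and the second store now addresses the result through a byte-offset GEP, `getelementptr inbounds nuw i8, ... i32 8` for the 64-bit forms and `i32 16` for the q forms, instead of a typed vector GEP. The shuffle masks are untouched; a scalar sketch of the even/odd de-interleave they encode, illustrative only:)

#include <stdint.h>

/* vuzp on 8x8 lanes: result 0 takes the even lanes of the
   concatenation (a, b) -- mask <0,2,...,14> -- and result 1
   the odd lanes -- mask <1,3,...,15>. */
static void vuzp8_model(const int8_t a[8], const int8_t b[8],
                        int8_t even[8], int8_t odd[8]) {
  int8_t cat[16];
  for (int i = 0; i < 8; ++i) { cat[i] = a[i]; cat[8 + i] = b[i]; }
  for (int i = 0; i < 8; ++i) {
    even[i] = cat[2 * i];
    odd[i]  = cat[2 * i + 1];
  }
}
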
@@ -25951,10 +22854,9 @@ poly16x8x2_t test_vuzpq_p16(poly16x8_t a, poly16x8_t b) {
// CHECK-LABEL: define void @test_vzip_s8(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT8X8X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META111:![0-9]+]])
// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
-// CHECK-NEXT: store <8 x i8> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META111]]
-// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <8 x i8>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: store <8 x i8> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META111:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
// CHECK-NEXT: store <8 x i8> [[VZIP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META111]]
// CHECK-NEXT: ret void
@@ -25966,14 +22868,11 @@ int8x8x2_t test_vzip_s8(int8x8_t a, int8x8_t b) {
// CHECK-LABEL: define void @test_vzip_s16(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT16X4X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META114:![0-9]+]])
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
-// CHECK-NEXT: store <4 x i16> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META114]]
-// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <4 x i16>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: store <4 x i16> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META114:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 2, i32 6, i32 3, i32 7>
-// CHECK-NEXT: store <4 x i16> [[VZIP1_I]], ptr [[TMP2]], align 4, !alias.scope [[META114]]
+// CHECK-NEXT: store <4 x i16> [[VZIP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META114]]
// CHECK-NEXT: ret void
//
int16x4x2_t test_vzip_s16(int16x4_t a, int16x4_t b) {
@@ -25983,14 +22882,11 @@ int16x4x2_t test_vzip_s16(int16x4_t a, int16x4_t b) {
// CHECK-LABEL: define void @test_vzip_s32(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT32X2X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META117:![0-9]+]])
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> <i32 0, i32 2>
-// CHECK-NEXT: store <2 x i32> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META117]]
-// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <2 x i32>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: store <2 x i32> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META117:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> <i32 1, i32 3>
-// CHECK-NEXT: store <2 x i32> [[VZIP1_I]], ptr [[TMP2]], align 4, !alias.scope [[META117]]
+// CHECK-NEXT: store <2 x i32> [[VZIP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META117]]
// CHECK-NEXT: ret void
//
int32x2x2_t test_vzip_s32(int32x2_t a, int32x2_t b) {
@@ -26000,10 +22896,9 @@ int32x2x2_t test_vzip_s32(int32x2_t a, int32x2_t b) {
// CHECK-LABEL: define void @test_vzip_u8(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT8X8X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META120:![0-9]+]])
// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
-// CHECK-NEXT: store <8 x i8> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META120]]
-// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <8 x i8>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: store <8 x i8> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META120:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
// CHECK-NEXT: store <8 x i8> [[VZIP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META120]]
// CHECK-NEXT: ret void
@@ -26015,14 +22910,11 @@ uint8x8x2_t test_vzip_u8(uint8x8_t a, uint8x8_t b) {
// CHECK-LABEL: define void @test_vzip_u16(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT16X4X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META123:![0-9]+]])
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
-// CHECK-NEXT: store <4 x i16> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META123]]
-// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <4 x i16>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: store <4 x i16> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META123:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 2, i32 6, i32 3, i32 7>
-// CHECK-NEXT: store <4 x i16> [[VZIP1_I]], ptr [[TMP2]], align 4, !alias.scope [[META123]]
+// CHECK-NEXT: store <4 x i16> [[VZIP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META123]]
// CHECK-NEXT: ret void
//
uint16x4x2_t test_vzip_u16(uint16x4_t a, uint16x4_t b) {
@@ -26032,14 +22924,11 @@ uint16x4x2_t test_vzip_u16(uint16x4_t a, uint16x4_t b) {
// CHECK-LABEL: define void @test_vzip_u32(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT32X2X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META126:![0-9]+]])
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> <i32 0, i32 2>
-// CHECK-NEXT: store <2 x i32> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META126]]
-// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <2 x i32>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: store <2 x i32> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META126:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> <i32 1, i32 3>
-// CHECK-NEXT: store <2 x i32> [[VZIP1_I]], ptr [[TMP2]], align 4, !alias.scope [[META126]]
+// CHECK-NEXT: store <2 x i32> [[VZIP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META126]]
// CHECK-NEXT: ret void
//
uint32x2x2_t test_vzip_u32(uint32x2_t a, uint32x2_t b) {
@@ -26049,14 +22938,11 @@ uint32x2x2_t test_vzip_u32(uint32x2_t a, uint32x2_t b) {
// CHECK-LABEL: define void @test_vzip_f32(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT32X2X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META129:![0-9]+]])
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <8 x i8>
// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[B]], <2 x i32> <i32 0, i32 2>
-// CHECK-NEXT: store <2 x float> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META129]]
-// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <2 x float>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: store <2 x float> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META129:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[B]], <2 x i32> <i32 1, i32 3>
-// CHECK-NEXT: store <2 x float> [[VZIP1_I]], ptr [[TMP2]], align 4, !alias.scope [[META129]]
+// CHECK-NEXT: store <2 x float> [[VZIP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META129]]
// CHECK-NEXT: ret void
//
float32x2x2_t test_vzip_f32(float32x2_t a, float32x2_t b) {
@@ -26066,10 +22952,9 @@ float32x2x2_t test_vzip_f32(float32x2_t a, float32x2_t b) {
// CHECK-LABEL: define void @test_vzip_p8(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY8X8X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META132:![0-9]+]])
// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
-// CHECK-NEXT: store <8 x i8> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META132]]
-// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <8 x i8>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: store <8 x i8> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META132:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
// CHECK-NEXT: store <8 x i8> [[VZIP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META132]]
// CHECK-NEXT: ret void
@@ -26081,14 +22966,11 @@ poly8x8x2_t test_vzip_p8(poly8x8_t a, poly8x8_t b) {
// CHECK-LABEL: define void @test_vzip_p16(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY16X4X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META135:![0-9]+]])
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
-// CHECK-NEXT: store <4 x i16> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META135]]
-// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <4 x i16>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: store <4 x i16> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META135:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 2, i32 6, i32 3, i32 7>
-// CHECK-NEXT: store <4 x i16> [[VZIP1_I]], ptr [[TMP2]], align 4, !alias.scope [[META135]]
+// CHECK-NEXT: store <4 x i16> [[VZIP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META135]]
// CHECK-NEXT: ret void
//
poly16x4x2_t test_vzip_p16(poly16x4_t a, poly16x4_t b) {
@@ -26098,10 +22980,9 @@ poly16x4x2_t test_vzip_p16(poly16x4_t a, poly16x4_t b) {
// CHECK-LABEL: define void @test_vzipq_s8(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT8X16X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META138:![0-9]+]])
// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
-// CHECK-NEXT: store <16 x i8> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META138]]
-// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <16 x i8>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: store <16 x i8> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META138:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
// CHECK-NEXT: store <16 x i8> [[VZIP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META138]]
// CHECK-NEXT: ret void
@@ -26113,14 +22994,11 @@ int8x16x2_t test_vzipq_s8(int8x16_t a, int8x16_t b) {
// CHECK-LABEL: define void @test_vzipq_s16(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT16X8X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META141:![0-9]+]])
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
-// CHECK-NEXT: store <8 x i16> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META141]]
-// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <8 x i16>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: store <8 x i16> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META141:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-// CHECK-NEXT: store <8 x i16> [[VZIP1_I]], ptr [[TMP2]], align 4, !alias.scope [[META141]]
+// CHECK-NEXT: store <8 x i16> [[VZIP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META141]]
// CHECK-NEXT: ret void
//
int16x8x2_t test_vzipq_s16(int16x8_t a, int16x8_t b) {
@@ -26130,14 +23008,11 @@ int16x8x2_t test_vzipq_s16(int16x8_t a, int16x8_t b) {
// CHECK-LABEL: define void @test_vzipq_s32(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT32X4X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META144:![0-9]+]])
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
-// CHECK-NEXT: store <4 x i32> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META144]]
-// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <4 x i32>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: store <4 x i32> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META144:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 2, i32 6, i32 3, i32 7>
-// CHECK-NEXT: store <4 x i32> [[VZIP1_I]], ptr [[TMP2]], align 4, !alias.scope [[META144]]
+// CHECK-NEXT: store <4 x i32> [[VZIP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META144]]
// CHECK-NEXT: ret void
//
int32x4x2_t test_vzipq_s32(int32x4_t a, int32x4_t b) {
@@ -26147,10 +23022,9 @@ int32x4x2_t test_vzipq_s32(int32x4_t a, int32x4_t b) {
// CHECK-LABEL: define void @test_vzipq_u8(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT8X16X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META147:![0-9]+]])
// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
-// CHECK-NEXT: store <16 x i8> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META147]]
-// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <16 x i8>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: store <16 x i8> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META147:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
// CHECK-NEXT: store <16 x i8> [[VZIP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META147]]
// CHECK-NEXT: ret void
@@ -26162,14 +23036,11 @@ uint8x16x2_t test_vzipq_u8(uint8x16_t a, uint8x16_t b) {
// CHECK-LABEL: define void @test_vzipq_u16(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT16X8X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META150:![0-9]+]])
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
-// CHECK-NEXT: store <8 x i16> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META150]]
-// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <8 x i16>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: store <8 x i16> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META150:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-// CHECK-NEXT: store <8 x i16> [[VZIP1_I]], ptr [[TMP2]], align 4, !alias.scope [[META150]]
+// CHECK-NEXT: store <8 x i16> [[VZIP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META150]]
// CHECK-NEXT: ret void
//
uint16x8x2_t test_vzipq_u16(uint16x8_t a, uint16x8_t b) {
@@ -26179,14 +23050,11 @@ uint16x8x2_t test_vzipq_u16(uint16x8_t a, uint16x8_t b) {
// CHECK-LABEL: define void @test_vzipq_u32(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT32X4X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META153:![0-9]+]])
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
-// CHECK-NEXT: store <4 x i32> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META153]]
-// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <4 x i32>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: store <4 x i32> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META153:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 2, i32 6, i32 3, i32 7>
-// CHECK-NEXT: store <4 x i32> [[VZIP1_I]], ptr [[TMP2]], align 4, !alias.scope [[META153]]
+// CHECK-NEXT: store <4 x i32> [[VZIP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META153]]
// CHECK-NEXT: ret void
//
uint32x4x2_t test_vzipq_u32(uint32x4_t a, uint32x4_t b) {
@@ -26196,14 +23064,11 @@ uint32x4x2_t test_vzipq_u32(uint32x4_t a, uint32x4_t b) {
// CHECK-LABEL: define void @test_vzipq_f32(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT32X4X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META156:![0-9]+]])
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B]] to <16 x i8>
// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
-// CHECK-NEXT: store <4 x float> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META156]]
-// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <4 x float>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: store <4 x float> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META156:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 2, i32 6, i32 3, i32 7>
-// CHECK-NEXT: store <4 x float> [[VZIP1_I]], ptr [[TMP2]], align 4, !alias.scope [[META156]]
+// CHECK-NEXT: store <4 x float> [[VZIP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META156]]
// CHECK-NEXT: ret void
//
float32x4x2_t test_vzipq_f32(float32x4_t a, float32x4_t b) {
@@ -26213,10 +23078,9 @@ float32x4x2_t test_vzipq_f32(float32x4_t a, float32x4_t b) {
// CHECK-LABEL: define void @test_vzipq_p8(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY8X16X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META159:![0-9]+]])
// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
-// CHECK-NEXT: store <16 x i8> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META159]]
-// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <16 x i8>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: store <16 x i8> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META159:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
// CHECK-NEXT: store <16 x i8> [[VZIP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META159]]
// CHECK-NEXT: ret void
@@ -26228,14 +23092,11 @@ poly8x16x2_t test_vzipq_p8(poly8x16_t a, poly8x16_t b) {
// CHECK-LABEL: define void @test_vzipq_p16(
// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY16X8X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META162:![0-9]+]])
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
-// CHECK-NEXT: store <8 x i16> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META162]]
-// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <8 x i16>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: store <8 x i16> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META162:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-// CHECK-NEXT: store <8 x i16> [[VZIP1_I]], ptr [[TMP2]], align 4, !alias.scope [[META162]]
+// CHECK-NEXT: store <8 x i16> [[VZIP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META162]]
// CHECK-NEXT: ret void
//
poly16x8x2_t test_vzipq_p16(poly16x8_t a, poly16x8_t b) {
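(The vzip checks change in the same two ways. The byte-offset form also makes the result layout explicit: the second half of an int8x8x2_t starts 8 bytes after the first, 16 bytes for the q types. A sketch under the assumption that <arm_neon.h> is available for the target; the model function is illustrative, not part of the patch.)

#include <arm_neon.h>
#include <stddef.h>
#include <stdint.h>

_Static_assert(offsetof(int8x8x2_t, val[1]) == 8,
               "the `i32 8` byte offset in the GEP is the second field");

/* vzip on 8x8 lanes: interleave the low halves of a and b into
   result 0 and the high halves into result 1. */
static void vzip8_model(const int8_t a[8], const int8_t b[8],
                        int8_t lo[8], int8_t hi[8]) {
  for (int i = 0; i < 4; ++i) {
    lo[2 * i] = a[i];     lo[2 * i + 1] = b[i];
    hi[2 * i] = a[i + 4]; hi[2 * i + 1] = b[i + 4];
  }
}
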
diff --git a/llvm/test/CodeGen/AArch64/neon-misc-constrained.ll b/llvm/test/CodeGen/AArch64/neon-misc-constrained.ll
new file mode 100644
index 00000000000000..26642b5ae82082
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/neon-misc-constrained.ll
@@ -0,0 +1,46 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s | FileCheck %s
+target triple = "arm64-none-linux-gnu"
+
+define <2 x double> @test_vrndaq_f64(<2 x double> %a) #0 {
+; CHECK-LABEL: test_vrndaq_f64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: frinta v0.2d, v0.2d
+; CHECK-NEXT: ret
+entry:
+ %vrnda1.i = call <2 x double> @llvm.experimental.constrained.round.v2f64(<2 x double> %a, metadata !"fpexcept.strict") #1
+ ret <2 x double> %vrnda1.i
+}
+
+define <2 x double> @test_vrndpq_f64(<2 x double> %a) #0 {
+; CHECK-LABEL: test_vrndpq_f64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: frintp v0.2d, v0.2d
+; CHECK-NEXT: ret
+entry:
+ %vrndp1.i = call <2 x double> @llvm.experimental.constrained.ceil.v2f64(<2 x double> %a, metadata !"fpexcept.strict") #1
+ ret <2 x double> %vrndp1.i
+}
+
+define <4 x float> @test_vsqrtq_f32(<4 x float> %a) #0 {
+; CHECK-LABEL: test_vsqrtq_f32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fsqrt v0.4s, v0.4s
+; CHECK-NEXT: ret
+entry:
+ %vsqrt.i = call <4 x float> @llvm.experimental.constrained.sqrt.v4f32(<4 x float> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+ ret <4 x float> %vsqrt.i
+}
+
+define <2 x double> @test_vsqrtq_f64(<2 x double> %a) #0 {
+; CHECK-LABEL: test_vsqrtq_f64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fsqrt v0.2d, v0.2d
+; CHECK-NEXT: ret
+entry:
+ %vsqrt.i = call <2 x double> @llvm.experimental.constrained.sqrt.v2f64(<2 x double> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+ ret <2 x double> %vsqrt.i
+}
+
+attributes #0 = { noinline nounwind strictfp "target-features"="+neon" }
+attributes #1 = { strictfp }
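(Illustration, not part of the patch: a minimal C++ sketch of the kind of source these constrained tests model. The function names and driver flags below are assumptions; under strict exception behaviour the NEON intrinsics lower to the llvm.experimental.constrained.* calls checked above, while the -unconstrained twin below exercises the same operations in the default FP environment.)

  // Hypothetical reproducer, built with something like:
  //   clang++ --target=aarch64-linux-gnu -ffp-exception-behavior=strict -O1
  #include <arm_neon.h>
  float64x2_t round_away(float64x2_t v) { return vrndaq_f64(v); } // frinta
  float64x2_t sqrt_q(float64x2_t v)     { return vsqrtq_f64(v); } // fsqrt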
diff --git a/llvm/test/CodeGen/AArch64/neon-misc-unconstrained.ll b/llvm/test/CodeGen/AArch64/neon-misc-unconstrained.ll
new file mode 100644
index 00000000000000..ae7d09d42c02fb
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/neon-misc-unconstrained.ll
@@ -0,0 +1,45 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s | FileCheck %s
+target triple = "arm64-none-linux-gnu"
+
+define <2 x double> @test_vrndaq_f64(<2 x double> %a) #0 {
+; CHECK-LABEL: test_vrndaq_f64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: frinta v0.2d, v0.2d
+; CHECK-NEXT: ret
+entry:
+ %vrnda1.i = call <2 x double> @llvm.round.v2f64(<2 x double> %a)
+ ret <2 x double> %vrnda1.i
+}
+
+define <2 x double> @test_vrndpq_f64(<2 x double> %a) #0 {
+; CHECK-LABEL: test_vrndpq_f64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: frintp v0.2d, v0.2d
+; CHECK-NEXT: ret
+entry:
+ %vrndp1.i = call <2 x double> @llvm.ceil.v2f64(<2 x double> %a)
+ ret <2 x double> %vrndp1.i
+}
+
+define <4 x float> @test_vsqrtq_f32(<4 x float> %a) #0 {
+; CHECK-LABEL: test_vsqrtq_f32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fsqrt v0.4s, v0.4s
+; CHECK-NEXT: ret
+entry:
+ %vsqrt.i = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a)
+ ret <4 x float> %vsqrt.i
+}
+
+define <2 x double> @test_vsqrtq_f64(<2 x double> %a) #0 {
+; CHECK-LABEL: test_vsqrtq_f64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fsqrt v0.2d, v0.2d
+; CHECK-NEXT: ret
+entry:
+ %vsqrt.i = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %a)
+ ret <2 x double> %vsqrt.i
+}
+
+attributes #0 = { noinline nounwind "no-trapping-math"="true" "target-features"="+neon" }
diff --git a/llvm/test/CodeGen/AArch64/neon-scalar-x-indexed-elem-constrained.ll b/llvm/test/CodeGen/AArch64/neon-scalar-x-indexed-elem-constrained.ll
new file mode 100644
index 00000000000000..b683682901e8fc
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/neon-scalar-x-indexed-elem-constrained.ll
@@ -0,0 +1,103 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s | FileCheck %s
+target triple = "arm64-none-linux-gnu"
+
+define float @test_vfmas_lane_f32(float %a, float %b, <2 x float> %c) #0 {
+; CHECK-LABEL: test_vfmas_lane_f32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT: fmla s0, s1, v2.s[1]
+; CHECK-NEXT: ret
+entry:
+ %extract = extractelement <2 x float> %c, i64 1
+ %0 = call float @llvm.experimental.constrained.fma.f32(float %b, float %extract, float %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+ ret float %0
+}
+
+define double @test_vfmad_lane_f64(double %a, double %b, <1 x double> %c) #0 {
+; CHECK-LABEL: test_vfmad_lane_f64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmadd d0, d1, d2, d0
+; CHECK-NEXT: ret
+entry:
+ %extract = extractelement <1 x double> %c, i64 0
+ %0 = call double @llvm.experimental.constrained.fma.f64(double %b, double %extract, double %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+ ret double %0
+}
+
+define double @test_vfmad_laneq_f64(double %a, double %b, <2 x double> %c) #0 {
+; CHECK-LABEL: test_vfmad_laneq_f64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmla d0, d1, v2.d[1]
+; CHECK-NEXT: ret
+entry:
+ %extract = extractelement <2 x double> %c, i64 1
+ %0 = call double @llvm.experimental.constrained.fma.f64(double %b, double %extract, double %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+ ret double %0
+}
+
+define float @test_vfmss_lane_f32(float %a, float %b, <2 x float> %c) #0 {
+; CHECK-LABEL: test_vfmss_lane_f32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT: fmls s0, s1, v2.s[1]
+; CHECK-NEXT: ret
+entry:
+ %fneg = fneg float %b
+ %extract = extractelement <2 x float> %c, i64 1
+ %0 = call float @llvm.experimental.constrained.fma.f32(float %fneg, float %extract, float %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+ ret float %0
+}
+
+define <1 x double> @test_vfma_lane_f64(<1 x double> %a, <1 x double> %b, <1 x double> %v) #0 {
+; CHECK-LABEL: test_vfma_lane_f64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmadd d0, d1, d2, d0
+; CHECK-NEXT: ret
+entry:
+ %fmla2 = call <1 x double> @llvm.experimental.constrained.fma.v1f64(<1 x double> %b, <1 x double> %v, <1 x double> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+ ret <1 x double> %fmla2
+}
+
+define <1 x double> @test_vfms_lane_f64(<1 x double> %a, <1 x double> %b, <1 x double> %v) #0 {
+; CHECK-LABEL: test_vfms_lane_f64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmsub d0, d2, d1, d0
+; CHECK-NEXT: ret
+entry:
+ %fneg = fneg <1 x double> %b
+ %fmla2 = call <1 x double> @llvm.experimental.constrained.fma.v1f64(<1 x double> %fneg, <1 x double> %v, <1 x double> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+ ret <1 x double> %fmla2
+}
+
+define <1 x double> @test_vfma_laneq_f64(<1 x double> %a, <1 x double> %b, <2 x double> %v) #0 {
+; CHECK-LABEL: test_vfma_laneq_f64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmadd d0, d1, d2, d0
+; CHECK-NEXT: ret
+entry:
+ %0 = extractelement <1 x double> %a, i64 0
+ %1 = extractelement <1 x double> %b, i64 0
+ %extract = extractelement <2 x double> %v, i64 0
+ %2 = call double @llvm.experimental.constrained.fma.f64(double %1, double %extract, double %0, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+ %3 = bitcast double %2 to <1 x double>
+ ret <1 x double> %3
+}
+
+define <1 x double> @test_vfms_laneq_f64(<1 x double> %a, <1 x double> %b, <2 x double> %v) #0 {
+; CHECK-LABEL: test_vfms_laneq_f64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmsub d0, d2, d1, d0
+; CHECK-NEXT: ret
+entry:
+ %0 = extractelement <1 x double> %a, i64 0
+ %1 = extractelement <1 x double> %b, i64 0
+ %2 = fneg double %1
+ %extract = extractelement <2 x double> %v, i64 0
+ %3 = call double @llvm.experimental.constrained.fma.f64(double %2, double %extract, double %0, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+ %4 = bitcast double %3 to <1 x double>
+ ret <1 x double> %4
+}
+
+attributes #0 = { noinline nounwind strictfp "target-cpu"="cyclone" "target-features"="+neon" }
+attributes #1 = { strictfp }
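(Likewise for illustration only: the scalar lane tests above correspond to ACLE fused multiply-accumulate intrinsics such as the one below; the wrapper name is hypothetical and strict-FP build flags are assumed.)

  #include <arm_neon.h>
  // fmla s0, s1, v2.s[1]: computes a + b * c[1] with the accumulator in s0.
  float fma_lane(float a, float b, float32x2_t c) {
    return vfmas_lane_f32(a, b, c, 1);
  }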
diff --git a/llvm/test/CodeGen/AArch64/neon-scalar-x-indexed-elem-unconstrained.ll b/llvm/test/CodeGen/AArch64/neon-scalar-x-indexed-elem-unconstrained.ll
new file mode 100644
index 00000000000000..eb3f572ef7072f
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/neon-scalar-x-indexed-elem-unconstrained.ll
@@ -0,0 +1,103 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s | FileCheck %s
+target triple = "arm64-none-linux-gnu"
+
+define float @test_vfmas_lane_f32(float %a, float %b, <2 x float> %c) #0 {
+; CHECK-LABEL: test_vfmas_lane_f32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT: fmla s0, s1, v2.s[1]
+; CHECK-NEXT: ret
+entry:
+ %extract = extractelement <2 x float> %c, i64 1
+ %0 = call float @llvm.fma.f32(float %b, float %extract, float %a)
+ ret float %0
+}
+
+define double @test_vfmad_lane_f64(double %a, double %b, <1 x double> %c) #0 {
+; CHECK-LABEL: test_vfmad_lane_f64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmadd d0, d1, d2, d0
+; CHECK-NEXT: ret
+entry:
+ %extract = extractelement <1 x double> %c, i64 0
+ %0 = call double @llvm.fma.f64(double %b, double %extract, double %a)
+ ret double %0
+}
+
+define double @test_vfmad_laneq_f64(double %a, double %b, <2 x double> %c) #0 {
+; CHECK-LABEL: test_vfmad_laneq_f64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmla d0, d1, v2.d[1]
+; CHECK-NEXT: ret
+entry:
+ %extract = extractelement <2 x double> %c, i64 1
+ %0 = call double @llvm.fma.f64(double %b, double %extract, double %a)
+ ret double %0
+}
+
+define float @test_vfmss_lane_f32(float %a, float %b, <2 x float> %c) #0 {
+; CHECK-LABEL: test_vfmss_lane_f32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT: fmls s0, s1, v2.s[1]
+; CHECK-NEXT: ret
+entry:
+ %fneg = fneg float %b
+ %extract = extractelement <2 x float> %c, i64 1
+ %0 = call float @llvm.fma.f32(float %fneg, float %extract, float %a)
+ ret float %0
+}
+
+define <1 x double> @test_vfma_lane_f64(<1 x double> %a, <1 x double> %b, <1 x double> %v) #0 {
+; CHECK-LABEL: test_vfma_lane_f64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmadd d0, d1, d2, d0
+; CHECK-NEXT: ret
+entry:
+ %fmla2 = call <1 x double> @llvm.fma.v1f64(<1 x double> %b, <1 x double> %v, <1 x double> %a)
+ ret <1 x double> %fmla2
+}
+
+define <1 x double> @test_vfms_lane_f64(<1 x double> %a, <1 x double> %b, <1 x double> %v) #0 {
+; CHECK-LABEL: test_vfms_lane_f64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmsub d0, d1, d2, d0
+; CHECK-NEXT: ret
+entry:
+ %fneg = fneg <1 x double> %b
+ %fmla2 = call <1 x double> @llvm.fma.v1f64(<1 x double> %fneg, <1 x double> %v, <1 x double> %a)
+ ret <1 x double> %fmla2
+}
+
+define <1 x double> @test_vfma_laneq_f64(<1 x double> %a, <1 x double> %b, <2 x double> %v) #0 {
+; CHECK-LABEL: test_vfma_laneq_f64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmadd d0, d1, d2, d0
+; CHECK-NEXT: ret
+entry:
+ %0 = extractelement <1 x double> %a, i64 0
+ %1 = extractelement <1 x double> %b, i64 0
+ %extract = extractelement <2 x double> %v, i64 0
+ %2 = call double @llvm.fma.f64(double %1, double %extract, double %0)
+ %3 = bitcast double %2 to <1 x double>
+ ret <1 x double> %3
+}
+
+define <1 x double> @test_vfms_laneq_f64(<1 x double> %a, <1 x double> %b, <2 x double> %v) #0 {
+; CHECK-LABEL: test_vfms_laneq_f64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmsub d0, d2, d1, d0
+; CHECK-NEXT: ret
+entry:
+ %0 = extractelement <1 x double> %a, i64 0
+ %1 = extractelement <1 x double> %b, i64 0
+ %2 = fneg double %1
+ %extract = extractelement <2 x double> %v, i64 0
+ %3 = call double @llvm.fma.f64(double %2, double %extract, double %0)
+ %4 = bitcast double %3 to <1 x double>
+ ret <1 x double> %4
+}
+
+attributes #0 = { noinline nounwind "no-trapping-math"="true" "target-cpu"="cyclone" "target-features"="+neon" }
+
diff --git a/llvm/test/CodeGen/AArch64/v8.2a-neon-intrinsics-constrained.ll b/llvm/test/CodeGen/AArch64/v8.2a-neon-intrinsics-constrained.ll
new file mode 100644
index 00000000000000..58f1accdf91d54
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/v8.2a-neon-intrinsics-constrained.ll
@@ -0,0 +1,276 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s | FileCheck %s
+target triple = "arm64-none-linux-gnu"
+
+define <4 x half> @test_vsqrt_f16(<4 x half> %a) #0 {
+; CHECK-LABEL: test_vsqrt_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fsqrt v0.4h, v0.4h
+; CHECK-NEXT: ret
+entry:
+ %vsqrt.i = call <4 x half> @llvm.experimental.constrained.sqrt.v4f16(<4 x half> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+ ret <4 x half> %vsqrt.i
+}
+
+define <8 x half> @test_vsqrtq_f16(<8 x half> %a) #0 {
+; CHECK-LABEL: test_vsqrtq_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fsqrt v0.8h, v0.8h
+; CHECK-NEXT: ret
+entry:
+ %vsqrt.i = call <8 x half> @llvm.experimental.constrained.sqrt.v8f16(<8 x half> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+ ret <8 x half> %vsqrt.i
+}
+
+define <4 x half> @test_vfma_f16(<4 x half> %a, <4 x half> %b, <4 x half> %c) #0 {
+; CHECK-LABEL: test_vfma_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmla v0.4h, v2.4h, v1.4h
+; CHECK-NEXT: ret
+entry:
+ %0 = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> %b, <4 x half> %c, <4 x half> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+ ret <4 x half> %0
+}
+
+define <8 x half> @test_vfmaq_f16(<8 x half> %a, <8 x half> %b, <8 x half> %c) #0 {
+; CHECK-LABEL: test_vfmaq_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmla v0.8h, v2.8h, v1.8h
+; CHECK-NEXT: ret
+entry:
+ %0 = call <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half> %b, <8 x half> %c, <8 x half> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+ ret <8 x half> %0
+}
+
+define <4 x half> @test_vfms_f16(<4 x half> %a, <4 x half> %b, <4 x half> %c) #0 {
+; CHECK-LABEL: test_vfms_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmls v0.4h, v2.4h, v1.4h
+; CHECK-NEXT: ret
+entry:
+ %fneg.i = fneg <4 x half> %b
+ %0 = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> %fneg.i, <4 x half> %c, <4 x half> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+ ret <4 x half> %0
+}
+
+define <8 x half> @test_vfmsq_f16(<8 x half> %a, <8 x half> %b, <8 x half> %c) #0 {
+; CHECK-LABEL: test_vfmsq_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmls v0.8h, v2.8h, v1.8h
+; CHECK-NEXT: ret
+entry:
+ %fneg.i = fneg <8 x half> %b
+ %0 = call <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half> %fneg.i, <8 x half> %c, <8 x half> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+ ret <8 x half> %0
+}
+
+define <4 x half> @test_vfma_lane_f16(<4 x half> %a, <4 x half> %b, <4 x half> %c) #0 {
+; CHECK-LABEL: test_vfma_lane_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT: fmla v0.4h, v1.4h, v2.h[3]
+; CHECK-NEXT: ret
+entry:
+ %lane = shufflevector <4 x half> %c, <4 x half> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+ %fmla2 = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> %b, <4 x half> %lane, <4 x half> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+ ret <4 x half> %fmla2
+}
+
+define <8 x half> @test_vfmaq_lane_f16(<8 x half> %a, <8 x half> %b, <4 x half> %c) #0 {
+; CHECK-LABEL: test_vfmaq_lane_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT: fmla v0.8h, v1.8h, v2.h[3]
+; CHECK-NEXT: ret
+entry:
+ %lane = shufflevector <4 x half> %c, <4 x half> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+ %fmla2 = call <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half> %b, <8 x half> %lane, <8 x half> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+ ret <8 x half> %fmla2
+}
+
+define <4 x half> @test_vfma_laneq_f16(<4 x half> %a, <4 x half> %b, <8 x half> %c) #0 {
+; CHECK-LABEL: test_vfma_laneq_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmla v0.4h, v1.4h, v2.h[7]
+; CHECK-NEXT: ret
+entry:
+ %lane = shufflevector <8 x half> %c, <8 x half> poison, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+ %0 = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> %lane, <4 x half> %b, <4 x half> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+ ret <4 x half> %0
+}
+
+define <8 x half> @test_vfmaq_laneq_f16(<8 x half> %a, <8 x half> %b, <8 x half> %c) #0 {
+; CHECK-LABEL: test_vfmaq_laneq_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmla v0.8h, v1.8h, v2.h[7]
+; CHECK-NEXT: ret
+entry:
+ %lane = shufflevector <8 x half> %c, <8 x half> poison, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+ %0 = call <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half> %lane, <8 x half> %b, <8 x half> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+ ret <8 x half> %0
+}
+
+define <4 x half> @test_vfma_n_f16(<4 x half> %a, <4 x half> %b, half %c) #0 {
+; CHECK-LABEL: test_vfma_n_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $h2 killed $h2 def $q2
+; CHECK-NEXT: fmla v0.4h, v1.4h, v2.h[0]
+; CHECK-NEXT: ret
+entry:
+ %vecinit = insertelement <4 x half> poison, half %c, i64 0
+ %vecinit3 = shufflevector <4 x half> %vecinit, <4 x half> poison, <4 x i32> zeroinitializer
+ %0 = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> %b, <4 x half> %vecinit3, <4 x half> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+ ret <4 x half> %0
+}
+
+define <8 x half> @test_vfmaq_n_f16(<8 x half> %a, <8 x half> %b, half %c) #0 {
+; CHECK-LABEL: test_vfmaq_n_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $h2 killed $h2 def $q2
+; CHECK-NEXT: fmla v0.8h, v1.8h, v2.h[0]
+; CHECK-NEXT: ret
+entry:
+ %vecinit = insertelement <8 x half> poison, half %c, i64 0
+ %vecinit7 = shufflevector <8 x half> %vecinit, <8 x half> poison, <8 x i32> zeroinitializer
+ %0 = call <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half> %b, <8 x half> %vecinit7, <8 x half> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+ ret <8 x half> %0
+}
+
+define half @test_vfmah_lane_f16(half %a, half %b, <4 x half> %c) #0 {
+; CHECK-LABEL: test_vfmah_lane_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT: fmla h0, h1, v2.h[3]
+; CHECK-NEXT: ret
+entry:
+ %extract = extractelement <4 x half> %c, i64 3
+ %0 = call half @llvm.experimental.constrained.fma.f16(half %b, half %extract, half %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+ ret half %0
+}
+
+define half @test_vfmah_laneq_f16(half %a, half %b, <8 x half> %c) #0 {
+; CHECK-LABEL: test_vfmah_laneq_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmla h0, h1, v2.h[7]
+; CHECK-NEXT: ret
+entry:
+ %extract = extractelement <8 x half> %c, i64 7
+ %0 = call half @llvm.experimental.constrained.fma.f16(half %b, half %extract, half %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+ ret half %0
+}
+
+define <4 x half> @test_vfms_lane_f16(<4 x half> %a, <4 x half> %b, <4 x half> %c) #0 {
+; CHECK-LABEL: test_vfms_lane_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT: fmls v0.4h, v1.4h, v2.h[3]
+; CHECK-NEXT: ret
+entry:
+ %fneg = fneg <4 x half> %b
+ %lane = shufflevector <4 x half> %c, <4 x half> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+ %fmla2 = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> %fneg, <4 x half> %lane, <4 x half> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+ ret <4 x half> %fmla2
+}
+
+define <8 x half> @test_vfmsq_lane_f16(<8 x half> %a, <8 x half> %b, <4 x half> %c) #0 {
+; CHECK-LABEL: test_vfmsq_lane_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT: fmls v0.8h, v1.8h, v2.h[3]
+; CHECK-NEXT: ret
+entry:
+ %fneg = fneg <8 x half> %b
+ %lane = shufflevector <4 x half> %c, <4 x half> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+ %fmla2 = call <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half> %fneg, <8 x half> %lane, <8 x half> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+ ret <8 x half> %fmla2
+}
+
+define <4 x half> @test_vfms_laneq_f16(<4 x half> %a, <4 x half> %b, <8 x half> %c) #0 {
+; CHECK-LABEL: test_vfms_laneq_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmls v0.4h, v1.4h, v2.h[7]
+; CHECK-NEXT: ret
+entry:
+ %fneg = fneg <4 x half> %b
+ %lane = shufflevector <8 x half> %c, <8 x half> poison, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+ %0 = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> %lane, <4 x half> %fneg, <4 x half> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+ ret <4 x half> %0
+}
+
+define <8 x half> @test_vfmsq_laneq_f16(<8 x half> %a, <8 x half> %b, <8 x half> %c) #0 {
+; CHECK-LABEL: test_vfmsq_laneq_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmls v0.8h, v1.8h, v2.h[7]
+; CHECK-NEXT: ret
+entry:
+ %fneg = fneg <8 x half> %b
+ %lane = shufflevector <8 x half> %c, <8 x half> poison, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+ %0 = call <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half> %lane, <8 x half> %fneg, <8 x half> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+ ret <8 x half> %0
+}
+
+define <4 x half> @test_vfms_n_f16(<4 x half> %a, <4 x half> %b, half %c) #0 {
+; CHECK-LABEL: test_vfms_n_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $h2 killed $h2 def $q2
+; CHECK-NEXT: fmls v0.4h, v1.4h, v2.h[0]
+; CHECK-NEXT: ret
+entry:
+ %fneg = fneg <4 x half> %b
+ %vecinit = insertelement <4 x half> poison, half %c, i64 0
+ %vecinit3 = shufflevector <4 x half> %vecinit, <4 x half> poison, <4 x i32> zeroinitializer
+ %0 = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> %fneg, <4 x half> %vecinit3, <4 x half> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+ ret <4 x half> %0
+}
+
+define <8 x half> @test_vfmsq_n_f16(<8 x half> %a, <8 x half> %b, half %c) #0 {
+; CHECK-LABEL: test_vfmsq_n_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $h2 killed $h2 def $q2
+; CHECK-NEXT: fmls v0.8h, v1.8h, v2.h[0]
+; CHECK-NEXT: ret
+entry:
+ %fneg = fneg <8 x half> %b
+ %vecinit = insertelement <8 x half> poison, half %c, i64 0
+ %vecinit7 = shufflevector <8 x half> %vecinit, <8 x half> poison, <8 x i32> zeroinitializer
+ %0 = call <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half> %fneg, <8 x half> %vecinit7, <8 x half> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+ ret <8 x half> %0
+}
+
+define half @test_vfmsh_lane_f16(half %a, half %b, <4 x half> %c) #0 {
+; CHECK-LABEL: test_vfmsh_lane_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT: fneg s1, s1
+; CHECK-NEXT: fcvt h1, s1
+; CHECK-NEXT: fmla h0, h1, v2.h[3]
+; CHECK-NEXT: ret
+entry:
+ %conv = call float @llvm.experimental.constrained.fpext.f32.f16(half %b, metadata !"fpexcept.strict") #1
+ %fneg = fneg float %conv
+ %0 = call half @llvm.experimental.constrained.fptrunc.f16.f32(float %fneg, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+ %extract = extractelement <4 x half> %c, i64 3
+ %1 = call half @llvm.experimental.constrained.fma.f16(half %0, half %extract, half %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+ ret half %1
+}
+
+define half @test_vfmsh_laneq_f16(half %a, half %b, <8 x half> %c) #0 {
+; CHECK-LABEL: test_vfmsh_laneq_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: fneg s1, s1
+; CHECK-NEXT: fcvt h1, s1
+; CHECK-NEXT: fmla h0, h1, v2.h[7]
+; CHECK-NEXT: ret
+entry:
+ %conv = call float @llvm.experimental.constrained.fpext.f32.f16(half %b, metadata !"fpexcept.strict") #1
+ %fneg = fneg float %conv
+ %0 = call half @llvm.experimental.constrained.fptrunc.f16.f32(float %fneg, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+ %extract = extractelement <8 x half> %c, i64 7
+ %1 = call half @llvm.experimental.constrained.fma.f16(half %0, half %extract, half %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+ ret half %1
+}
+
+attributes #0 = { noinline nounwind strictfp "target-features"="+crc,+fp-armv8,+fullfp16,+lse,+neon,+ras,+rdm,+v8.1a,+v8.2a,+v8a" }
+attributes #1 = { strictfp }
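(For illustration: the fullfp16 lane forms above map to ACLE calls like the following; +fullfp16 must be in the target features, the wrapper name is hypothetical, and strict-FP flags are assumed as before.)

  #include <arm_neon.h>
  // fmls v0.4h, v1.4h, v2.h[3]: a - b * c[3], element-wise on four halves.
  float16x4_t fms_lane(float16x4_t a, float16x4_t b, float16x4_t c) {
    return vfms_lane_f16(a, b, c, 3);
  }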
diff --git a/llvm/test/CodeGen/AArch64/v8.2a-neon-intrinsics-unconstrained.ll b/llvm/test/CodeGen/AArch64/v8.2a-neon-intrinsics-unconstrained.ll
new file mode 100644
index 00000000000000..d5f0e789ec8c38
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/v8.2a-neon-intrinsics-unconstrained.ll
@@ -0,0 +1,265 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s | FileCheck %s
+target triple = "arm64-none-linux-gnu"
+
+define <4 x half> @test_vsqrt_f16(<4 x half> %a) #0 {
+; CHECK-LABEL: test_vsqrt_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fsqrt v0.4h, v0.4h
+; CHECK-NEXT: ret
+entry:
+ %vsqrt.i = call <4 x half> @llvm.sqrt.v4f16(<4 x half> %a)
+ ret <4 x half> %vsqrt.i
+}
+
+define <8 x half> @test_vsqrtq_f16(<8 x half> %a) #0 {
+; CHECK-LABEL: test_vsqrtq_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fsqrt v0.8h, v0.8h
+; CHECK-NEXT: ret
+entry:
+ %vsqrt.i = call <8 x half> @llvm.sqrt.v8f16(<8 x half> %a)
+ ret <8 x half> %vsqrt.i
+}
+
+define <4 x half> @test_vfma_f16(<4 x half> %a, <4 x half> %b, <4 x half> %c) #0 {
+; CHECK-LABEL: test_vfma_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmla v0.4h, v2.4h, v1.4h
+; CHECK-NEXT: ret
+entry:
+ %0 = call <4 x half> @llvm.fma.v4f16(<4 x half> %b, <4 x half> %c, <4 x half> %a)
+ ret <4 x half> %0
+}
+
+define <8 x half> @test_vfmaq_f16(<8 x half> %a, <8 x half> %b, <8 x half> %c) #0 {
+; CHECK-LABEL: test_vfmaq_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmla v0.8h, v2.8h, v1.8h
+; CHECK-NEXT: ret
+entry:
+ %0 = call <8 x half> @llvm.fma.v8f16(<8 x half> %b, <8 x half> %c, <8 x half> %a)
+ ret <8 x half> %0
+}
+
+define <4 x half> @test_vfms_f16(<4 x half> %a, <4 x half> %b, <4 x half> %c) #0 {
+; CHECK-LABEL: test_vfms_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmls v0.4h, v2.4h, v1.4h
+; CHECK-NEXT: ret
+entry:
+ %fneg.i = fneg <4 x half> %b
+ %0 = call <4 x half> @llvm.fma.v4f16(<4 x half> %fneg.i, <4 x half> %c, <4 x half> %a)
+ ret <4 x half> %0
+}
+
+define <8 x half> @test_vfmsq_f16(<8 x half> %a, <8 x half> %b, <8 x half> %c) #0 {
+; CHECK-LABEL: test_vfmsq_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmls v0.8h, v2.8h, v1.8h
+; CHECK-NEXT: ret
+entry:
+ %fneg.i = fneg <8 x half> %b
+ %0 = call <8 x half> @llvm.fma.v8f16(<8 x half> %fneg.i, <8 x half> %c, <8 x half> %a)
+ ret <8 x half> %0
+}
+
+define <4 x half> @test_vfma_lane_f16(<4 x half> %a, <4 x half> %b, <4 x half> %c) #0 {
+; CHECK-LABEL: test_vfma_lane_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT: fmla v0.4h, v1.4h, v2.h[3]
+; CHECK-NEXT: ret
+entry:
+ %lane = shufflevector <4 x half> %c, <4 x half> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+ %fmla2 = call <4 x half> @llvm.fma.v4f16(<4 x half> %b, <4 x half> %lane, <4 x half> %a)
+ ret <4 x half> %fmla2
+}
+
+define <8 x half> @test_vfmaq_lane_f16(<8 x half> %a, <8 x half> %b, <4 x half> %c) #0 {
+; CHECK-LABEL: test_vfmaq_lane_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT: fmla v0.8h, v1.8h, v2.h[3]
+; CHECK-NEXT: ret
+entry:
+ %lane = shufflevector <4 x half> %c, <4 x half> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+ %fmla2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %b, <8 x half> %lane, <8 x half> %a)
+ ret <8 x half> %fmla2
+}
+
+define <4 x half> @test_vfma_laneq_f16(<4 x half> %a, <4 x half> %b, <8 x half> %c) #0 {
+; CHECK-LABEL: test_vfma_laneq_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmla v0.4h, v1.4h, v2.h[7]
+; CHECK-NEXT: ret
+entry:
+ %lane = shufflevector <8 x half> %c, <8 x half> poison, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+ %0 = call <4 x half> @llvm.fma.v4f16(<4 x half> %lane, <4 x half> %b, <4 x half> %a)
+ ret <4 x half> %0
+}
+
+define <8 x half> @test_vfmaq_laneq_f16(<8 x half> %a, <8 x half> %b, <8 x half> %c) #0 {
+; CHECK-LABEL: test_vfmaq_laneq_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmla v0.8h, v1.8h, v2.h[7]
+; CHECK-NEXT: ret
+entry:
+ %lane = shufflevector <8 x half> %c, <8 x half> poison, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+ %0 = call <8 x half> @llvm.fma.v8f16(<8 x half> %lane, <8 x half> %b, <8 x half> %a)
+ ret <8 x half> %0
+}
+
+define <4 x half> @test_vfma_n_f16(<4 x half> %a, <4 x half> %b, half %c) #0 {
+; CHECK-LABEL: test_vfma_n_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $h2 killed $h2 def $q2
+; CHECK-NEXT: fmla v0.4h, v1.4h, v2.h[0]
+; CHECK-NEXT: ret
+entry:
+ %vecinit = insertelement <4 x half> poison, half %c, i64 0
+ %vecinit3 = shufflevector <4 x half> %vecinit, <4 x half> poison, <4 x i32> zeroinitializer
+ %0 = call <4 x half> @llvm.fma.v4f16(<4 x half> %b, <4 x half> %vecinit3, <4 x half> %a)
+ ret <4 x half> %0
+}
+
+define <8 x half> @test_vfmaq_n_f16(<8 x half> %a, <8 x half> %b, half %c) #0 {
+; CHECK-LABEL: test_vfmaq_n_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $h2 killed $h2 def $q2
+; CHECK-NEXT: fmla v0.8h, v1.8h, v2.h[0]
+; CHECK-NEXT: ret
+entry:
+ %vecinit = insertelement <8 x half> poison, half %c, i64 0
+ %vecinit7 = shufflevector <8 x half> %vecinit, <8 x half> poison, <8 x i32> zeroinitializer
+ %0 = call <8 x half> @llvm.fma.v8f16(<8 x half> %b, <8 x half> %vecinit7, <8 x half> %a)
+ ret <8 x half> %0
+}
+
+define half @test_vfmah_lane_f16(half %a, half %b, <4 x half> %c) #0 {
+; CHECK-LABEL: test_vfmah_lane_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT: fmla h0, h1, v2.h[3]
+; CHECK-NEXT: ret
+entry:
+ %extract = extractelement <4 x half> %c, i64 3
+ %0 = call half @llvm.fma.f16(half %b, half %extract, half %a)
+ ret half %0
+}
+
+define half @test_vfmah_laneq_f16(half %a, half %b, <8 x half> %c) #0 {
+; CHECK-LABEL: test_vfmah_laneq_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmla h0, h1, v2.h[7]
+; CHECK-NEXT: ret
+entry:
+ %extract = extractelement <8 x half> %c, i64 7
+ %0 = call half @llvm.fma.f16(half %b, half %extract, half %a)
+ ret half %0
+}
+
+define <4 x half> @test_vfms_lane_f16(<4 x half> %a, <4 x half> %b, <4 x half> %c) #0 {
+; CHECK-LABEL: test_vfms_lane_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT: fmls v0.4h, v1.4h, v2.h[3]
+; CHECK-NEXT: ret
+entry:
+ %fneg = fneg <4 x half> %b
+ %lane = shufflevector <4 x half> %c, <4 x half> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+ %fmla2 = call <4 x half> @llvm.fma.v4f16(<4 x half> %fneg, <4 x half> %lane, <4 x half> %a)
+ ret <4 x half> %fmla2
+}
+
+define <8 x half> @test_vfmsq_lane_f16(<8 x half> %a, <8 x half> %b, <4 x half> %c) #0 {
+; CHECK-LABEL: test_vfmsq_lane_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT: fmls v0.8h, v1.8h, v2.h[3]
+; CHECK-NEXT: ret
+entry:
+ %fneg = fneg <8 x half> %b
+ %lane = shufflevector <4 x half> %c, <4 x half> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+ %fmla2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %fneg, <8 x half> %lane, <8 x half> %a)
+ ret <8 x half> %fmla2
+}
+
+define <4 x half> @test_vfms_laneq_f16(<4 x half> %a, <4 x half> %b, <8 x half> %c) #0 {
+; CHECK-LABEL: test_vfms_laneq_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmls v0.4h, v1.4h, v2.h[7]
+; CHECK-NEXT: ret
+entry:
+ %fneg = fneg <4 x half> %b
+ %lane = shufflevector <8 x half> %c, <8 x half> poison, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+ %0 = call <4 x half> @llvm.fma.v4f16(<4 x half> %lane, <4 x half> %fneg, <4 x half> %a)
+ ret <4 x half> %0
+}
+
+define <8 x half> @test_vfmsq_laneq_f16(<8 x half> %a, <8 x half> %b, <8 x half> %c) #0 {
+; CHECK-LABEL: test_vfmsq_laneq_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmls v0.8h, v1.8h, v2.h[7]
+; CHECK-NEXT: ret
+entry:
+ %fneg = fneg <8 x half> %b
+ %lane = shufflevector <8 x half> %c, <8 x half> poison, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+ %0 = call <8 x half> @llvm.fma.v8f16(<8 x half> %lane, <8 x half> %fneg, <8 x half> %a)
+ ret <8 x half> %0
+}
+
+define <4 x half> @test_vfms_n_f16(<4 x half> %a, <4 x half> %b, half %c) #0 {
+; CHECK-LABEL: test_vfms_n_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $h2 killed $h2 def $q2
+; CHECK-NEXT: fmls v0.4h, v1.4h, v2.h[0]
+; CHECK-NEXT: ret
+entry:
+ %fneg = fneg <4 x half> %b
+ %vecinit = insertelement <4 x half> poison, half %c, i64 0
+ %vecinit3 = shufflevector <4 x half> %vecinit, <4 x half> poison, <4 x i32> zeroinitializer
+ %0 = call <4 x half> @llvm.fma.v4f16(<4 x half> %fneg, <4 x half> %vecinit3, <4 x half> %a)
+ ret <4 x half> %0
+}
+
+define <8 x half> @test_vfmsq_n_f16(<8 x half> %a, <8 x half> %b, half %c) #0 {
+; CHECK-LABEL: test_vfmsq_n_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $h2 killed $h2 def $q2
+; CHECK-NEXT: fmls v0.8h, v1.8h, v2.h[0]
+; CHECK-NEXT: ret
+entry:
+ %fneg = fneg <8 x half> %b
+ %vecinit = insertelement <8 x half> poison, half %c, i64 0
+ %vecinit7 = shufflevector <8 x half> %vecinit, <8 x half> poison, <8 x i32> zeroinitializer
+ %0 = call <8 x half> @llvm.fma.v8f16(<8 x half> %fneg, <8 x half> %vecinit7, <8 x half> %a)
+ ret <8 x half> %0
+}
+
+define half @test_vfmsh_lane_f16(half %a, half %b, <4 x half> %c) #0 {
+; CHECK-LABEL: test_vfmsh_lane_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT: fmls h0, h1, v2.h[3]
+; CHECK-NEXT: ret
+entry:
+ %0 = fneg half %b
+ %extract = extractelement <4 x half> %c, i64 3
+ %1 = call half @llvm.fma.f16(half %0, half %extract, half %a)
+ ret half %1
+}
+
+define half @test_vfmsh_laneq_f16(half %a, half %b, <8 x half> %c) #0 {
+; CHECK-LABEL: test_vfmsh_laneq_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmls h0, h1, v2.h[7]
+; CHECK-NEXT: ret
+entry:
+ %0 = fneg half %b
+ %extract = extractelement <8 x half> %c, i64 7
+ %1 = call half @llvm.fma.f16(half %0, half %extract, half %a)
+ ret half %1
+}
+
+attributes #0 = { noinline nounwind "no-trapping-math"="true" "target-features"="+crc,+fp-armv8,+fullfp16,+lse,+neon,+ras,+rdm,+v8.1a,+v8.2a,+v8a" }
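(One detail worth noting between the two v8.2a files: in the default environment vfmsh_lane_f16 negates the half directly and selects fmls, while the constrained file above performs the negation through f32 (fcvt/fneg/fcvt) and selects fmla. A hypothetical C++ reproducer:)

  #include <arm_neon.h>
  float16_t fmsh_lane(float16_t a, float16_t b, float16x4_t c) {
    return vfmsh_lane_f16(a, b, c, 3); // fmls h0, h1, v2.h[3] here
  }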
From 27c8a1e77e90645013be2b1c93f3d6eb4cd2e11f Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov at arm.com>
Date: Tue, 31 Dec 2024 14:57:32 +0000
Subject: [PATCH 03/11] Generate explicit bitcasts in NeonEmitter
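OP_REINT in arm_neon.td now expands to an explicit bitcast instead of a
generic cast, the zero-comparison prototypes are adjusted to match,
NeonTypeFlags gains an isFloatingPoint() query, and
EmitAArch64CompareBuiltinExpr takes the single predicate its caller has
already selected rather than an FP/integer pair resolved by inspecting
previously generated code. For illustration, call sites change from

  EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OEQ,
                                ICmpInst::ICMP_EQ, "vceqz");

to

  EmitAArch64CompareBuiltinExpr(
      Ops[0], Ty, Floating ? ICmpInst::FCMP_OEQ : ICmpInst::ICMP_EQ, "vceqz");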
---
clang/include/clang/Basic/TargetBuiltins.h | 4 +
clang/include/clang/Basic/arm_neon.td | 22 +-
clang/lib/CodeGen/CGBuiltin.cpp | 102 +-
clang/lib/CodeGen/CodeGenFunction.h | 8 +-
.../CodeGen/AArch64/bf16-dotprod-intrinsics.c | 16 +-
.../AArch64/bf16-reinterpret-intrinsics.c | 216 +--
clang/test/CodeGen/AArch64/neon-intrinsics.c | 1328 ++++++++---------
clang/test/CodeGen/AArch64/neon-misc.c | 8 +-
clang/test/CodeGen/AArch64/poly128.c | 104 +-
.../AArch64/v8.2a-neon-intrinsics-generic.c | 16 +-
.../CodeGen/arm-bf16-dotprod-intrinsics.c | 16 +-
clang/test/CodeGen/arm_neon_intrinsics.c | 116 +-
clang/utils/TableGen/NeonEmitter.cpp | 24 +-
13 files changed, 1018 insertions(+), 962 deletions(-)
diff --git a/clang/include/clang/Basic/TargetBuiltins.h b/clang/include/clang/Basic/TargetBuiltins.h
index 914be3691ee812..47870bd1678c17 100644
--- a/clang/include/clang/Basic/TargetBuiltins.h
+++ b/clang/include/clang/Basic/TargetBuiltins.h
@@ -214,6 +214,10 @@ namespace clang {
EltType ET = getEltType();
return ET == Poly8 || ET == Poly16 || ET == Poly64;
}
+ bool isFloatingPoint() const {
+ EltType ET = getEltType();
+ return ET == Float16 || ET == Float32 || ET == Float64 || ET == BFloat16;
+ }
bool isUnsigned() const { return (Flags & UnsignedFlag) != 0; }
bool isQuad() const { return (Flags & QuadFlag) != 0; }
unsigned getEltSizeInBits() const {
diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td
index ef89fa4358dfeb..19cf6f1dbfb692 100644
--- a/clang/include/clang/Basic/arm_neon.td
+++ b/clang/include/clang/Basic/arm_neon.td
@@ -129,7 +129,7 @@ def OP_VCVT_NA_HI_F32 : Op<(call "vcombine", $p0, (call "vcvt_f32_f64", $p1))>;
def OP_VCVT_EX_HI_F32 : Op<(call "vcvt_f32_f16", (call "vget_high", $p0))>;
def OP_VCVT_EX_HI_F64 : Op<(call "vcvt_f64_f32", (call "vget_high", $p0))>;
def OP_VCVTX_HI : Op<(call "vcombine", $p0, (call "vcvtx_f32", $p1))>;
-def OP_REINT : Op<(cast "R", $p0)>;
+def OP_REINT : Op<(bitcast "R", $p0)>;
def OP_ADDHNHi : Op<(call "vcombine", $p0, (call "vaddhn", $p1, $p2))>;
def OP_RADDHNHi : Op<(call "vcombine", $p0, (call "vraddhn", $p1, $p2))>;
def OP_SUBHNHi : Op<(call "vcombine", $p0, (call "vsubhn", $p1, $p2))>;
@@ -929,12 +929,12 @@ def CFMLE : SOpInst<"vcle", "U..", "lUldQdQlQUl", OP_LE>;
def CFMGT : SOpInst<"vcgt", "U..", "lUldQdQlQUl", OP_GT>;
def CFMLT : SOpInst<"vclt", "U..", "lUldQdQlQUl", OP_LT>;
-def CMEQ : SInst<"vceqz", "U.",
+def CMEQ : SInst<"vceqz", "U(.!)",
"csilfUcUsUiUlPcPlQcQsQiQlQfQUcQUsQUiQUlQPcdQdQPl">;
-def CMGE : SInst<"vcgez", "U.", "csilfdQcQsQiQlQfQd">;
-def CMLE : SInst<"vclez", "U.", "csilfdQcQsQiQlQfQd">;
-def CMGT : SInst<"vcgtz", "U.", "csilfdQcQsQiQlQfQd">;
-def CMLT : SInst<"vcltz", "U.", "csilfdQcQsQiQlQfQd">;
+def CMGE : SInst<"vcgez", "U(.!)", "csilfdQcQsQiQlQfQd">;
+def CMLE : SInst<"vclez", "U(.!)", "csilfdQcQsQiQlQfQd">;
+def CMGT : SInst<"vcgtz", "U(.!)", "csilfdQcQsQiQlQfQd">;
+def CMLT : SInst<"vcltz", "U(.!)", "csilfdQcQsQiQlQfQd">;
////////////////////////////////////////////////////////////////////////////////
// Max/Min Integer
@@ -1672,11 +1672,11 @@ let TargetGuard = "fullfp16,neon" in {
// ARMv8.2-A FP16 one-operand vector intrinsics.
// Comparison
- def CMEQH : SInst<"vceqz", "U.", "hQh">;
- def CMGEH : SInst<"vcgez", "U.", "hQh">;
- def CMGTH : SInst<"vcgtz", "U.", "hQh">;
- def CMLEH : SInst<"vclez", "U.", "hQh">;
- def CMLTH : SInst<"vcltz", "U.", "hQh">;
+ def CMEQH : SInst<"vceqz", "U(.!)", "hQh">;
+ def CMGEH : SInst<"vcgez", "U(.!)", "hQh">;
+ def CMGTH : SInst<"vcgtz", "U(.!)", "hQh">;
+ def CMLEH : SInst<"vclez", "U(.!)", "hQh">;
+ def CMLTH : SInst<"vcltz", "U(.!)", "hQh">;
// Vector conversion
def VCVT_F16 : SInst<"vcvt_f16", "F(.!)", "sUsQsQUs">;
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index c419fb0cc055e0..b3c76986511444 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -8158,8 +8158,9 @@ Value *CodeGenFunction::EmitCommonNeonBuiltinExpr(
// Determine the type of this overloaded NEON intrinsic.
NeonTypeFlags Type(NeonTypeConst->getZExtValue());
- bool Usgn = Type.isUnsigned();
- bool Quad = Type.isQuad();
+ const bool Usgn = Type.isUnsigned();
+ const bool Quad = Type.isQuad();
+ const bool Floating = Type.isFloatingPoint();
const bool HasLegalHalfType = getTarget().hasLegalHalfType();
const bool AllowBFloatArgsAndRet =
getTargetHooks().getABIInfo().allowBFloatArgsAndRet();
@@ -8260,24 +8261,28 @@ Value *CodeGenFunction::EmitCommonNeonBuiltinExpr(
}
case NEON::BI__builtin_neon_vceqz_v:
case NEON::BI__builtin_neon_vceqzq_v:
- return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OEQ,
- ICmpInst::ICMP_EQ, "vceqz");
+ return EmitAArch64CompareBuiltinExpr(
+ Ops[0], Ty, Floating ? ICmpInst::FCMP_OEQ : ICmpInst::ICMP_EQ, "vceqz");
case NEON::BI__builtin_neon_vcgez_v:
case NEON::BI__builtin_neon_vcgezq_v:
- return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OGE,
- ICmpInst::ICMP_SGE, "vcgez");
+ return EmitAArch64CompareBuiltinExpr(
+ Ops[0], Ty, Floating ? ICmpInst::FCMP_OGE : ICmpInst::ICMP_SGE,
+ "vcgez");
case NEON::BI__builtin_neon_vclez_v:
case NEON::BI__builtin_neon_vclezq_v:
- return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OLE,
- ICmpInst::ICMP_SLE, "vclez");
+ return EmitAArch64CompareBuiltinExpr(
+ Ops[0], Ty, Floating ? ICmpInst::FCMP_OLE : ICmpInst::ICMP_SLE,
+ "vclez");
case NEON::BI__builtin_neon_vcgtz_v:
case NEON::BI__builtin_neon_vcgtzq_v:
- return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OGT,
- ICmpInst::ICMP_SGT, "vcgtz");
+ return EmitAArch64CompareBuiltinExpr(
+ Ops[0], Ty, Floating ? ICmpInst::FCMP_OGT : ICmpInst::ICMP_SGT,
+ "vcgtz");
case NEON::BI__builtin_neon_vcltz_v:
case NEON::BI__builtin_neon_vcltzq_v:
- return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OLT,
- ICmpInst::ICMP_SLT, "vcltz");
+ return EmitAArch64CompareBuiltinExpr(
+ Ops[0], Ty, Floating ? ICmpInst::FCMP_OLT : ICmpInst::ICMP_SLT,
+ "vcltz");
case NEON::BI__builtin_neon_vclz_v:
case NEON::BI__builtin_neon_vclzq_v:
// We generate target-independent intrinsic, which needs a second argument
@@ -8840,28 +8845,32 @@ Value *CodeGenFunction::EmitCommonNeonBuiltinExpr(
return Builder.CreateBitCast(Result, ResultType, NameHint);
}
-Value *CodeGenFunction::EmitAArch64CompareBuiltinExpr(
- Value *Op, llvm::Type *Ty, const CmpInst::Predicate Fp,
- const CmpInst::Predicate Ip, const Twine &Name) {
- llvm::Type *OTy = Op->getType();
-
- // FIXME: this is utterly horrific. We should not be looking at previous
- // codegen context to find out what needs doing. Unfortunately TableGen
- // currently gives us exactly the same calls for vceqz_f32 and vceqz_s32
- // (etc).
- if (BitCastInst *BI = dyn_cast<BitCastInst>(Op))
- OTy = BI->getOperand(0)->getType();
-
- Op = Builder.CreateBitCast(Op, OTy);
- if (OTy->getScalarType()->isFloatingPointTy()) {
- if (Fp == CmpInst::FCMP_OEQ)
- Op = Builder.CreateFCmp(Fp, Op, Constant::getNullValue(OTy));
+Value *
+CodeGenFunction::EmitAArch64CompareBuiltinExpr(Value *Op, llvm::Type *Ty,
+ const CmpInst::Predicate Pred,
+ const Twine &Name) {
+
+ if (isa<FixedVectorType>(Ty)) {
+ // Vector types are cast to i8 vectors. Recover original type.
+ Op = Builder.CreateBitCast(Op, Ty);
+ }
+
+ if (CmpInst::isFPPredicate(Pred)) {
+ if (Pred == CmpInst::FCMP_OEQ)
+ Op = Builder.CreateFCmp(Pred, Op, Constant::getNullValue(Op->getType()));
else
- Op = Builder.CreateFCmpS(Fp, Op, Constant::getNullValue(OTy));
+ Op = Builder.CreateFCmpS(Pred, Op, Constant::getNullValue(Op->getType()));
} else {
- Op = Builder.CreateICmp(Ip, Op, Constant::getNullValue(OTy));
+ Op = Builder.CreateICmp(Pred, Op, Constant::getNullValue(Op->getType()));
}
- return Builder.CreateSExt(Op, Ty, Name);
+
+ llvm::Type *ResTy = Ty;
+ if (auto *VTy = dyn_cast<FixedVectorType>(Ty))
+ ResTy = FixedVectorType::get(
+ IntegerType::get(getLLVMContext(), VTy->getScalarSizeInBits()),
+ VTy->getNumElements());
+
+ return Builder.CreateSExt(Op, ResTy, Name);
}
static Value *packTBLDVectorList(CodeGenFunction &CGF, ArrayRef<Value *> Ops,
@@ -12350,45 +12359,66 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
return Builder.CreateFAdd(Op0, Op1, "vpaddd");
}
case NEON::BI__builtin_neon_vceqzd_s64:
+ Ops.push_back(EmitScalarExpr(E->getArg(0)));
+ return EmitAArch64CompareBuiltinExpr(
+ Ops[0], ConvertType(E->getCallReturnType(getContext())),
+ ICmpInst::ICMP_EQ, "vceqz");
case NEON::BI__builtin_neon_vceqzd_f64:
case NEON::BI__builtin_neon_vceqzs_f32:
case NEON::BI__builtin_neon_vceqzh_f16:
Ops.push_back(EmitScalarExpr(E->getArg(0)));
return EmitAArch64CompareBuiltinExpr(
Ops[0], ConvertType(E->getCallReturnType(getContext())),
- ICmpInst::FCMP_OEQ, ICmpInst::ICMP_EQ, "vceqz");
+ ICmpInst::FCMP_OEQ, "vceqz");
case NEON::BI__builtin_neon_vcgezd_s64:
+ Ops.push_back(EmitScalarExpr(E->getArg(0)));
+ return EmitAArch64CompareBuiltinExpr(
+ Ops[0], ConvertType(E->getCallReturnType(getContext())),
+ ICmpInst::ICMP_SGE, "vcgez");
case NEON::BI__builtin_neon_vcgezd_f64:
case NEON::BI__builtin_neon_vcgezs_f32:
case NEON::BI__builtin_neon_vcgezh_f16:
Ops.push_back(EmitScalarExpr(E->getArg(0)));
return EmitAArch64CompareBuiltinExpr(
Ops[0], ConvertType(E->getCallReturnType(getContext())),
- ICmpInst::FCMP_OGE, ICmpInst::ICMP_SGE, "vcgez");
+ ICmpInst::FCMP_OGE, "vcgez");
case NEON::BI__builtin_neon_vclezd_s64:
+ Ops.push_back(EmitScalarExpr(E->getArg(0)));
+ return EmitAArch64CompareBuiltinExpr(
+ Ops[0], ConvertType(E->getCallReturnType(getContext())),
+ ICmpInst::ICMP_SLE, "vclez");
case NEON::BI__builtin_neon_vclezd_f64:
case NEON::BI__builtin_neon_vclezs_f32:
case NEON::BI__builtin_neon_vclezh_f16:
Ops.push_back(EmitScalarExpr(E->getArg(0)));
return EmitAArch64CompareBuiltinExpr(
Ops[0], ConvertType(E->getCallReturnType(getContext())),
- ICmpInst::FCMP_OLE, ICmpInst::ICMP_SLE, "vclez");
+ ICmpInst::FCMP_OLE, "vclez");
case NEON::BI__builtin_neon_vcgtzd_s64:
+ Ops.push_back(EmitScalarExpr(E->getArg(0)));
+ return EmitAArch64CompareBuiltinExpr(
+ Ops[0], ConvertType(E->getCallReturnType(getContext())),
+ ICmpInst::ICMP_SGT, "vcgtz");
case NEON::BI__builtin_neon_vcgtzd_f64:
case NEON::BI__builtin_neon_vcgtzs_f32:
case NEON::BI__builtin_neon_vcgtzh_f16:
Ops.push_back(EmitScalarExpr(E->getArg(0)));
return EmitAArch64CompareBuiltinExpr(
Ops[0], ConvertType(E->getCallReturnType(getContext())),
- ICmpInst::FCMP_OGT, ICmpInst::ICMP_SGT, "vcgtz");
+ ICmpInst::FCMP_OGT, "vcgtz");
case NEON::BI__builtin_neon_vcltzd_s64:
+ Ops.push_back(EmitScalarExpr(E->getArg(0)));
+ return EmitAArch64CompareBuiltinExpr(
+ Ops[0], ConvertType(E->getCallReturnType(getContext())),
+ ICmpInst::ICMP_SLT, "vcltz");
+
case NEON::BI__builtin_neon_vcltzd_f64:
case NEON::BI__builtin_neon_vcltzs_f32:
case NEON::BI__builtin_neon_vcltzh_f16:
Ops.push_back(EmitScalarExpr(E->getArg(0)));
return EmitAArch64CompareBuiltinExpr(
Ops[0], ConvertType(E->getCallReturnType(getContext())),
- ICmpInst::FCMP_OLT, ICmpInst::ICMP_SLT, "vcltz");
+ ICmpInst::FCMP_OLT, "vcltz");
case NEON::BI__builtin_neon_vceqzd_u64: {
Ops.push_back(EmitScalarExpr(E->getArg(0)));
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index 1a5c42f8f974d0..d1bec166a435e2 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -4628,10 +4628,10 @@ class CodeGenFunction : public CodeGenTypeCache {
llvm::Value *EmitTargetBuiltinExpr(unsigned BuiltinID, const CallExpr *E,
ReturnValueSlot ReturnValue);
- llvm::Value *EmitAArch64CompareBuiltinExpr(llvm::Value *Op, llvm::Type *Ty,
- const llvm::CmpInst::Predicate Fp,
- const llvm::CmpInst::Predicate Ip,
- const llvm::Twine &Name = "");
+ llvm::Value *
+ EmitAArch64CompareBuiltinExpr(llvm::Value *Op, llvm::Type *Ty,
+ const llvm::CmpInst::Predicate Pred,
+ const llvm::Twine &Name = "");
llvm::Value *EmitARMBuiltinExpr(unsigned BuiltinID, const CallExpr *E,
ReturnValueSlot ReturnValue,
llvm::Triple::ArchType Arch);
diff --git a/clang/test/CodeGen/AArch64/bf16-dotprod-intrinsics.c b/clang/test/CodeGen/AArch64/bf16-dotprod-intrinsics.c
index caa803ee794603..6da2762782acb9 100644
--- a/clang/test/CodeGen/AArch64/bf16-dotprod-intrinsics.c
+++ b/clang/test/CodeGen/AArch64/bf16-dotprod-intrinsics.c
@@ -28,8 +28,8 @@ float32x4_t test_vbfdotq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b){
// CHECK-NEXT: entry:
// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x bfloat> [[B:%.*]] to <2 x float>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[DOTCAST]], <2 x float> poison, <2 x i32> zeroinitializer
-// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <2 x float> [[LANE]] to <4 x bfloat>
-// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> [[R:%.*]], <4 x bfloat> [[A:%.*]], <4 x bfloat> [[DOTCAST1]])
+// CHECK-NEXT: [[DOTCAST2:%.*]] = bitcast <2 x float> [[LANE]] to <4 x bfloat>
+// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> [[R:%.*]], <4 x bfloat> [[A:%.*]], <4 x bfloat> [[DOTCAST2]])
// CHECK-NEXT: ret <2 x float> [[VBFDOT3_I]]
//
float32x2_t test_vbfdot_lane_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b){
@@ -40,8 +40,8 @@ float32x2_t test_vbfdot_lane_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b){
// CHECK-NEXT: entry:
// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <4 x float>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[DOTCAST]], <4 x float> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <4 x float> [[LANE]] to <8 x bfloat>
-// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[DOTCAST1]])
+// CHECK-NEXT: [[DOTCAST2:%.*]] = bitcast <4 x float> [[LANE]] to <8 x bfloat>
+// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[DOTCAST2]])
// CHECK-NEXT: ret <4 x float> [[VBFDOT3_I]]
//
float32x4_t test_vbfdotq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
@@ -52,8 +52,8 @@ float32x4_t test_vbfdotq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b
// CHECK-NEXT: entry:
// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <4 x float>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[DOTCAST]], <4 x float> poison, <2 x i32> <i32 3, i32 3>
-// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <2 x float> [[LANE]] to <4 x bfloat>
-// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> [[R:%.*]], <4 x bfloat> [[A:%.*]], <4 x bfloat> [[DOTCAST1]])
+// CHECK-NEXT: [[DOTCAST2:%.*]] = bitcast <2 x float> [[LANE]] to <4 x bfloat>
+// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> [[R:%.*]], <4 x bfloat> [[A:%.*]], <4 x bfloat> [[DOTCAST2]])
// CHECK-NEXT: ret <2 x float> [[VBFDOT3_I]]
//
float32x2_t test_vbfdot_laneq_f32(float32x2_t r, bfloat16x4_t a, bfloat16x8_t b) {
@@ -64,8 +64,8 @@ float32x2_t test_vbfdot_laneq_f32(float32x2_t r, bfloat16x4_t a, bfloat16x8_t b)
// CHECK-NEXT: entry:
// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x bfloat> [[B:%.*]] to <2 x float>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[DOTCAST]], <2 x float> poison, <4 x i32> zeroinitializer
-// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <4 x float> [[LANE]] to <8 x bfloat>
-// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[DOTCAST1]])
+// CHECK-NEXT: [[DOTCAST2:%.*]] = bitcast <4 x float> [[LANE]] to <8 x bfloat>
+// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[DOTCAST2]])
// CHECK-NEXT: ret <4 x float> [[VBFDOT3_I]]
//
float32x4_t test_vbfdotq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) {
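(Note on the bf16 dot-product churn above: only the SSA value names change, DOTCAST1 becoming DOTCAST2, as NeonEmitter now emits the lane-splat bitcasts itself; the generated instructions are identical. A hypothetical source line for the test above, requiring +bf16; the wrapper name is invented:)

  #include <arm_neon.h>
  float32x4_t dot_lane(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) {
    return vbfdotq_lane_f32(r, a, b, 0);
  }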
diff --git a/clang/test/CodeGen/AArch64/bf16-reinterpret-intrinsics.c b/clang/test/CodeGen/AArch64/bf16-reinterpret-intrinsics.c
index 62d2ca48f3ffea..1851e6fadf9a86 100644
--- a/clang/test/CodeGen/AArch64/bf16-reinterpret-intrinsics.c
+++ b/clang/test/CodeGen/AArch64/bf16-reinterpret-intrinsics.c
@@ -11,378 +11,378 @@
// CHECK-LABEL: define dso_local <4 x bfloat> @test_vreinterpret_bf16_s8(
// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x bfloat>
-// CHECK-NEXT: ret <4 x bfloat> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x i8> [[A]] to <4 x bfloat>
+// CHECK-NEXT: ret <4 x bfloat> [[DOTCAST]]
//
bfloat16x4_t test_vreinterpret_bf16_s8(int8x8_t a) { return vreinterpret_bf16_s8(a); }
// CHECK-LABEL: define dso_local <4 x bfloat> @test_vreinterpret_bf16_s16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <4 x bfloat>
-// CHECK-NEXT: ret <4 x bfloat> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x i16> [[A]] to <4 x bfloat>
+// CHECK-NEXT: ret <4 x bfloat> [[DOTCAST]]
//
bfloat16x4_t test_vreinterpret_bf16_s16(int16x4_t a) { return vreinterpret_bf16_s16(a); }
// CHECK-LABEL: define dso_local <4 x bfloat> @test_vreinterpret_bf16_s32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <4 x bfloat>
-// CHECK-NEXT: ret <4 x bfloat> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x i32> [[A]] to <4 x bfloat>
+// CHECK-NEXT: ret <4 x bfloat> [[DOTCAST]]
//
bfloat16x4_t test_vreinterpret_bf16_s32(int32x2_t a) { return vreinterpret_bf16_s32(a); }
// CHECK-LABEL: define dso_local <4 x bfloat> @test_vreinterpret_bf16_f32(
// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <4 x bfloat>
-// CHECK-NEXT: ret <4 x bfloat> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x float> [[A]] to <4 x bfloat>
+// CHECK-NEXT: ret <4 x bfloat> [[DOTCAST]]
//
bfloat16x4_t test_vreinterpret_bf16_f32(float32x2_t a) { return vreinterpret_bf16_f32(a); }
// CHECK-LABEL: define dso_local <4 x bfloat> @test_vreinterpret_bf16_u8(
// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x bfloat>
-// CHECK-NEXT: ret <4 x bfloat> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x i8> [[A]] to <4 x bfloat>
+// CHECK-NEXT: ret <4 x bfloat> [[DOTCAST]]
//
bfloat16x4_t test_vreinterpret_bf16_u8(uint8x8_t a) { return vreinterpret_bf16_u8(a); }
// CHECK-LABEL: define dso_local <4 x bfloat> @test_vreinterpret_bf16_u16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <4 x bfloat>
-// CHECK-NEXT: ret <4 x bfloat> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x i16> [[A]] to <4 x bfloat>
+// CHECK-NEXT: ret <4 x bfloat> [[DOTCAST]]
//
bfloat16x4_t test_vreinterpret_bf16_u16(uint16x4_t a) { return vreinterpret_bf16_u16(a); }
// CHECK-LABEL: define dso_local <4 x bfloat> @test_vreinterpret_bf16_u32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <4 x bfloat>
-// CHECK-NEXT: ret <4 x bfloat> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x i32> [[A]] to <4 x bfloat>
+// CHECK-NEXT: ret <4 x bfloat> [[DOTCAST]]
//
bfloat16x4_t test_vreinterpret_bf16_u32(uint32x2_t a) { return vreinterpret_bf16_u32(a); }
// CHECK-LABEL: define dso_local <4 x bfloat> @test_vreinterpret_bf16_p8(
// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x bfloat>
-// CHECK-NEXT: ret <4 x bfloat> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x i8> [[A]] to <4 x bfloat>
+// CHECK-NEXT: ret <4 x bfloat> [[DOTCAST]]
//
bfloat16x4_t test_vreinterpret_bf16_p8(poly8x8_t a) { return vreinterpret_bf16_p8(a); }
// CHECK-LABEL: define dso_local <4 x bfloat> @test_vreinterpret_bf16_p16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <4 x bfloat>
-// CHECK-NEXT: ret <4 x bfloat> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x i16> [[A]] to <4 x bfloat>
+// CHECK-NEXT: ret <4 x bfloat> [[DOTCAST]]
//
bfloat16x4_t test_vreinterpret_bf16_p16(poly16x4_t a) { return vreinterpret_bf16_p16(a); }
// CHECK-LABEL: define dso_local <4 x bfloat> @test_vreinterpret_bf16_u64(
// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x bfloat>
-// CHECK-NEXT: ret <4 x bfloat> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <1 x i64> [[A]] to <4 x bfloat>
+// CHECK-NEXT: ret <4 x bfloat> [[DOTCAST]]
//
bfloat16x4_t test_vreinterpret_bf16_u64(uint64x1_t a) { return vreinterpret_bf16_u64(a); }
// CHECK-LABEL: define dso_local <4 x bfloat> @test_vreinterpret_bf16_s64(
// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x bfloat>
-// CHECK-NEXT: ret <4 x bfloat> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <1 x i64> [[A]] to <4 x bfloat>
+// CHECK-NEXT: ret <4 x bfloat> [[DOTCAST]]
//
bfloat16x4_t test_vreinterpret_bf16_s64(int64x1_t a) { return vreinterpret_bf16_s64(a); }
// CHECK-LABEL: define dso_local <8 x bfloat> @test_vreinterpretq_bf16_s8(
// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x bfloat>
-// CHECK-NEXT: ret <8 x bfloat> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <16 x i8> [[A]] to <8 x bfloat>
+// CHECK-NEXT: ret <8 x bfloat> [[DOTCAST]]
//
bfloat16x8_t test_vreinterpretq_bf16_s8(int8x16_t a) { return vreinterpretq_bf16_s8(a); }
// CHECK-LABEL: define dso_local <8 x bfloat> @test_vreinterpretq_bf16_s16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <8 x bfloat>
-// CHECK-NEXT: ret <8 x bfloat> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x i16> [[A]] to <8 x bfloat>
+// CHECK-NEXT: ret <8 x bfloat> [[DOTCAST]]
//
bfloat16x8_t test_vreinterpretq_bf16_s16(int16x8_t a) { return vreinterpretq_bf16_s16(a); }
// CHECK-LABEL: define dso_local <8 x bfloat> @test_vreinterpretq_bf16_s32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x bfloat>
-// CHECK-NEXT: ret <8 x bfloat> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x i32> [[A]] to <8 x bfloat>
+// CHECK-NEXT: ret <8 x bfloat> [[DOTCAST]]
//
bfloat16x8_t test_vreinterpretq_bf16_s32(int32x4_t a) { return vreinterpretq_bf16_s32(a); }
// CHECK-LABEL: define dso_local <8 x bfloat> @test_vreinterpretq_bf16_f32(
// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <8 x bfloat>
-// CHECK-NEXT: ret <8 x bfloat> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x float> [[A]] to <8 x bfloat>
+// CHECK-NEXT: ret <8 x bfloat> [[DOTCAST]]
//
bfloat16x8_t test_vreinterpretq_bf16_f32(float32x4_t a) { return vreinterpretq_bf16_f32(a); }
// CHECK-LABEL: define dso_local <8 x bfloat> @test_vreinterpretq_bf16_u8(
// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x bfloat>
-// CHECK-NEXT: ret <8 x bfloat> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <16 x i8> [[A]] to <8 x bfloat>
+// CHECK-NEXT: ret <8 x bfloat> [[DOTCAST]]
//
bfloat16x8_t test_vreinterpretq_bf16_u8(uint8x16_t a) { return vreinterpretq_bf16_u8(a); }
// CHECK-LABEL: define dso_local <8 x bfloat> @test_vreinterpretq_bf16_u16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <8 x bfloat>
-// CHECK-NEXT: ret <8 x bfloat> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x i16> [[A]] to <8 x bfloat>
+// CHECK-NEXT: ret <8 x bfloat> [[DOTCAST]]
//
bfloat16x8_t test_vreinterpretq_bf16_u16(uint16x8_t a) { return vreinterpretq_bf16_u16(a); }
// CHECK-LABEL: define dso_local <8 x bfloat> @test_vreinterpretq_bf16_u32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x bfloat>
-// CHECK-NEXT: ret <8 x bfloat> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x i32> [[A]] to <8 x bfloat>
+// CHECK-NEXT: ret <8 x bfloat> [[DOTCAST]]
//
bfloat16x8_t test_vreinterpretq_bf16_u32(uint32x4_t a) { return vreinterpretq_bf16_u32(a); }
// CHECK-LABEL: define dso_local <8 x bfloat> @test_vreinterpretq_bf16_p8(
// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x bfloat>
-// CHECK-NEXT: ret <8 x bfloat> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <16 x i8> [[A]] to <8 x bfloat>
+// CHECK-NEXT: ret <8 x bfloat> [[DOTCAST]]
//
bfloat16x8_t test_vreinterpretq_bf16_p8(poly8x16_t a) { return vreinterpretq_bf16_p8(a); }
// CHECK-LABEL: define dso_local <8 x bfloat> @test_vreinterpretq_bf16_p16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <8 x bfloat>
-// CHECK-NEXT: ret <8 x bfloat> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x i16> [[A]] to <8 x bfloat>
+// CHECK-NEXT: ret <8 x bfloat> [[DOTCAST]]
//
bfloat16x8_t test_vreinterpretq_bf16_p16(poly16x8_t a) { return vreinterpretq_bf16_p16(a); }
// CHECK-LABEL: define dso_local <8 x bfloat> @test_vreinterpretq_bf16_u64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <8 x bfloat>
-// CHECK-NEXT: ret <8 x bfloat> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x i64> [[A]] to <8 x bfloat>
+// CHECK-NEXT: ret <8 x bfloat> [[DOTCAST]]
//
bfloat16x8_t test_vreinterpretq_bf16_u64(uint64x2_t a) { return vreinterpretq_bf16_u64(a); }
// CHECK-LABEL: define dso_local <8 x bfloat> @test_vreinterpretq_bf16_s64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <8 x bfloat>
-// CHECK-NEXT: ret <8 x bfloat> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x i64> [[A]] to <8 x bfloat>
+// CHECK-NEXT: ret <8 x bfloat> [[DOTCAST]]
//
bfloat16x8_t test_vreinterpretq_bf16_s64(int64x2_t a) { return vreinterpretq_bf16_s64(a); }
// CHECK-LABEL: define dso_local <4 x bfloat> @test_vreinterpret_bf16_p64(
// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x bfloat>
-// CHECK-NEXT: ret <4 x bfloat> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <1 x i64> [[A]] to <4 x bfloat>
+// CHECK-NEXT: ret <4 x bfloat> [[DOTCAST]]
//
bfloat16x4_t test_vreinterpret_bf16_p64(poly64x1_t a) { return vreinterpret_bf16_p64(a); }
// CHECK-LABEL: define dso_local <8 x bfloat> @test_vreinterpretq_bf16_p64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <8 x bfloat>
-// CHECK-NEXT: ret <8 x bfloat> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x i64> [[A]] to <8 x bfloat>
+// CHECK-NEXT: ret <8 x bfloat> [[DOTCAST]]
//
bfloat16x8_t test_vreinterpretq_bf16_p64(poly64x2_t a) { return vreinterpretq_bf16_p64(a); }
// CHECK-LABEL: define dso_local <8 x bfloat> @test_vreinterpretq_bf16_p128(
// CHECK-SAME: i128 noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[A]] to <8 x bfloat>
-// CHECK-NEXT: ret <8 x bfloat> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast i128 [[A]] to <8 x bfloat>
+// CHECK-NEXT: ret <8 x bfloat> [[DOTCAST]]
//
bfloat16x8_t test_vreinterpretq_bf16_p128(poly128_t a) { return vreinterpretq_bf16_p128(a); }
// CHECK-LABEL: define dso_local <4 x bfloat> @test_vreinterpret_bf16_f64(
// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to <4 x bfloat>
-// CHECK-NEXT: ret <4 x bfloat> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <1 x double> [[A]] to <4 x bfloat>
+// CHECK-NEXT: ret <4 x bfloat> [[DOTCAST]]
//
bfloat16x4_t test_vreinterpret_bf16_f64(float64x1_t a) { return vreinterpret_bf16_f64(a); }
// CHECK-LABEL: define dso_local <8 x bfloat> @test_vreinterpretq_bf16_f64(
// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <8 x bfloat>
-// CHECK-NEXT: ret <8 x bfloat> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x double> [[A]] to <8 x bfloat>
+// CHECK-NEXT: ret <8 x bfloat> [[DOTCAST]]
//
bfloat16x8_t test_vreinterpretq_bf16_f64(float64x2_t a) { return vreinterpretq_bf16_f64(a); }
// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_s8_bf16(
// CHECK-SAME: <4 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A]] to <8 x i8>
-// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x bfloat> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[DOTCAST]]
//
int8x8_t test_vreinterpret_s8_bf16(bfloat16x4_t a) { return vreinterpret_s8_bf16(a); }
// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_s16_bf16(
// CHECK-SAME: <4 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A]] to <4 x i16>
-// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x bfloat> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[DOTCAST]]
//
int16x4_t test_vreinterpret_s16_bf16(bfloat16x4_t a) { return vreinterpret_s16_bf16(a); }
// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_s32_bf16(
// CHECK-SAME: <4 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A]] to <2 x i32>
-// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x bfloat> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[DOTCAST]]
//
int32x2_t test_vreinterpret_s32_bf16(bfloat16x4_t a) { return vreinterpret_s32_bf16(a); }
// CHECK-LABEL: define dso_local <2 x float> @test_vreinterpret_f32_bf16(
// CHECK-SAME: <4 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A]] to <2 x float>
-// CHECK-NEXT: ret <2 x float> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x bfloat> [[A]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[DOTCAST]]
//
float32x2_t test_vreinterpret_f32_bf16(bfloat16x4_t a) { return vreinterpret_f32_bf16(a); }
// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_u8_bf16(
// CHECK-SAME: <4 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A]] to <8 x i8>
-// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x bfloat> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[DOTCAST]]
//
uint8x8_t test_vreinterpret_u8_bf16(bfloat16x4_t a) { return vreinterpret_u8_bf16(a); }
// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_u16_bf16(
// CHECK-SAME: <4 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A]] to <4 x i16>
-// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x bfloat> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[DOTCAST]]
//
uint16x4_t test_vreinterpret_u16_bf16(bfloat16x4_t a) { return vreinterpret_u16_bf16(a); }
// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_u32_bf16(
// CHECK-SAME: <4 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A]] to <2 x i32>
-// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x bfloat> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[DOTCAST]]
//
uint32x2_t test_vreinterpret_u32_bf16(bfloat16x4_t a) { return vreinterpret_u32_bf16(a); }
// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_p8_bf16(
// CHECK-SAME: <4 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A]] to <8 x i8>
-// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x bfloat> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[DOTCAST]]
//
poly8x8_t test_vreinterpret_p8_bf16(bfloat16x4_t a) { return vreinterpret_p8_bf16(a); }
// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_p16_bf16(
// CHECK-SAME: <4 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A]] to <4 x i16>
-// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x bfloat> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[DOTCAST]]
//
poly16x4_t test_vreinterpret_p16_bf16(bfloat16x4_t a) { return vreinterpret_p16_bf16(a); }
// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_u64_bf16(
// CHECK-SAME: <4 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A]] to <1 x i64>
-// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x bfloat> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[DOTCAST]]
//
uint64x1_t test_vreinterpret_u64_bf16(bfloat16x4_t a) { return vreinterpret_u64_bf16(a); }
// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_s64_bf16(
// CHECK-SAME: <4 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A]] to <1 x i64>
-// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x bfloat> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[DOTCAST]]
//
int64x1_t test_vreinterpret_s64_bf16(bfloat16x4_t a) { return vreinterpret_s64_bf16(a); }
// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_p64_bf16(
// CHECK-SAME: <4 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A]] to <1 x i64>
-// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x bfloat> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[DOTCAST]]
//
poly64x1_t test_vreinterpret_p64_bf16(bfloat16x4_t a) { return vreinterpret_p64_bf16(a); }
// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_s8_bf16(
// CHECK-SAME: <8 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A]] to <16 x i8>
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x bfloat> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[DOTCAST]]
//
int8x16_t test_vreinterpretq_s8_bf16(bfloat16x8_t a) { return vreinterpretq_s8_bf16(a); }
// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_s16_bf16(
// CHECK-SAME: <8 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A]] to <8 x i16>
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x bfloat> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[DOTCAST]]
//
int16x8_t test_vreinterpretq_s16_bf16(bfloat16x8_t a) { return vreinterpretq_s16_bf16(a); }
// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_s32_bf16(
// CHECK-SAME: <8 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A]] to <4 x i32>
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x bfloat> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[DOTCAST]]
//
int32x4_t test_vreinterpretq_s32_bf16(bfloat16x8_t a) { return vreinterpretq_s32_bf16(a); }
// CHECK-LABEL: define dso_local <4 x float> @test_vreinterpretq_f32_bf16(
// CHECK-SAME: <8 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A]] to <4 x float>
-// CHECK-NEXT: ret <4 x float> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x bfloat> [[A]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[DOTCAST]]
//
float32x4_t test_vreinterpretq_f32_bf16(bfloat16x8_t a) { return vreinterpretq_f32_bf16(a); }
// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_u8_bf16(
// CHECK-SAME: <8 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A]] to <16 x i8>
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x bfloat> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[DOTCAST]]
//
uint8x16_t test_vreinterpretq_u8_bf16(bfloat16x8_t a) { return vreinterpretq_u8_bf16(a); }
// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_u16_bf16(
// CHECK-SAME: <8 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A]] to <8 x i16>
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x bfloat> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[DOTCAST]]
//
uint16x8_t test_vreinterpretq_u16_bf16(bfloat16x8_t a) { return vreinterpretq_u16_bf16(a); }
// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_u32_bf16(
// CHECK-SAME: <8 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A]] to <4 x i32>
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x bfloat> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[DOTCAST]]
//
uint32x4_t test_vreinterpretq_u32_bf16(bfloat16x8_t a) { return vreinterpretq_u32_bf16(a); }
// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_p8_bf16(
// CHECK-SAME: <8 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A]] to <16 x i8>
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x bfloat> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[DOTCAST]]
//
poly8x16_t test_vreinterpretq_p8_bf16(bfloat16x8_t a) { return vreinterpretq_p8_bf16(a); }
// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_p16_bf16(
// CHECK-SAME: <8 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A]] to <8 x i16>
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x bfloat> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[DOTCAST]]
//
poly16x8_t test_vreinterpretq_p16_bf16(bfloat16x8_t a) { return vreinterpretq_p16_bf16(a); }
// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_u64_bf16(
// CHECK-SAME: <8 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A]] to <2 x i64>
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x bfloat> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[DOTCAST]]
//
uint64x2_t test_vreinterpretq_u64_bf16(bfloat16x8_t a) { return vreinterpretq_u64_bf16(a); }
// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_s64_bf16(
// CHECK-SAME: <8 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A]] to <2 x i64>
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x bfloat> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[DOTCAST]]
//
int64x2_t test_vreinterpretq_s64_bf16(bfloat16x8_t a) { return vreinterpretq_s64_bf16(a); }
// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_p64_bf16(
// CHECK-SAME: <8 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A]] to <2 x i64>
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x bfloat> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[DOTCAST]]
//
poly64x2_t test_vreinterpretq_p64_bf16(bfloat16x8_t a) { return vreinterpretq_p64_bf16(a); }
// CHECK-LABEL: define dso_local i128 @test_vreinterpretq_p128_bf16(
// CHECK-SAME: <8 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A]] to i128
-// CHECK-NEXT: ret i128 [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x bfloat> [[A]] to i128
+// CHECK-NEXT: ret i128 [[DOTCAST]]
//
poly128_t test_vreinterpretq_p128_bf16(bfloat16x8_t a) { return vreinterpretq_p128_bf16(a); }
// CHECK-LABEL: define dso_local <1 x double> @test_vreinterpret_f64_bf16(
// CHECK-SAME: <4 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A]] to <1 x double>
-// CHECK-NEXT: ret <1 x double> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x bfloat> [[A]] to <1 x double>
+// CHECK-NEXT: ret <1 x double> [[DOTCAST]]
//
float64x1_t test_vreinterpret_f64_bf16(bfloat16x4_t a) { return vreinterpret_f64_bf16(a); }
// CHECK-LABEL: define dso_local <2 x double> @test_vreinterpretq_f64_bf16(
// CHECK-SAME: <8 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A]] to <2 x double>
-// CHECK-NEXT: ret <2 x double> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x bfloat> [[A]] to <2 x double>
+// CHECK-NEXT: ret <2 x double> [[DOTCAST]]
//
float64x2_t test_vreinterpretq_f64_bf16(bfloat16x8_t a) { return vreinterpretq_f64_bf16(a); }
diff --git a/clang/test/CodeGen/AArch64/neon-intrinsics.c b/clang/test/CodeGen/AArch64/neon-intrinsics.c
index 3cfa0ac3aa6b1c..b4a4048595696d 100644
--- a/clang/test/CodeGen/AArch64/neon-intrinsics.c
+++ b/clang/test/CodeGen/AArch64/neon-intrinsics.c
@@ -1374,11 +1374,11 @@ uint64x1_t test_vbsl_u64(uint64x1_t v1, uint64x1_t v2, uint64x1_t v3) {
// CHECK-LABEL: define dso_local <2 x float> @test_vbsl_f32(
// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]], <2 x float> noundef [[V3:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VBSL1_I:%.*]] = bitcast <2 x float> [[V2]] to <2 x i32>
-// CHECK-NEXT: [[VBSL2_I:%.*]] = bitcast <2 x float> [[V3]] to <2 x i32>
-// CHECK-NEXT: [[VBSL3_I:%.*]] = and <2 x i32> [[V1]], [[VBSL1_I]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x float> [[V2]] to <2 x i32>
+// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <2 x float> [[V3]] to <2 x i32>
+// CHECK-NEXT: [[VBSL3_I:%.*]] = and <2 x i32> [[V1]], [[DOTCAST]]
// CHECK-NEXT: [[TMP0:%.*]] = xor <2 x i32> [[V1]], splat (i32 -1)
-// CHECK-NEXT: [[VBSL4_I:%.*]] = and <2 x i32> [[TMP0]], [[VBSL2_I]]
+// CHECK-NEXT: [[VBSL4_I:%.*]] = and <2 x i32> [[TMP0]], [[DOTCAST1]]
// CHECK-NEXT: [[VBSL5_I:%.*]] = or disjoint <2 x i32> [[VBSL3_I]], [[VBSL4_I]]
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VBSL5_I]] to <2 x float>
// CHECK-NEXT: ret <2 x float> [[TMP1]]
@@ -1390,11 +1390,11 @@ float32x2_t test_vbsl_f32(uint32x2_t v1, float32x2_t v2, float32x2_t v3) {
// CHECK-LABEL: define dso_local <1 x double> @test_vbsl_f64(
// CHECK-SAME: <1 x i64> noundef [[V1:%.*]], <1 x double> noundef [[V2:%.*]], <1 x double> noundef [[V3:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VBSL1_I:%.*]] = bitcast <1 x double> [[V2]] to <1 x i64>
-// CHECK-NEXT: [[VBSL2_I:%.*]] = bitcast <1 x double> [[V3]] to <1 x i64>
-// CHECK-NEXT: [[VBSL3_I:%.*]] = and <1 x i64> [[V1]], [[VBSL1_I]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <1 x double> [[V2]] to <1 x i64>
+// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <1 x double> [[V3]] to <1 x i64>
+// CHECK-NEXT: [[VBSL3_I:%.*]] = and <1 x i64> [[V1]], [[DOTCAST]]
// CHECK-NEXT: [[TMP0:%.*]] = xor <1 x i64> [[V1]], splat (i64 -1)
-// CHECK-NEXT: [[VBSL4_I:%.*]] = and <1 x i64> [[TMP0]], [[VBSL2_I]]
+// CHECK-NEXT: [[VBSL4_I:%.*]] = and <1 x i64> [[TMP0]], [[DOTCAST1]]
// CHECK-NEXT: [[VBSL5_I:%.*]] = or disjoint <1 x i64> [[VBSL3_I]], [[VBSL4_I]]
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[VBSL5_I]] to <1 x double>
// CHECK-NEXT: ret <1 x double> [[TMP1]]
@@ -1536,11 +1536,11 @@ uint64x2_t test_vbslq_u64(uint64x2_t v1, uint64x2_t v2, uint64x2_t v3) {
// CHECK-LABEL: define dso_local <4 x float> @test_vbslq_f32(
// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]], <4 x float> noundef [[V3:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VBSL1_I:%.*]] = bitcast <4 x float> [[V2]] to <4 x i32>
-// CHECK-NEXT: [[VBSL2_I:%.*]] = bitcast <4 x float> [[V3]] to <4 x i32>
-// CHECK-NEXT: [[VBSL3_I:%.*]] = and <4 x i32> [[V1]], [[VBSL1_I]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x float> [[V2]] to <4 x i32>
+// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <4 x float> [[V3]] to <4 x i32>
+// CHECK-NEXT: [[VBSL3_I:%.*]] = and <4 x i32> [[V1]], [[DOTCAST]]
// CHECK-NEXT: [[TMP0:%.*]] = xor <4 x i32> [[V1]], splat (i32 -1)
-// CHECK-NEXT: [[VBSL4_I:%.*]] = and <4 x i32> [[TMP0]], [[VBSL2_I]]
+// CHECK-NEXT: [[VBSL4_I:%.*]] = and <4 x i32> [[TMP0]], [[DOTCAST1]]
// CHECK-NEXT: [[VBSL5_I:%.*]] = or disjoint <4 x i32> [[VBSL3_I]], [[VBSL4_I]]
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[VBSL5_I]] to <4 x float>
// CHECK-NEXT: ret <4 x float> [[TMP1]]
@@ -1578,11 +1578,11 @@ poly16x8_t test_vbslq_p16(uint16x8_t v1, poly16x8_t v2, poly16x8_t v3) {
// CHECK-LABEL: define dso_local <2 x double> @test_vbslq_f64(
// CHECK-SAME: <2 x i64> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]], <2 x double> noundef [[V3:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VBSL1_I:%.*]] = bitcast <2 x double> [[V2]] to <2 x i64>
-// CHECK-NEXT: [[VBSL2_I:%.*]] = bitcast <2 x double> [[V3]] to <2 x i64>
-// CHECK-NEXT: [[VBSL3_I:%.*]] = and <2 x i64> [[V1]], [[VBSL1_I]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x double> [[V2]] to <2 x i64>
+// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <2 x double> [[V3]] to <2 x i64>
+// CHECK-NEXT: [[VBSL3_I:%.*]] = and <2 x i64> [[V1]], [[DOTCAST]]
// CHECK-NEXT: [[TMP0:%.*]] = xor <2 x i64> [[V1]], splat (i64 -1)
-// CHECK-NEXT: [[VBSL4_I:%.*]] = and <2 x i64> [[TMP0]], [[VBSL2_I]]
+// CHECK-NEXT: [[VBSL4_I:%.*]] = and <2 x i64> [[TMP0]], [[DOTCAST1]]
// CHECK-NEXT: [[VBSL5_I:%.*]] = or disjoint <2 x i64> [[VBSL3_I]], [[VBSL4_I]]
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[VBSL5_I]] to <2 x double>
// CHECK-NEXT: ret <2 x double> [[TMP1]]
@@ -17980,8 +17980,8 @@ uint64_t test_vcvtd_n_u64_f64(float64_t a) {
// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_s8_s16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[DOTCAST]]
//
int8x8_t test_vreinterpret_s8_s16(int16x4_t a) {
return vreinterpret_s8_s16(a);
@@ -17990,8 +17990,8 @@ int8x8_t test_vreinterpret_s8_s16(int16x4_t a) {
// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_s8_s32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[DOTCAST]]
//
int8x8_t test_vreinterpret_s8_s32(int32x2_t a) {
return vreinterpret_s8_s32(a);
@@ -18000,8 +18000,8 @@ int8x8_t test_vreinterpret_s8_s32(int32x2_t a) {
// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_s8_s64(
// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[DOTCAST]]
//
int8x8_t test_vreinterpret_s8_s64(int64x1_t a) {
return vreinterpret_s8_s64(a);
@@ -18019,8 +18019,8 @@ int8x8_t test_vreinterpret_s8_u8(uint8x8_t a) {
// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_s8_u16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[DOTCAST]]
//
int8x8_t test_vreinterpret_s8_u16(uint16x4_t a) {
return vreinterpret_s8_u16(a);
@@ -18029,8 +18029,8 @@ int8x8_t test_vreinterpret_s8_u16(uint16x4_t a) {
// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_s8_u32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[DOTCAST]]
//
int8x8_t test_vreinterpret_s8_u32(uint32x2_t a) {
return vreinterpret_s8_u32(a);
@@ -18039,8 +18039,8 @@ int8x8_t test_vreinterpret_s8_u32(uint32x2_t a) {
// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_s8_u64(
// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[DOTCAST]]
//
int8x8_t test_vreinterpret_s8_u64(uint64x1_t a) {
return vreinterpret_s8_u64(a);
@@ -18049,8 +18049,8 @@ int8x8_t test_vreinterpret_s8_u64(uint64x1_t a) {
// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_s8_f16(
// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[DOTCAST]]
//
int8x8_t test_vreinterpret_s8_f16(float16x4_t a) {
return vreinterpret_s8_f16(a);
@@ -18059,8 +18059,8 @@ int8x8_t test_vreinterpret_s8_f16(float16x4_t a) {
// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_s8_f32(
// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
-// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[DOTCAST]]
//
int8x8_t test_vreinterpret_s8_f32(float32x2_t a) {
return vreinterpret_s8_f32(a);
@@ -18069,8 +18069,8 @@ int8x8_t test_vreinterpret_s8_f32(float32x2_t a) {
// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_s8_f64(
// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to <8 x i8>
-// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <1 x double> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[DOTCAST]]
//
int8x8_t test_vreinterpret_s8_f64(float64x1_t a) {
return vreinterpret_s8_f64(a);
@@ -18088,8 +18088,8 @@ int8x8_t test_vreinterpret_s8_p8(poly8x8_t a) {
// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_s8_p16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[DOTCAST]]
//
int8x8_t test_vreinterpret_s8_p16(poly16x4_t a) {
return vreinterpret_s8_p16(a);
@@ -18098,8 +18098,8 @@ int8x8_t test_vreinterpret_s8_p16(poly16x4_t a) {
// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_s8_p64(
// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[DOTCAST]]
//
int8x8_t test_vreinterpret_s8_p64(poly64x1_t a) {
return vreinterpret_s8_p64(a);
@@ -18108,8 +18108,8 @@ int8x8_t test_vreinterpret_s8_p64(poly64x1_t a) {
// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_s16_s8(
// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x i16>
-// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x i8> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[DOTCAST]]
//
int16x4_t test_vreinterpret_s16_s8(int8x8_t a) {
return vreinterpret_s16_s8(a);
@@ -18118,8 +18118,8 @@ int16x4_t test_vreinterpret_s16_s8(int8x8_t a) {
// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_s16_s32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <4 x i16>
-// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x i32> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[DOTCAST]]
//
int16x4_t test_vreinterpret_s16_s32(int32x2_t a) {
return vreinterpret_s16_s32(a);
@@ -18128,8 +18128,8 @@ int16x4_t test_vreinterpret_s16_s32(int32x2_t a) {
// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_s16_s64(
// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[DOTCAST]]
//
int16x4_t test_vreinterpret_s16_s64(int64x1_t a) {
return vreinterpret_s16_s64(a);
@@ -18138,8 +18138,8 @@ int16x4_t test_vreinterpret_s16_s64(int64x1_t a) {
// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_s16_u8(
// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x i16>
-// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x i8> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[DOTCAST]]
//
int16x4_t test_vreinterpret_s16_u8(uint8x8_t a) {
return vreinterpret_s16_u8(a);
@@ -18157,8 +18157,8 @@ int16x4_t test_vreinterpret_s16_u16(uint16x4_t a) {
// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_s16_u32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <4 x i16>
-// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x i32> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[DOTCAST]]
//
int16x4_t test_vreinterpret_s16_u32(uint32x2_t a) {
return vreinterpret_s16_u32(a);
@@ -18167,8 +18167,8 @@ int16x4_t test_vreinterpret_s16_u32(uint32x2_t a) {
// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_s16_u64(
// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[DOTCAST]]
//
int16x4_t test_vreinterpret_s16_u64(uint64x1_t a) {
return vreinterpret_s16_u64(a);
@@ -18177,8 +18177,8 @@ int16x4_t test_vreinterpret_s16_u64(uint64x1_t a) {
// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_s16_f16(
// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16>
-// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x half> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[DOTCAST]]
//
int16x4_t test_vreinterpret_s16_f16(float16x4_t a) {
return vreinterpret_s16_f16(a);
@@ -18187,8 +18187,8 @@ int16x4_t test_vreinterpret_s16_f16(float16x4_t a) {
// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_s16_f32(
// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <4 x i16>
-// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x float> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[DOTCAST]]
//
int16x4_t test_vreinterpret_s16_f32(float32x2_t a) {
return vreinterpret_s16_f32(a);
@@ -18197,8 +18197,8 @@ int16x4_t test_vreinterpret_s16_f32(float32x2_t a) {
// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_s16_f64(
// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to <4 x i16>
-// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <1 x double> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[DOTCAST]]
//
int16x4_t test_vreinterpret_s16_f64(float64x1_t a) {
return vreinterpret_s16_f64(a);
@@ -18207,8 +18207,8 @@ int16x4_t test_vreinterpret_s16_f64(float64x1_t a) {
// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_s16_p8(
// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x i16>
-// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x i8> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[DOTCAST]]
//
int16x4_t test_vreinterpret_s16_p8(poly8x8_t a) {
return vreinterpret_s16_p8(a);
@@ -18226,8 +18226,8 @@ int16x4_t test_vreinterpret_s16_p16(poly16x4_t a) {
// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_s16_p64(
// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[DOTCAST]]
//
int16x4_t test_vreinterpret_s16_p64(poly64x1_t a) {
return vreinterpret_s16_p64(a);
@@ -18236,8 +18236,8 @@ int16x4_t test_vreinterpret_s16_p64(poly64x1_t a) {
// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_s32_s8(
// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <2 x i32>
-// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x i8> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[DOTCAST]]
//
int32x2_t test_vreinterpret_s32_s8(int8x8_t a) {
return vreinterpret_s32_s8(a);
@@ -18246,8 +18246,8 @@ int32x2_t test_vreinterpret_s32_s8(int8x8_t a) {
// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_s32_s16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <2 x i32>
-// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x i16> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[DOTCAST]]
//
int32x2_t test_vreinterpret_s32_s16(int16x4_t a) {
return vreinterpret_s32_s16(a);
@@ -18256,8 +18256,8 @@ int32x2_t test_vreinterpret_s32_s16(int16x4_t a) {
// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_s32_s64(
// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
-// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[DOTCAST]]
//
int32x2_t test_vreinterpret_s32_s64(int64x1_t a) {
return vreinterpret_s32_s64(a);
@@ -18266,8 +18266,8 @@ int32x2_t test_vreinterpret_s32_s64(int64x1_t a) {
// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_s32_u8(
// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <2 x i32>
-// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x i8> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[DOTCAST]]
//
int32x2_t test_vreinterpret_s32_u8(uint8x8_t a) {
return vreinterpret_s32_u8(a);
@@ -18276,8 +18276,8 @@ int32x2_t test_vreinterpret_s32_u8(uint8x8_t a) {
// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_s32_u16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <2 x i32>
-// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x i16> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[DOTCAST]]
//
int32x2_t test_vreinterpret_s32_u16(uint16x4_t a) {
return vreinterpret_s32_u16(a);
@@ -18295,8 +18295,8 @@ int32x2_t test_vreinterpret_s32_u32(uint32x2_t a) {
// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_s32_u64(
// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
-// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[DOTCAST]]
//
int32x2_t test_vreinterpret_s32_u64(uint64x1_t a) {
return vreinterpret_s32_u64(a);
@@ -18305,8 +18305,8 @@ int32x2_t test_vreinterpret_s32_u64(uint64x1_t a) {
// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_s32_f16(
// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <2 x i32>
-// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x half> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[DOTCAST]]
//
int32x2_t test_vreinterpret_s32_f16(float16x4_t a) {
return vreinterpret_s32_f16(a);
@@ -18315,8 +18315,8 @@ int32x2_t test_vreinterpret_s32_f16(float16x4_t a) {
// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_s32_f32(
// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32>
-// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x float> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[DOTCAST]]
//
int32x2_t test_vreinterpret_s32_f32(float32x2_t a) {
return vreinterpret_s32_f32(a);
@@ -18325,8 +18325,8 @@ int32x2_t test_vreinterpret_s32_f32(float32x2_t a) {
// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_s32_f64(
// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to <2 x i32>
-// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <1 x double> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[DOTCAST]]
//
int32x2_t test_vreinterpret_s32_f64(float64x1_t a) {
return vreinterpret_s32_f64(a);
@@ -18335,8 +18335,8 @@ int32x2_t test_vreinterpret_s32_f64(float64x1_t a) {
// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_s32_p8(
// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <2 x i32>
-// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x i8> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[DOTCAST]]
//
int32x2_t test_vreinterpret_s32_p8(poly8x8_t a) {
return vreinterpret_s32_p8(a);
@@ -18345,8 +18345,8 @@ int32x2_t test_vreinterpret_s32_p8(poly8x8_t a) {
// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_s32_p16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <2 x i32>
-// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x i16> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[DOTCAST]]
//
int32x2_t test_vreinterpret_s32_p16(poly16x4_t a) {
return vreinterpret_s32_p16(a);
@@ -18355,8 +18355,8 @@ int32x2_t test_vreinterpret_s32_p16(poly16x4_t a) {
// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_s32_p64(
// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
-// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[DOTCAST]]
//
int32x2_t test_vreinterpret_s32_p64(poly64x1_t a) {
return vreinterpret_s32_p64(a);
@@ -18365,8 +18365,8 @@ int32x2_t test_vreinterpret_s32_p64(poly64x1_t a) {
// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_s64_s8(
// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <1 x i64>
-// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x i8> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[DOTCAST]]
//
int64x1_t test_vreinterpret_s64_s8(int8x8_t a) {
return vreinterpret_s64_s8(a);
@@ -18375,8 +18375,8 @@ int64x1_t test_vreinterpret_s64_s8(int8x8_t a) {
// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_s64_s16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <1 x i64>
-// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x i16> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[DOTCAST]]
//
int64x1_t test_vreinterpret_s64_s16(int16x4_t a) {
return vreinterpret_s64_s16(a);
@@ -18385,8 +18385,8 @@ int64x1_t test_vreinterpret_s64_s16(int16x4_t a) {
// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_s64_s32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <1 x i64>
-// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x i32> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[DOTCAST]]
//
int64x1_t test_vreinterpret_s64_s32(int32x2_t a) {
return vreinterpret_s64_s32(a);
@@ -18395,8 +18395,8 @@ int64x1_t test_vreinterpret_s64_s32(int32x2_t a) {
// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_s64_u8(
// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <1 x i64>
-// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x i8> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[DOTCAST]]
//
int64x1_t test_vreinterpret_s64_u8(uint8x8_t a) {
return vreinterpret_s64_u8(a);
@@ -18405,8 +18405,8 @@ int64x1_t test_vreinterpret_s64_u8(uint8x8_t a) {
// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_s64_u16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <1 x i64>
-// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x i16> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[DOTCAST]]
//
int64x1_t test_vreinterpret_s64_u16(uint16x4_t a) {
return vreinterpret_s64_u16(a);
@@ -18415,8 +18415,8 @@ int64x1_t test_vreinterpret_s64_u16(uint16x4_t a) {
// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_s64_u32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <1 x i64>
-// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x i32> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[DOTCAST]]
//
int64x1_t test_vreinterpret_s64_u32(uint32x2_t a) {
return vreinterpret_s64_u32(a);
@@ -18434,8 +18434,8 @@ int64x1_t test_vreinterpret_s64_u64(uint64x1_t a) {
// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_s64_f16(
// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <1 x i64>
-// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x half> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[DOTCAST]]
//
int64x1_t test_vreinterpret_s64_f16(float16x4_t a) {
return vreinterpret_s64_f16(a);
@@ -18444,8 +18444,8 @@ int64x1_t test_vreinterpret_s64_f16(float16x4_t a) {
// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_s64_f32(
// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <1 x i64>
-// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x float> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[DOTCAST]]
//
int64x1_t test_vreinterpret_s64_f32(float32x2_t a) {
return vreinterpret_s64_f32(a);
@@ -18454,8 +18454,8 @@ int64x1_t test_vreinterpret_s64_f32(float32x2_t a) {
// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_s64_f64(
// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to <1 x i64>
-// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <1 x double> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[DOTCAST]]
//
int64x1_t test_vreinterpret_s64_f64(float64x1_t a) {
return vreinterpret_s64_f64(a);
@@ -18464,8 +18464,8 @@ int64x1_t test_vreinterpret_s64_f64(float64x1_t a) {
// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_s64_p8(
// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <1 x i64>
-// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x i8> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[DOTCAST]]
//
int64x1_t test_vreinterpret_s64_p8(poly8x8_t a) {
return vreinterpret_s64_p8(a);
@@ -18474,8 +18474,8 @@ int64x1_t test_vreinterpret_s64_p8(poly8x8_t a) {
// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_s64_p16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <1 x i64>
-// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x i16> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[DOTCAST]]
//
int64x1_t test_vreinterpret_s64_p16(poly16x4_t a) {
return vreinterpret_s64_p16(a);
@@ -18502,8 +18502,8 @@ uint8x8_t test_vreinterpret_u8_s8(int8x8_t a) {
// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_u8_s16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[DOTCAST]]
//
uint8x8_t test_vreinterpret_u8_s16(int16x4_t a) {
return vreinterpret_u8_s16(a);
@@ -18512,8 +18512,8 @@ uint8x8_t test_vreinterpret_u8_s16(int16x4_t a) {
// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_u8_s32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[DOTCAST]]
//
uint8x8_t test_vreinterpret_u8_s32(int32x2_t a) {
return vreinterpret_u8_s32(a);
@@ -18522,8 +18522,8 @@ uint8x8_t test_vreinterpret_u8_s32(int32x2_t a) {
// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_u8_s64(
// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[DOTCAST]]
//
uint8x8_t test_vreinterpret_u8_s64(int64x1_t a) {
return vreinterpret_u8_s64(a);
@@ -18532,8 +18532,8 @@ uint8x8_t test_vreinterpret_u8_s64(int64x1_t a) {
// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_u8_u16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[DOTCAST]]
//
uint8x8_t test_vreinterpret_u8_u16(uint16x4_t a) {
return vreinterpret_u8_u16(a);
@@ -18542,8 +18542,8 @@ uint8x8_t test_vreinterpret_u8_u16(uint16x4_t a) {
// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_u8_u32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[DOTCAST]]
//
uint8x8_t test_vreinterpret_u8_u32(uint32x2_t a) {
return vreinterpret_u8_u32(a);
@@ -18552,8 +18552,8 @@ uint8x8_t test_vreinterpret_u8_u32(uint32x2_t a) {
// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_u8_u64(
// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[DOTCAST]]
//
uint8x8_t test_vreinterpret_u8_u64(uint64x1_t a) {
return vreinterpret_u8_u64(a);
@@ -18562,8 +18562,8 @@ uint8x8_t test_vreinterpret_u8_u64(uint64x1_t a) {
// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_u8_f16(
// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[DOTCAST]]
//
uint8x8_t test_vreinterpret_u8_f16(float16x4_t a) {
return vreinterpret_u8_f16(a);
@@ -18572,8 +18572,8 @@ uint8x8_t test_vreinterpret_u8_f16(float16x4_t a) {
// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_u8_f32(
// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
-// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[DOTCAST]]
//
uint8x8_t test_vreinterpret_u8_f32(float32x2_t a) {
return vreinterpret_u8_f32(a);
@@ -18582,8 +18582,8 @@ uint8x8_t test_vreinterpret_u8_f32(float32x2_t a) {
// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_u8_f64(
// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to <8 x i8>
-// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <1 x double> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[DOTCAST]]
//
uint8x8_t test_vreinterpret_u8_f64(float64x1_t a) {
return vreinterpret_u8_f64(a);
@@ -18601,8 +18601,8 @@ uint8x8_t test_vreinterpret_u8_p8(poly8x8_t a) {
// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_u8_p16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[DOTCAST]]
//
uint8x8_t test_vreinterpret_u8_p16(poly16x4_t a) {
return vreinterpret_u8_p16(a);
@@ -18611,8 +18611,8 @@ uint8x8_t test_vreinterpret_u8_p16(poly16x4_t a) {
// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_u8_p64(
// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[DOTCAST]]
//
uint8x8_t test_vreinterpret_u8_p64(poly64x1_t a) {
return vreinterpret_u8_p64(a);
@@ -18621,8 +18621,8 @@ uint8x8_t test_vreinterpret_u8_p64(poly64x1_t a) {
// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_u16_s8(
// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x i16>
-// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x i8> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[DOTCAST]]
//
uint16x4_t test_vreinterpret_u16_s8(int8x8_t a) {
return vreinterpret_u16_s8(a);
@@ -18640,8 +18640,8 @@ uint16x4_t test_vreinterpret_u16_s16(int16x4_t a) {
// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_u16_s32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <4 x i16>
-// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x i32> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[DOTCAST]]
//
uint16x4_t test_vreinterpret_u16_s32(int32x2_t a) {
return vreinterpret_u16_s32(a);
@@ -18650,8 +18650,8 @@ uint16x4_t test_vreinterpret_u16_s32(int32x2_t a) {
// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_u16_s64(
// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[DOTCAST]]
//
uint16x4_t test_vreinterpret_u16_s64(int64x1_t a) {
return vreinterpret_u16_s64(a);
@@ -18660,8 +18660,8 @@ uint16x4_t test_vreinterpret_u16_s64(int64x1_t a) {
// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_u16_u8(
// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x i16>
-// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x i8> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[DOTCAST]]
//
uint16x4_t test_vreinterpret_u16_u8(uint8x8_t a) {
return vreinterpret_u16_u8(a);
@@ -18670,8 +18670,8 @@ uint16x4_t test_vreinterpret_u16_u8(uint8x8_t a) {
// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_u16_u32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <4 x i16>
-// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x i32> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[DOTCAST]]
//
uint16x4_t test_vreinterpret_u16_u32(uint32x2_t a) {
return vreinterpret_u16_u32(a);
@@ -18680,8 +18680,8 @@ uint16x4_t test_vreinterpret_u16_u32(uint32x2_t a) {
// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_u16_u64(
// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[DOTCAST]]
//
uint16x4_t test_vreinterpret_u16_u64(uint64x1_t a) {
return vreinterpret_u16_u64(a);
@@ -18690,8 +18690,8 @@ uint16x4_t test_vreinterpret_u16_u64(uint64x1_t a) {
// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_u16_f16(
// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16>
-// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x half> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[DOTCAST]]
//
uint16x4_t test_vreinterpret_u16_f16(float16x4_t a) {
return vreinterpret_u16_f16(a);
@@ -18700,8 +18700,8 @@ uint16x4_t test_vreinterpret_u16_f16(float16x4_t a) {
// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_u16_f32(
// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <4 x i16>
-// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x float> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[DOTCAST]]
//
uint16x4_t test_vreinterpret_u16_f32(float32x2_t a) {
return vreinterpret_u16_f32(a);
@@ -18710,8 +18710,8 @@ uint16x4_t test_vreinterpret_u16_f32(float32x2_t a) {
// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_u16_f64(
// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to <4 x i16>
-// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <1 x double> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[DOTCAST]]
//
uint16x4_t test_vreinterpret_u16_f64(float64x1_t a) {
return vreinterpret_u16_f64(a);
@@ -18720,8 +18720,8 @@ uint16x4_t test_vreinterpret_u16_f64(float64x1_t a) {
// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_u16_p8(
// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x i16>
-// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x i8> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[DOTCAST]]
//
uint16x4_t test_vreinterpret_u16_p8(poly8x8_t a) {
return vreinterpret_u16_p8(a);
@@ -18739,8 +18739,8 @@ uint16x4_t test_vreinterpret_u16_p16(poly16x4_t a) {
// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_u16_p64(
// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[DOTCAST]]
//
uint16x4_t test_vreinterpret_u16_p64(poly64x1_t a) {
return vreinterpret_u16_p64(a);
@@ -18749,8 +18749,8 @@ uint16x4_t test_vreinterpret_u16_p64(poly64x1_t a) {
// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_u32_s8(
// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <2 x i32>
-// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x i8> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[DOTCAST]]
//
uint32x2_t test_vreinterpret_u32_s8(int8x8_t a) {
return vreinterpret_u32_s8(a);
@@ -18759,8 +18759,8 @@ uint32x2_t test_vreinterpret_u32_s8(int8x8_t a) {
// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_u32_s16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <2 x i32>
-// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x i16> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[DOTCAST]]
//
uint32x2_t test_vreinterpret_u32_s16(int16x4_t a) {
return vreinterpret_u32_s16(a);
@@ -18778,8 +18778,8 @@ uint32x2_t test_vreinterpret_u32_s32(int32x2_t a) {
// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_u32_s64(
// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
-// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[DOTCAST]]
//
uint32x2_t test_vreinterpret_u32_s64(int64x1_t a) {
return vreinterpret_u32_s64(a);
@@ -18788,8 +18788,8 @@ uint32x2_t test_vreinterpret_u32_s64(int64x1_t a) {
// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_u32_u8(
// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <2 x i32>
-// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x i8> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[DOTCAST]]
//
uint32x2_t test_vreinterpret_u32_u8(uint8x8_t a) {
return vreinterpret_u32_u8(a);
@@ -18798,8 +18798,8 @@ uint32x2_t test_vreinterpret_u32_u8(uint8x8_t a) {
// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_u32_u16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <2 x i32>
-// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x i16> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[DOTCAST]]
//
uint32x2_t test_vreinterpret_u32_u16(uint16x4_t a) {
return vreinterpret_u32_u16(a);
@@ -18808,8 +18808,8 @@ uint32x2_t test_vreinterpret_u32_u16(uint16x4_t a) {
// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_u32_u64(
// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
-// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[DOTCAST]]
//
uint32x2_t test_vreinterpret_u32_u64(uint64x1_t a) {
return vreinterpret_u32_u64(a);
@@ -18818,8 +18818,8 @@ uint32x2_t test_vreinterpret_u32_u64(uint64x1_t a) {
// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_u32_f16(
// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <2 x i32>
-// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x half> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[DOTCAST]]
//
uint32x2_t test_vreinterpret_u32_f16(float16x4_t a) {
return vreinterpret_u32_f16(a);
@@ -18828,8 +18828,8 @@ uint32x2_t test_vreinterpret_u32_f16(float16x4_t a) {
// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_u32_f32(
// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32>
-// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x float> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[DOTCAST]]
//
uint32x2_t test_vreinterpret_u32_f32(float32x2_t a) {
return vreinterpret_u32_f32(a);
@@ -18838,8 +18838,8 @@ uint32x2_t test_vreinterpret_u32_f32(float32x2_t a) {
// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_u32_f64(
// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to <2 x i32>
-// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <1 x double> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[DOTCAST]]
//
uint32x2_t test_vreinterpret_u32_f64(float64x1_t a) {
return vreinterpret_u32_f64(a);
@@ -18848,8 +18848,8 @@ uint32x2_t test_vreinterpret_u32_f64(float64x1_t a) {
// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_u32_p8(
// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <2 x i32>
-// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x i8> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[DOTCAST]]
//
uint32x2_t test_vreinterpret_u32_p8(poly8x8_t a) {
return vreinterpret_u32_p8(a);
@@ -18858,8 +18858,8 @@ uint32x2_t test_vreinterpret_u32_p8(poly8x8_t a) {
// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_u32_p16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <2 x i32>
-// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x i16> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[DOTCAST]]
//
uint32x2_t test_vreinterpret_u32_p16(poly16x4_t a) {
return vreinterpret_u32_p16(a);
@@ -18868,8 +18868,8 @@ uint32x2_t test_vreinterpret_u32_p16(poly16x4_t a) {
// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_u32_p64(
// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
-// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[DOTCAST]]
//
uint32x2_t test_vreinterpret_u32_p64(poly64x1_t a) {
return vreinterpret_u32_p64(a);
@@ -18878,8 +18878,8 @@ uint32x2_t test_vreinterpret_u32_p64(poly64x1_t a) {
// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_u64_s8(
// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <1 x i64>
-// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x i8> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[DOTCAST]]
//
uint64x1_t test_vreinterpret_u64_s8(int8x8_t a) {
return vreinterpret_u64_s8(a);
@@ -18888,8 +18888,8 @@ uint64x1_t test_vreinterpret_u64_s8(int8x8_t a) {
// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_u64_s16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <1 x i64>
-// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x i16> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[DOTCAST]]
//
uint64x1_t test_vreinterpret_u64_s16(int16x4_t a) {
return vreinterpret_u64_s16(a);
@@ -18898,8 +18898,8 @@ uint64x1_t test_vreinterpret_u64_s16(int16x4_t a) {
// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_u64_s32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <1 x i64>
-// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x i32> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[DOTCAST]]
//
uint64x1_t test_vreinterpret_u64_s32(int32x2_t a) {
return vreinterpret_u64_s32(a);
@@ -18917,8 +18917,8 @@ uint64x1_t test_vreinterpret_u64_s64(int64x1_t a) {
// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_u64_u8(
// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <1 x i64>
-// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x i8> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[DOTCAST]]
//
uint64x1_t test_vreinterpret_u64_u8(uint8x8_t a) {
return vreinterpret_u64_u8(a);
@@ -18927,8 +18927,8 @@ uint64x1_t test_vreinterpret_u64_u8(uint8x8_t a) {
// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_u64_u16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <1 x i64>
-// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x i16> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[DOTCAST]]
//
uint64x1_t test_vreinterpret_u64_u16(uint16x4_t a) {
return vreinterpret_u64_u16(a);
@@ -18937,8 +18937,8 @@ uint64x1_t test_vreinterpret_u64_u16(uint16x4_t a) {
// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_u64_u32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <1 x i64>
-// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x i32> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[DOTCAST]]
//
uint64x1_t test_vreinterpret_u64_u32(uint32x2_t a) {
return vreinterpret_u64_u32(a);
@@ -18947,8 +18947,8 @@ uint64x1_t test_vreinterpret_u64_u32(uint32x2_t a) {
// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_u64_f16(
// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <1 x i64>
-// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x half> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[DOTCAST]]
//
uint64x1_t test_vreinterpret_u64_f16(float16x4_t a) {
return vreinterpret_u64_f16(a);
@@ -18957,8 +18957,8 @@ uint64x1_t test_vreinterpret_u64_f16(float16x4_t a) {
// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_u64_f32(
// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <1 x i64>
-// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x float> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[DOTCAST]]
//
uint64x1_t test_vreinterpret_u64_f32(float32x2_t a) {
return vreinterpret_u64_f32(a);
@@ -18967,8 +18967,8 @@ uint64x1_t test_vreinterpret_u64_f32(float32x2_t a) {
// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_u64_f64(
// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to <1 x i64>
-// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <1 x double> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[DOTCAST]]
//
uint64x1_t test_vreinterpret_u64_f64(float64x1_t a) {
return vreinterpret_u64_f64(a);
@@ -18977,8 +18977,8 @@ uint64x1_t test_vreinterpret_u64_f64(float64x1_t a) {
// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_u64_p8(
// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <1 x i64>
-// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x i8> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[DOTCAST]]
//
uint64x1_t test_vreinterpret_u64_p8(poly8x8_t a) {
return vreinterpret_u64_p8(a);
@@ -18987,8 +18987,8 @@ uint64x1_t test_vreinterpret_u64_p8(poly8x8_t a) {
// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_u64_p16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <1 x i64>
-// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x i16> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[DOTCAST]]
//
uint64x1_t test_vreinterpret_u64_p16(poly16x4_t a) {
return vreinterpret_u64_p16(a);
@@ -19006,8 +19006,8 @@ uint64x1_t test_vreinterpret_u64_p64(poly64x1_t a) {
// CHECK-LABEL: define dso_local <4 x half> @test_vreinterpret_f16_s8(
// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x half>
-// CHECK-NEXT: ret <4 x half> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x i8> [[A]] to <4 x half>
+// CHECK-NEXT: ret <4 x half> [[DOTCAST]]
//
float16x4_t test_vreinterpret_f16_s8(int8x8_t a) {
return vreinterpret_f16_s8(a);
@@ -19016,8 +19016,8 @@ float16x4_t test_vreinterpret_f16_s8(int8x8_t a) {
// CHECK-LABEL: define dso_local <4 x half> @test_vreinterpret_f16_s16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <4 x half>
-// CHECK-NEXT: ret <4 x half> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x i16> [[A]] to <4 x half>
+// CHECK-NEXT: ret <4 x half> [[DOTCAST]]
//
float16x4_t test_vreinterpret_f16_s16(int16x4_t a) {
return vreinterpret_f16_s16(a);
@@ -19026,8 +19026,8 @@ float16x4_t test_vreinterpret_f16_s16(int16x4_t a) {
// CHECK-LABEL: define dso_local <4 x half> @test_vreinterpret_f16_s32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <4 x half>
-// CHECK-NEXT: ret <4 x half> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x i32> [[A]] to <4 x half>
+// CHECK-NEXT: ret <4 x half> [[DOTCAST]]
//
float16x4_t test_vreinterpret_f16_s32(int32x2_t a) {
return vreinterpret_f16_s32(a);
@@ -19036,8 +19036,8 @@ float16x4_t test_vreinterpret_f16_s32(int32x2_t a) {
// CHECK-LABEL: define dso_local <4 x half> @test_vreinterpret_f16_s64(
// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x half>
-// CHECK-NEXT: ret <4 x half> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <1 x i64> [[A]] to <4 x half>
+// CHECK-NEXT: ret <4 x half> [[DOTCAST]]
//
float16x4_t test_vreinterpret_f16_s64(int64x1_t a) {
return vreinterpret_f16_s64(a);
@@ -19046,8 +19046,8 @@ float16x4_t test_vreinterpret_f16_s64(int64x1_t a) {
// CHECK-LABEL: define dso_local <4 x half> @test_vreinterpret_f16_u8(
// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x half>
-// CHECK-NEXT: ret <4 x half> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x i8> [[A]] to <4 x half>
+// CHECK-NEXT: ret <4 x half> [[DOTCAST]]
//
float16x4_t test_vreinterpret_f16_u8(uint8x8_t a) {
return vreinterpret_f16_u8(a);
@@ -19056,8 +19056,8 @@ float16x4_t test_vreinterpret_f16_u8(uint8x8_t a) {
// CHECK-LABEL: define dso_local <4 x half> @test_vreinterpret_f16_u16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <4 x half>
-// CHECK-NEXT: ret <4 x half> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x i16> [[A]] to <4 x half>
+// CHECK-NEXT: ret <4 x half> [[DOTCAST]]
//
float16x4_t test_vreinterpret_f16_u16(uint16x4_t a) {
return vreinterpret_f16_u16(a);
@@ -19066,8 +19066,8 @@ float16x4_t test_vreinterpret_f16_u16(uint16x4_t a) {
// CHECK-LABEL: define dso_local <4 x half> @test_vreinterpret_f16_u32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <4 x half>
-// CHECK-NEXT: ret <4 x half> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x i32> [[A]] to <4 x half>
+// CHECK-NEXT: ret <4 x half> [[DOTCAST]]
//
float16x4_t test_vreinterpret_f16_u32(uint32x2_t a) {
return vreinterpret_f16_u32(a);
@@ -19076,8 +19076,8 @@ float16x4_t test_vreinterpret_f16_u32(uint32x2_t a) {
// CHECK-LABEL: define dso_local <4 x half> @test_vreinterpret_f16_u64(
// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x half>
-// CHECK-NEXT: ret <4 x half> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <1 x i64> [[A]] to <4 x half>
+// CHECK-NEXT: ret <4 x half> [[DOTCAST]]
//
float16x4_t test_vreinterpret_f16_u64(uint64x1_t a) {
return vreinterpret_f16_u64(a);
@@ -19086,8 +19086,8 @@ float16x4_t test_vreinterpret_f16_u64(uint64x1_t a) {
// CHECK-LABEL: define dso_local <4 x half> @test_vreinterpret_f16_f32(
// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <4 x half>
-// CHECK-NEXT: ret <4 x half> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x float> [[A]] to <4 x half>
+// CHECK-NEXT: ret <4 x half> [[DOTCAST]]
//
float16x4_t test_vreinterpret_f16_f32(float32x2_t a) {
return vreinterpret_f16_f32(a);
@@ -19096,8 +19096,8 @@ float16x4_t test_vreinterpret_f16_f32(float32x2_t a) {
// CHECK-LABEL: define dso_local <4 x half> @test_vreinterpret_f16_f64(
// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to <4 x half>
-// CHECK-NEXT: ret <4 x half> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <1 x double> [[A]] to <4 x half>
+// CHECK-NEXT: ret <4 x half> [[DOTCAST]]
//
float16x4_t test_vreinterpret_f16_f64(float64x1_t a) {
return vreinterpret_f16_f64(a);
@@ -19106,8 +19106,8 @@ float16x4_t test_vreinterpret_f16_f64(float64x1_t a) {
// CHECK-LABEL: define dso_local <4 x half> @test_vreinterpret_f16_p8(
// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x half>
-// CHECK-NEXT: ret <4 x half> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x i8> [[A]] to <4 x half>
+// CHECK-NEXT: ret <4 x half> [[DOTCAST]]
//
float16x4_t test_vreinterpret_f16_p8(poly8x8_t a) {
return vreinterpret_f16_p8(a);
@@ -19116,8 +19116,8 @@ float16x4_t test_vreinterpret_f16_p8(poly8x8_t a) {
// CHECK-LABEL: define dso_local <4 x half> @test_vreinterpret_f16_p16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <4 x half>
-// CHECK-NEXT: ret <4 x half> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x i16> [[A]] to <4 x half>
+// CHECK-NEXT: ret <4 x half> [[DOTCAST]]
//
float16x4_t test_vreinterpret_f16_p16(poly16x4_t a) {
return vreinterpret_f16_p16(a);
@@ -19126,8 +19126,8 @@ float16x4_t test_vreinterpret_f16_p16(poly16x4_t a) {
// CHECK-LABEL: define dso_local <4 x half> @test_vreinterpret_f16_p64(
// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x half>
-// CHECK-NEXT: ret <4 x half> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <1 x i64> [[A]] to <4 x half>
+// CHECK-NEXT: ret <4 x half> [[DOTCAST]]
//
float16x4_t test_vreinterpret_f16_p64(poly64x1_t a) {
return vreinterpret_f16_p64(a);
@@ -19136,8 +19136,8 @@ float16x4_t test_vreinterpret_f16_p64(poly64x1_t a) {
// CHECK-LABEL: define dso_local <2 x float> @test_vreinterpret_f32_s8(
// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <2 x float>
-// CHECK-NEXT: ret <2 x float> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x i8> [[A]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[DOTCAST]]
//
float32x2_t test_vreinterpret_f32_s8(int8x8_t a) {
return vreinterpret_f32_s8(a);
@@ -19146,8 +19146,8 @@ float32x2_t test_vreinterpret_f32_s8(int8x8_t a) {
// CHECK-LABEL: define dso_local <2 x float> @test_vreinterpret_f32_s16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <2 x float>
-// CHECK-NEXT: ret <2 x float> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x i16> [[A]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[DOTCAST]]
//
float32x2_t test_vreinterpret_f32_s16(int16x4_t a) {
return vreinterpret_f32_s16(a);
@@ -19156,8 +19156,8 @@ float32x2_t test_vreinterpret_f32_s16(int16x4_t a) {
// CHECK-LABEL: define dso_local <2 x float> @test_vreinterpret_f32_s32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <2 x float>
-// CHECK-NEXT: ret <2 x float> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x i32> [[A]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[DOTCAST]]
//
float32x2_t test_vreinterpret_f32_s32(int32x2_t a) {
return vreinterpret_f32_s32(a);
@@ -19166,8 +19166,8 @@ float32x2_t test_vreinterpret_f32_s32(int32x2_t a) {
// CHECK-LABEL: define dso_local <2 x float> @test_vreinterpret_f32_s64(
// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x float>
-// CHECK-NEXT: ret <2 x float> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <1 x i64> [[A]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[DOTCAST]]
//
float32x2_t test_vreinterpret_f32_s64(int64x1_t a) {
return vreinterpret_f32_s64(a);
@@ -19176,8 +19176,8 @@ float32x2_t test_vreinterpret_f32_s64(int64x1_t a) {
// CHECK-LABEL: define dso_local <2 x float> @test_vreinterpret_f32_u8(
// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <2 x float>
-// CHECK-NEXT: ret <2 x float> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x i8> [[A]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[DOTCAST]]
//
float32x2_t test_vreinterpret_f32_u8(uint8x8_t a) {
return vreinterpret_f32_u8(a);
@@ -19186,8 +19186,8 @@ float32x2_t test_vreinterpret_f32_u8(uint8x8_t a) {
// CHECK-LABEL: define dso_local <2 x float> @test_vreinterpret_f32_u16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <2 x float>
-// CHECK-NEXT: ret <2 x float> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x i16> [[A]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[DOTCAST]]
//
float32x2_t test_vreinterpret_f32_u16(uint16x4_t a) {
return vreinterpret_f32_u16(a);
@@ -19196,8 +19196,8 @@ float32x2_t test_vreinterpret_f32_u16(uint16x4_t a) {
// CHECK-LABEL: define dso_local <2 x float> @test_vreinterpret_f32_u32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <2 x float>
-// CHECK-NEXT: ret <2 x float> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x i32> [[A]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[DOTCAST]]
//
float32x2_t test_vreinterpret_f32_u32(uint32x2_t a) {
return vreinterpret_f32_u32(a);
@@ -19206,8 +19206,8 @@ float32x2_t test_vreinterpret_f32_u32(uint32x2_t a) {
// CHECK-LABEL: define dso_local <2 x float> @test_vreinterpret_f32_u64(
// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x float>
-// CHECK-NEXT: ret <2 x float> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <1 x i64> [[A]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[DOTCAST]]
//
float32x2_t test_vreinterpret_f32_u64(uint64x1_t a) {
return vreinterpret_f32_u64(a);
@@ -19216,8 +19216,8 @@ float32x2_t test_vreinterpret_f32_u64(uint64x1_t a) {
// CHECK-LABEL: define dso_local <2 x float> @test_vreinterpret_f32_f16(
// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <2 x float>
-// CHECK-NEXT: ret <2 x float> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x half> [[A]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[DOTCAST]]
//
float32x2_t test_vreinterpret_f32_f16(float16x4_t a) {
return vreinterpret_f32_f16(a);
@@ -19226,8 +19226,8 @@ float32x2_t test_vreinterpret_f32_f16(float16x4_t a) {
// CHECK-LABEL: define dso_local <2 x float> @test_vreinterpret_f32_f64(
// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to <2 x float>
-// CHECK-NEXT: ret <2 x float> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <1 x double> [[A]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[DOTCAST]]
//
float32x2_t test_vreinterpret_f32_f64(float64x1_t a) {
return vreinterpret_f32_f64(a);
@@ -19236,8 +19236,8 @@ float32x2_t test_vreinterpret_f32_f64(float64x1_t a) {
// CHECK-LABEL: define dso_local <2 x float> @test_vreinterpret_f32_p8(
// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <2 x float>
-// CHECK-NEXT: ret <2 x float> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x i8> [[A]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[DOTCAST]]
//
float32x2_t test_vreinterpret_f32_p8(poly8x8_t a) {
return vreinterpret_f32_p8(a);
@@ -19246,8 +19246,8 @@ float32x2_t test_vreinterpret_f32_p8(poly8x8_t a) {
// CHECK-LABEL: define dso_local <2 x float> @test_vreinterpret_f32_p16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <2 x float>
-// CHECK-NEXT: ret <2 x float> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x i16> [[A]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[DOTCAST]]
//
float32x2_t test_vreinterpret_f32_p16(poly16x4_t a) {
return vreinterpret_f32_p16(a);
@@ -19256,8 +19256,8 @@ float32x2_t test_vreinterpret_f32_p16(poly16x4_t a) {
// CHECK-LABEL: define dso_local <2 x float> @test_vreinterpret_f32_p64(
// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x float>
-// CHECK-NEXT: ret <2 x float> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <1 x i64> [[A]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[DOTCAST]]
//
float32x2_t test_vreinterpret_f32_p64(poly64x1_t a) {
return vreinterpret_f32_p64(a);
@@ -19266,8 +19266,8 @@ float32x2_t test_vreinterpret_f32_p64(poly64x1_t a) {
// CHECK-LABEL: define dso_local <1 x double> @test_vreinterpret_f64_s8(
// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <1 x double>
-// CHECK-NEXT: ret <1 x double> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x i8> [[A]] to <1 x double>
+// CHECK-NEXT: ret <1 x double> [[DOTCAST]]
//
float64x1_t test_vreinterpret_f64_s8(int8x8_t a) {
return vreinterpret_f64_s8(a);
@@ -19276,8 +19276,8 @@ float64x1_t test_vreinterpret_f64_s8(int8x8_t a) {
// CHECK-LABEL: define dso_local <1 x double> @test_vreinterpret_f64_s16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <1 x double>
-// CHECK-NEXT: ret <1 x double> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x i16> [[A]] to <1 x double>
+// CHECK-NEXT: ret <1 x double> [[DOTCAST]]
//
float64x1_t test_vreinterpret_f64_s16(int16x4_t a) {
return vreinterpret_f64_s16(a);
@@ -19286,8 +19286,8 @@ float64x1_t test_vreinterpret_f64_s16(int16x4_t a) {
// CHECK-LABEL: define dso_local <1 x double> @test_vreinterpret_f64_s32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <1 x double>
-// CHECK-NEXT: ret <1 x double> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x i32> [[A]] to <1 x double>
+// CHECK-NEXT: ret <1 x double> [[DOTCAST]]
//
float64x1_t test_vreinterpret_f64_s32(int32x2_t a) {
return vreinterpret_f64_s32(a);
@@ -19296,8 +19296,8 @@ float64x1_t test_vreinterpret_f64_s32(int32x2_t a) {
// CHECK-LABEL: define dso_local <1 x double> @test_vreinterpret_f64_s64(
// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <1 x double>
-// CHECK-NEXT: ret <1 x double> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <1 x i64> [[A]] to <1 x double>
+// CHECK-NEXT: ret <1 x double> [[DOTCAST]]
//
float64x1_t test_vreinterpret_f64_s64(int64x1_t a) {
return vreinterpret_f64_s64(a);
@@ -19306,8 +19306,8 @@ float64x1_t test_vreinterpret_f64_s64(int64x1_t a) {
// CHECK-LABEL: define dso_local <1 x double> @test_vreinterpret_f64_u8(
// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <1 x double>
-// CHECK-NEXT: ret <1 x double> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x i8> [[A]] to <1 x double>
+// CHECK-NEXT: ret <1 x double> [[DOTCAST]]
//
float64x1_t test_vreinterpret_f64_u8(uint8x8_t a) {
return vreinterpret_f64_u8(a);
@@ -19316,8 +19316,8 @@ float64x1_t test_vreinterpret_f64_u8(uint8x8_t a) {
// CHECK-LABEL: define dso_local <1 x double> @test_vreinterpret_f64_u16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <1 x double>
-// CHECK-NEXT: ret <1 x double> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x i16> [[A]] to <1 x double>
+// CHECK-NEXT: ret <1 x double> [[DOTCAST]]
//
float64x1_t test_vreinterpret_f64_u16(uint16x4_t a) {
return vreinterpret_f64_u16(a);
@@ -19326,8 +19326,8 @@ float64x1_t test_vreinterpret_f64_u16(uint16x4_t a) {
// CHECK-LABEL: define dso_local <1 x double> @test_vreinterpret_f64_u32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <1 x double>
-// CHECK-NEXT: ret <1 x double> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x i32> [[A]] to <1 x double>
+// CHECK-NEXT: ret <1 x double> [[DOTCAST]]
//
float64x1_t test_vreinterpret_f64_u32(uint32x2_t a) {
return vreinterpret_f64_u32(a);
@@ -19336,8 +19336,8 @@ float64x1_t test_vreinterpret_f64_u32(uint32x2_t a) {
// CHECK-LABEL: define dso_local <1 x double> @test_vreinterpret_f64_u64(
// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <1 x double>
-// CHECK-NEXT: ret <1 x double> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <1 x i64> [[A]] to <1 x double>
+// CHECK-NEXT: ret <1 x double> [[DOTCAST]]
//
float64x1_t test_vreinterpret_f64_u64(uint64x1_t a) {
return vreinterpret_f64_u64(a);
@@ -19346,8 +19346,8 @@ float64x1_t test_vreinterpret_f64_u64(uint64x1_t a) {
// CHECK-LABEL: define dso_local <1 x double> @test_vreinterpret_f64_f16(
// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <1 x double>
-// CHECK-NEXT: ret <1 x double> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x half> [[A]] to <1 x double>
+// CHECK-NEXT: ret <1 x double> [[DOTCAST]]
//
float64x1_t test_vreinterpret_f64_f16(float16x4_t a) {
return vreinterpret_f64_f16(a);
@@ -19356,8 +19356,8 @@ float64x1_t test_vreinterpret_f64_f16(float16x4_t a) {
// CHECK-LABEL: define dso_local <1 x double> @test_vreinterpret_f64_f32(
// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <1 x double>
-// CHECK-NEXT: ret <1 x double> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x float> [[A]] to <1 x double>
+// CHECK-NEXT: ret <1 x double> [[DOTCAST]]
//
float64x1_t test_vreinterpret_f64_f32(float32x2_t a) {
return vreinterpret_f64_f32(a);
@@ -19366,8 +19366,8 @@ float64x1_t test_vreinterpret_f64_f32(float32x2_t a) {
// CHECK-LABEL: define dso_local <1 x double> @test_vreinterpret_f64_p8(
// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <1 x double>
-// CHECK-NEXT: ret <1 x double> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x i8> [[A]] to <1 x double>
+// CHECK-NEXT: ret <1 x double> [[DOTCAST]]
//
float64x1_t test_vreinterpret_f64_p8(poly8x8_t a) {
return vreinterpret_f64_p8(a);
@@ -19376,8 +19376,8 @@ float64x1_t test_vreinterpret_f64_p8(poly8x8_t a) {
// CHECK-LABEL: define dso_local <1 x double> @test_vreinterpret_f64_p16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <1 x double>
-// CHECK-NEXT: ret <1 x double> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x i16> [[A]] to <1 x double>
+// CHECK-NEXT: ret <1 x double> [[DOTCAST]]
//
float64x1_t test_vreinterpret_f64_p16(poly16x4_t a) {
return vreinterpret_f64_p16(a);
@@ -19386,8 +19386,8 @@ float64x1_t test_vreinterpret_f64_p16(poly16x4_t a) {
// CHECK-LABEL: define dso_local <1 x double> @test_vreinterpret_f64_p64(
// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <1 x double>
-// CHECK-NEXT: ret <1 x double> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <1 x i64> [[A]] to <1 x double>
+// CHECK-NEXT: ret <1 x double> [[DOTCAST]]
//
float64x1_t test_vreinterpret_f64_p64(poly64x1_t a) {
return vreinterpret_f64_p64(a);
@@ -19405,8 +19405,8 @@ poly8x8_t test_vreinterpret_p8_s8(int8x8_t a) {
// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_p8_s16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[DOTCAST]]
//
poly8x8_t test_vreinterpret_p8_s16(int16x4_t a) {
return vreinterpret_p8_s16(a);
@@ -19415,8 +19415,8 @@ poly8x8_t test_vreinterpret_p8_s16(int16x4_t a) {
// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_p8_s32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[DOTCAST]]
//
poly8x8_t test_vreinterpret_p8_s32(int32x2_t a) {
return vreinterpret_p8_s32(a);
@@ -19425,8 +19425,8 @@ poly8x8_t test_vreinterpret_p8_s32(int32x2_t a) {
// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_p8_s64(
// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[DOTCAST]]
//
poly8x8_t test_vreinterpret_p8_s64(int64x1_t a) {
return vreinterpret_p8_s64(a);
@@ -19444,8 +19444,8 @@ poly8x8_t test_vreinterpret_p8_u8(uint8x8_t a) {
// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_p8_u16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[DOTCAST]]
//
poly8x8_t test_vreinterpret_p8_u16(uint16x4_t a) {
return vreinterpret_p8_u16(a);
@@ -19454,8 +19454,8 @@ poly8x8_t test_vreinterpret_p8_u16(uint16x4_t a) {
// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_p8_u32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
-// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[DOTCAST]]
//
poly8x8_t test_vreinterpret_p8_u32(uint32x2_t a) {
return vreinterpret_p8_u32(a);
@@ -19464,8 +19464,8 @@ poly8x8_t test_vreinterpret_p8_u32(uint32x2_t a) {
// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_p8_u64(
// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[DOTCAST]]
//
poly8x8_t test_vreinterpret_p8_u64(uint64x1_t a) {
return vreinterpret_p8_u64(a);
@@ -19474,8 +19474,8 @@ poly8x8_t test_vreinterpret_p8_u64(uint64x1_t a) {
// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_p8_f16(
// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[DOTCAST]]
//
poly8x8_t test_vreinterpret_p8_f16(float16x4_t a) {
return vreinterpret_p8_f16(a);
@@ -19484,8 +19484,8 @@ poly8x8_t test_vreinterpret_p8_f16(float16x4_t a) {
// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_p8_f32(
// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
-// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[DOTCAST]]
//
poly8x8_t test_vreinterpret_p8_f32(float32x2_t a) {
return vreinterpret_p8_f32(a);
@@ -19494,8 +19494,8 @@ poly8x8_t test_vreinterpret_p8_f32(float32x2_t a) {
// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_p8_f64(
// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to <8 x i8>
-// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <1 x double> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[DOTCAST]]
//
poly8x8_t test_vreinterpret_p8_f64(float64x1_t a) {
return vreinterpret_p8_f64(a);
@@ -19504,8 +19504,8 @@ poly8x8_t test_vreinterpret_p8_f64(float64x1_t a) {
// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_p8_p16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[DOTCAST]]
//
poly8x8_t test_vreinterpret_p8_p16(poly16x4_t a) {
return vreinterpret_p8_p16(a);
@@ -19514,8 +19514,8 @@ poly8x8_t test_vreinterpret_p8_p16(poly16x4_t a) {
// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_p8_p64(
// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[DOTCAST]]
//
poly8x8_t test_vreinterpret_p8_p64(poly64x1_t a) {
return vreinterpret_p8_p64(a);
@@ -19524,8 +19524,8 @@ poly8x8_t test_vreinterpret_p8_p64(poly64x1_t a) {
// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_p16_s8(
// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x i16>
-// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x i8> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[DOTCAST]]
//
poly16x4_t test_vreinterpret_p16_s8(int8x8_t a) {
return vreinterpret_p16_s8(a);
@@ -19543,8 +19543,8 @@ poly16x4_t test_vreinterpret_p16_s16(int16x4_t a) {
// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_p16_s32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <4 x i16>
-// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x i32> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[DOTCAST]]
//
poly16x4_t test_vreinterpret_p16_s32(int32x2_t a) {
return vreinterpret_p16_s32(a);
@@ -19553,8 +19553,8 @@ poly16x4_t test_vreinterpret_p16_s32(int32x2_t a) {
// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_p16_s64(
// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[DOTCAST]]
//
poly16x4_t test_vreinterpret_p16_s64(int64x1_t a) {
return vreinterpret_p16_s64(a);
@@ -19563,8 +19563,8 @@ poly16x4_t test_vreinterpret_p16_s64(int64x1_t a) {
// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_p16_u8(
// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x i16>
-// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x i8> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[DOTCAST]]
//
poly16x4_t test_vreinterpret_p16_u8(uint8x8_t a) {
return vreinterpret_p16_u8(a);
@@ -19582,8 +19582,8 @@ poly16x4_t test_vreinterpret_p16_u16(uint16x4_t a) {
// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_p16_u32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <4 x i16>
-// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x i32> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[DOTCAST]]
//
poly16x4_t test_vreinterpret_p16_u32(uint32x2_t a) {
return vreinterpret_p16_u32(a);
@@ -19592,8 +19592,8 @@ poly16x4_t test_vreinterpret_p16_u32(uint32x2_t a) {
// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_p16_u64(
// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[DOTCAST]]
//
poly16x4_t test_vreinterpret_p16_u64(uint64x1_t a) {
return vreinterpret_p16_u64(a);
@@ -19602,8 +19602,8 @@ poly16x4_t test_vreinterpret_p16_u64(uint64x1_t a) {
// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_p16_f16(
// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16>
-// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x half> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[DOTCAST]]
//
poly16x4_t test_vreinterpret_p16_f16(float16x4_t a) {
return vreinterpret_p16_f16(a);
@@ -19612,8 +19612,8 @@ poly16x4_t test_vreinterpret_p16_f16(float16x4_t a) {
// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_p16_f32(
// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <4 x i16>
-// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x float> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[DOTCAST]]
//
poly16x4_t test_vreinterpret_p16_f32(float32x2_t a) {
return vreinterpret_p16_f32(a);
@@ -19622,8 +19622,8 @@ poly16x4_t test_vreinterpret_p16_f32(float32x2_t a) {
// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_p16_f64(
// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to <4 x i16>
-// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <1 x double> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[DOTCAST]]
//
poly16x4_t test_vreinterpret_p16_f64(float64x1_t a) {
return vreinterpret_p16_f64(a);
@@ -19632,8 +19632,8 @@ poly16x4_t test_vreinterpret_p16_f64(float64x1_t a) {
// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_p16_p8(
// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x i16>
-// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x i8> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[DOTCAST]]
//
poly16x4_t test_vreinterpret_p16_p8(poly8x8_t a) {
return vreinterpret_p16_p8(a);
@@ -19642,8 +19642,8 @@ poly16x4_t test_vreinterpret_p16_p8(poly8x8_t a) {
// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_p16_p64(
// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[DOTCAST]]
//
poly16x4_t test_vreinterpret_p16_p64(poly64x1_t a) {
return vreinterpret_p16_p64(a);
@@ -19652,8 +19652,8 @@ poly16x4_t test_vreinterpret_p16_p64(poly64x1_t a) {
// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_p64_s8(
// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <1 x i64>
-// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x i8> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[DOTCAST]]
//
poly64x1_t test_vreinterpret_p64_s8(int8x8_t a) {
return vreinterpret_p64_s8(a);
@@ -19662,8 +19662,8 @@ poly64x1_t test_vreinterpret_p64_s8(int8x8_t a) {
// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_p64_s16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <1 x i64>
-// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x i16> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[DOTCAST]]
//
poly64x1_t test_vreinterpret_p64_s16(int16x4_t a) {
return vreinterpret_p64_s16(a);
@@ -19672,8 +19672,8 @@ poly64x1_t test_vreinterpret_p64_s16(int16x4_t a) {
// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_p64_s32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <1 x i64>
-// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x i32> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[DOTCAST]]
//
poly64x1_t test_vreinterpret_p64_s32(int32x2_t a) {
return vreinterpret_p64_s32(a);
@@ -19691,8 +19691,8 @@ poly64x1_t test_vreinterpret_p64_s64(int64x1_t a) {
// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_p64_u8(
// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <1 x i64>
-// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x i8> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[DOTCAST]]
//
poly64x1_t test_vreinterpret_p64_u8(uint8x8_t a) {
return vreinterpret_p64_u8(a);
@@ -19701,8 +19701,8 @@ poly64x1_t test_vreinterpret_p64_u8(uint8x8_t a) {
// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_p64_u16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <1 x i64>
-// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x i16> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[DOTCAST]]
//
poly64x1_t test_vreinterpret_p64_u16(uint16x4_t a) {
return vreinterpret_p64_u16(a);
@@ -19711,8 +19711,8 @@ poly64x1_t test_vreinterpret_p64_u16(uint16x4_t a) {
// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_p64_u32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <1 x i64>
-// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x i32> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[DOTCAST]]
//
poly64x1_t test_vreinterpret_p64_u32(uint32x2_t a) {
return vreinterpret_p64_u32(a);
@@ -19730,8 +19730,8 @@ poly64x1_t test_vreinterpret_p64_u64(uint64x1_t a) {
// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_p64_f16(
// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <1 x i64>
-// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x half> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[DOTCAST]]
//
poly64x1_t test_vreinterpret_p64_f16(float16x4_t a) {
return vreinterpret_p64_f16(a);
@@ -19740,8 +19740,8 @@ poly64x1_t test_vreinterpret_p64_f16(float16x4_t a) {
// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_p64_f32(
// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <1 x i64>
-// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x float> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[DOTCAST]]
//
poly64x1_t test_vreinterpret_p64_f32(float32x2_t a) {
return vreinterpret_p64_f32(a);
@@ -19750,8 +19750,8 @@ poly64x1_t test_vreinterpret_p64_f32(float32x2_t a) {
// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_p64_f64(
// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to <1 x i64>
-// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <1 x double> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[DOTCAST]]
//
poly64x1_t test_vreinterpret_p64_f64(float64x1_t a) {
return vreinterpret_p64_f64(a);
@@ -19760,8 +19760,8 @@ poly64x1_t test_vreinterpret_p64_f64(float64x1_t a) {
// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_p64_p8(
// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <1 x i64>
-// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x i8> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[DOTCAST]]
//
poly64x1_t test_vreinterpret_p64_p8(poly8x8_t a) {
return vreinterpret_p64_p8(a);
@@ -19770,8 +19770,8 @@ poly64x1_t test_vreinterpret_p64_p8(poly8x8_t a) {
// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_p64_p16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <1 x i64>
-// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x i16> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[DOTCAST]]
//
poly64x1_t test_vreinterpret_p64_p16(poly16x4_t a) {
return vreinterpret_p64_p16(a);
@@ -19780,8 +19780,8 @@ poly64x1_t test_vreinterpret_p64_p16(poly16x4_t a) {
// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_s8_s16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[DOTCAST]]
//
int8x16_t test_vreinterpretq_s8_s16(int16x8_t a) {
return vreinterpretq_s8_s16(a);
@@ -19790,8 +19790,8 @@ int8x16_t test_vreinterpretq_s8_s16(int16x8_t a) {
// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_s8_s32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[DOTCAST]]
//
int8x16_t test_vreinterpretq_s8_s32(int32x4_t a) {
return vreinterpretq_s8_s32(a);
@@ -19800,8 +19800,8 @@ int8x16_t test_vreinterpretq_s8_s32(int32x4_t a) {
// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_s8_s64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[DOTCAST]]
//
int8x16_t test_vreinterpretq_s8_s64(int64x2_t a) {
return vreinterpretq_s8_s64(a);
@@ -19819,8 +19819,8 @@ int8x16_t test_vreinterpretq_s8_u8(uint8x16_t a) {
// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_s8_u16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[DOTCAST]]
//
int8x16_t test_vreinterpretq_s8_u16(uint16x8_t a) {
return vreinterpretq_s8_u16(a);
@@ -19829,8 +19829,8 @@ int8x16_t test_vreinterpretq_s8_u16(uint16x8_t a) {
// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_s8_u32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[DOTCAST]]
//
int8x16_t test_vreinterpretq_s8_u32(uint32x4_t a) {
return vreinterpretq_s8_u32(a);
@@ -19839,8 +19839,8 @@ int8x16_t test_vreinterpretq_s8_u32(uint32x4_t a) {
// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_s8_u64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[DOTCAST]]
//
int8x16_t test_vreinterpretq_s8_u64(uint64x2_t a) {
return vreinterpretq_s8_u64(a);
@@ -19849,8 +19849,8 @@ int8x16_t test_vreinterpretq_s8_u64(uint64x2_t a) {
// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_s8_f16(
// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[DOTCAST]]
//
int8x16_t test_vreinterpretq_s8_f16(float16x8_t a) {
return vreinterpretq_s8_f16(a);
@@ -19859,8 +19859,8 @@ int8x16_t test_vreinterpretq_s8_f16(float16x8_t a) {
// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_s8_f32(
// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[DOTCAST]]
//
int8x16_t test_vreinterpretq_s8_f32(float32x4_t a) {
return vreinterpretq_s8_f32(a);
@@ -19869,8 +19869,8 @@ int8x16_t test_vreinterpretq_s8_f32(float32x4_t a) {
// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_s8_f64(
// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <16 x i8>
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x double> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[DOTCAST]]
//
int8x16_t test_vreinterpretq_s8_f64(float64x2_t a) {
return vreinterpretq_s8_f64(a);
@@ -19888,8 +19888,8 @@ int8x16_t test_vreinterpretq_s8_p8(poly8x16_t a) {
// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_s8_p16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[DOTCAST]]
//
int8x16_t test_vreinterpretq_s8_p16(poly16x8_t a) {
return vreinterpretq_s8_p16(a);
@@ -19898,8 +19898,8 @@ int8x16_t test_vreinterpretq_s8_p16(poly16x8_t a) {
// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_s8_p64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[DOTCAST]]
//
int8x16_t test_vreinterpretq_s8_p64(poly64x2_t a) {
return vreinterpretq_s8_p64(a);
@@ -19908,8 +19908,8 @@ int8x16_t test_vreinterpretq_s8_p64(poly64x2_t a) {
// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_s16_s8(
// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x i16>
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <16 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[DOTCAST]]
//
int16x8_t test_vreinterpretq_s16_s8(int8x16_t a) {
return vreinterpretq_s16_s8(a);
@@ -19918,8 +19918,8 @@ int16x8_t test_vreinterpretq_s16_s8(int8x16_t a) {
// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_s16_s32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16>
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[DOTCAST]]
//
int16x8_t test_vreinterpretq_s16_s32(int32x4_t a) {
return vreinterpretq_s16_s32(a);
@@ -19928,8 +19928,8 @@ int16x8_t test_vreinterpretq_s16_s32(int32x4_t a) {
// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_s16_s64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <8 x i16>
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x i64> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[DOTCAST]]
//
int16x8_t test_vreinterpretq_s16_s64(int64x2_t a) {
return vreinterpretq_s16_s64(a);
@@ -19938,8 +19938,8 @@ int16x8_t test_vreinterpretq_s16_s64(int64x2_t a) {
// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_s16_u8(
// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x i16>
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <16 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[DOTCAST]]
//
int16x8_t test_vreinterpretq_s16_u8(uint8x16_t a) {
return vreinterpretq_s16_u8(a);
@@ -19957,8 +19957,8 @@ int16x8_t test_vreinterpretq_s16_u16(uint16x8_t a) {
// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_s16_u32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16>
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[DOTCAST]]
//
int16x8_t test_vreinterpretq_s16_u32(uint32x4_t a) {
return vreinterpretq_s16_u32(a);
@@ -19967,8 +19967,8 @@ int16x8_t test_vreinterpretq_s16_u32(uint32x4_t a) {
// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_s16_u64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <8 x i16>
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x i64> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[DOTCAST]]
//
int16x8_t test_vreinterpretq_s16_u64(uint64x2_t a) {
return vreinterpretq_s16_u64(a);
@@ -19977,8 +19977,8 @@ int16x8_t test_vreinterpretq_s16_u64(uint64x2_t a) {
// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_s16_f16(
// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16>
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x half> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[DOTCAST]]
//
int16x8_t test_vreinterpretq_s16_f16(float16x8_t a) {
return vreinterpretq_s16_f16(a);
@@ -19987,8 +19987,8 @@ int16x8_t test_vreinterpretq_s16_f16(float16x8_t a) {
// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_s16_f32(
// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <8 x i16>
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x float> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[DOTCAST]]
//
int16x8_t test_vreinterpretq_s16_f32(float32x4_t a) {
return vreinterpretq_s16_f32(a);
@@ -19997,8 +19997,8 @@ int16x8_t test_vreinterpretq_s16_f32(float32x4_t a) {
// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_s16_f64(
// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <8 x i16>
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x double> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[DOTCAST]]
//
int16x8_t test_vreinterpretq_s16_f64(float64x2_t a) {
return vreinterpretq_s16_f64(a);
@@ -20007,8 +20007,8 @@ int16x8_t test_vreinterpretq_s16_f64(float64x2_t a) {
// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_s16_p8(
// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x i16>
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <16 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[DOTCAST]]
//
int16x8_t test_vreinterpretq_s16_p8(poly8x16_t a) {
return vreinterpretq_s16_p8(a);
@@ -20026,8 +20026,8 @@ int16x8_t test_vreinterpretq_s16_p16(poly16x8_t a) {
// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_s16_p64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <8 x i16>
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x i64> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[DOTCAST]]
//
int16x8_t test_vreinterpretq_s16_p64(poly64x2_t a) {
return vreinterpretq_s16_p64(a);
@@ -20036,8 +20036,8 @@ int16x8_t test_vreinterpretq_s16_p64(poly64x2_t a) {
// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_s32_s8(
// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <4 x i32>
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <16 x i8> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[DOTCAST]]
//
int32x4_t test_vreinterpretq_s32_s8(int8x16_t a) {
return vreinterpretq_s32_s8(a);
@@ -20046,8 +20046,8 @@ int32x4_t test_vreinterpretq_s32_s8(int8x16_t a) {
// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_s32_s16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <4 x i32>
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x i16> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[DOTCAST]]
//
int32x4_t test_vreinterpretq_s32_s16(int16x8_t a) {
return vreinterpretq_s32_s16(a);
@@ -20056,8 +20056,8 @@ int32x4_t test_vreinterpretq_s32_s16(int16x8_t a) {
// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_s32_s64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <4 x i32>
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x i64> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[DOTCAST]]
//
int32x4_t test_vreinterpretq_s32_s64(int64x2_t a) {
return vreinterpretq_s32_s64(a);
@@ -20066,8 +20066,8 @@ int32x4_t test_vreinterpretq_s32_s64(int64x2_t a) {
// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_s32_u8(
// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <4 x i32>
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <16 x i8> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[DOTCAST]]
//
int32x4_t test_vreinterpretq_s32_u8(uint8x16_t a) {
return vreinterpretq_s32_u8(a);
@@ -20076,8 +20076,8 @@ int32x4_t test_vreinterpretq_s32_u8(uint8x16_t a) {
// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_s32_u16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <4 x i32>
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x i16> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[DOTCAST]]
//
int32x4_t test_vreinterpretq_s32_u16(uint16x8_t a) {
return vreinterpretq_s32_u16(a);
@@ -20095,8 +20095,8 @@ int32x4_t test_vreinterpretq_s32_u32(uint32x4_t a) {
// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_s32_u64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <4 x i32>
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x i64> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[DOTCAST]]
//
int32x4_t test_vreinterpretq_s32_u64(uint64x2_t a) {
return vreinterpretq_s32_u64(a);
@@ -20105,8 +20105,8 @@ int32x4_t test_vreinterpretq_s32_u64(uint64x2_t a) {
// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_s32_f16(
// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <4 x i32>
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x half> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[DOTCAST]]
//
int32x4_t test_vreinterpretq_s32_f16(float16x8_t a) {
return vreinterpretq_s32_f16(a);
@@ -20115,8 +20115,8 @@ int32x4_t test_vreinterpretq_s32_f16(float16x8_t a) {
// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_s32_f32(
// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32>
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x float> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[DOTCAST]]
//
int32x4_t test_vreinterpretq_s32_f32(float32x4_t a) {
return vreinterpretq_s32_f32(a);
@@ -20125,8 +20125,8 @@ int32x4_t test_vreinterpretq_s32_f32(float32x4_t a) {
// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_s32_f64(
// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <4 x i32>
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x double> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[DOTCAST]]
//
int32x4_t test_vreinterpretq_s32_f64(float64x2_t a) {
return vreinterpretq_s32_f64(a);
@@ -20135,8 +20135,8 @@ int32x4_t test_vreinterpretq_s32_f64(float64x2_t a) {
// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_s32_p8(
// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <4 x i32>
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <16 x i8> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[DOTCAST]]
//
int32x4_t test_vreinterpretq_s32_p8(poly8x16_t a) {
return vreinterpretq_s32_p8(a);
@@ -20145,8 +20145,8 @@ int32x4_t test_vreinterpretq_s32_p8(poly8x16_t a) {
// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_s32_p16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <4 x i32>
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x i16> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[DOTCAST]]
//
int32x4_t test_vreinterpretq_s32_p16(poly16x8_t a) {
return vreinterpretq_s32_p16(a);
@@ -20155,8 +20155,8 @@ int32x4_t test_vreinterpretq_s32_p16(poly16x8_t a) {
// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_s32_p64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <4 x i32>
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x i64> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[DOTCAST]]
//
int32x4_t test_vreinterpretq_s32_p64(poly64x2_t a) {
return vreinterpretq_s32_p64(a);
@@ -20165,8 +20165,8 @@ int32x4_t test_vreinterpretq_s32_p64(poly64x2_t a) {
// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_s64_s8(
// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <2 x i64>
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <16 x i8> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[DOTCAST]]
//
int64x2_t test_vreinterpretq_s64_s8(int8x16_t a) {
return vreinterpretq_s64_s8(a);
@@ -20175,8 +20175,8 @@ int64x2_t test_vreinterpretq_s64_s8(int8x16_t a) {
// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_s64_s16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <2 x i64>
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x i16> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[DOTCAST]]
//
int64x2_t test_vreinterpretq_s64_s16(int16x8_t a) {
return vreinterpretq_s64_s16(a);
@@ -20185,8 +20185,8 @@ int64x2_t test_vreinterpretq_s64_s16(int16x8_t a) {
// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_s64_s32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <2 x i64>
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x i32> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[DOTCAST]]
//
int64x2_t test_vreinterpretq_s64_s32(int32x4_t a) {
return vreinterpretq_s64_s32(a);
@@ -20195,8 +20195,8 @@ int64x2_t test_vreinterpretq_s64_s32(int32x4_t a) {
// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_s64_u8(
// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <2 x i64>
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <16 x i8> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[DOTCAST]]
//
int64x2_t test_vreinterpretq_s64_u8(uint8x16_t a) {
return vreinterpretq_s64_u8(a);
@@ -20205,8 +20205,8 @@ int64x2_t test_vreinterpretq_s64_u8(uint8x16_t a) {
// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_s64_u16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <2 x i64>
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x i16> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[DOTCAST]]
//
int64x2_t test_vreinterpretq_s64_u16(uint16x8_t a) {
return vreinterpretq_s64_u16(a);
@@ -20215,8 +20215,8 @@ int64x2_t test_vreinterpretq_s64_u16(uint16x8_t a) {
// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_s64_u32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <2 x i64>
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x i32> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[DOTCAST]]
//
int64x2_t test_vreinterpretq_s64_u32(uint32x4_t a) {
return vreinterpretq_s64_u32(a);
@@ -20234,8 +20234,8 @@ int64x2_t test_vreinterpretq_s64_u64(uint64x2_t a) {
// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_s64_f16(
// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <2 x i64>
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x half> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[DOTCAST]]
//
int64x2_t test_vreinterpretq_s64_f16(float16x8_t a) {
return vreinterpretq_s64_f16(a);
@@ -20244,8 +20244,8 @@ int64x2_t test_vreinterpretq_s64_f16(float16x8_t a) {
// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_s64_f32(
// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <2 x i64>
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x float> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[DOTCAST]]
//
int64x2_t test_vreinterpretq_s64_f32(float32x4_t a) {
return vreinterpretq_s64_f32(a);
@@ -20254,8 +20254,8 @@ int64x2_t test_vreinterpretq_s64_f32(float32x4_t a) {
// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_s64_f64(
// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <2 x i64>
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x double> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[DOTCAST]]
//
int64x2_t test_vreinterpretq_s64_f64(float64x2_t a) {
return vreinterpretq_s64_f64(a);
@@ -20264,8 +20264,8 @@ int64x2_t test_vreinterpretq_s64_f64(float64x2_t a) {
// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_s64_p8(
// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <2 x i64>
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <16 x i8> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[DOTCAST]]
//
int64x2_t test_vreinterpretq_s64_p8(poly8x16_t a) {
return vreinterpretq_s64_p8(a);
@@ -20274,8 +20274,8 @@ int64x2_t test_vreinterpretq_s64_p8(poly8x16_t a) {
// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_s64_p16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <2 x i64>
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x i16> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[DOTCAST]]
//
int64x2_t test_vreinterpretq_s64_p16(poly16x8_t a) {
return vreinterpretq_s64_p16(a);
@@ -20302,8 +20302,8 @@ uint8x16_t test_vreinterpretq_u8_s8(int8x16_t a) {
// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_u8_s16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[DOTCAST]]
//
uint8x16_t test_vreinterpretq_u8_s16(int16x8_t a) {
return vreinterpretq_u8_s16(a);
@@ -20312,8 +20312,8 @@ uint8x16_t test_vreinterpretq_u8_s16(int16x8_t a) {
// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_u8_s32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[DOTCAST]]
//
uint8x16_t test_vreinterpretq_u8_s32(int32x4_t a) {
return vreinterpretq_u8_s32(a);
@@ -20322,8 +20322,8 @@ uint8x16_t test_vreinterpretq_u8_s32(int32x4_t a) {
// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_u8_s64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[DOTCAST]]
//
uint8x16_t test_vreinterpretq_u8_s64(int64x2_t a) {
return vreinterpretq_u8_s64(a);
@@ -20332,8 +20332,8 @@ uint8x16_t test_vreinterpretq_u8_s64(int64x2_t a) {
// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_u8_u16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[DOTCAST]]
//
uint8x16_t test_vreinterpretq_u8_u16(uint16x8_t a) {
return vreinterpretq_u8_u16(a);
@@ -20342,8 +20342,8 @@ uint8x16_t test_vreinterpretq_u8_u16(uint16x8_t a) {
// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_u8_u32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[DOTCAST]]
//
uint8x16_t test_vreinterpretq_u8_u32(uint32x4_t a) {
return vreinterpretq_u8_u32(a);
@@ -20352,8 +20352,8 @@ uint8x16_t test_vreinterpretq_u8_u32(uint32x4_t a) {
// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_u8_u64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[DOTCAST]]
//
uint8x16_t test_vreinterpretq_u8_u64(uint64x2_t a) {
return vreinterpretq_u8_u64(a);
@@ -20362,8 +20362,8 @@ uint8x16_t test_vreinterpretq_u8_u64(uint64x2_t a) {
// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_u8_f16(
// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[DOTCAST]]
//
uint8x16_t test_vreinterpretq_u8_f16(float16x8_t a) {
return vreinterpretq_u8_f16(a);
@@ -20372,8 +20372,8 @@ uint8x16_t test_vreinterpretq_u8_f16(float16x8_t a) {
// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_u8_f32(
// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[DOTCAST]]
//
uint8x16_t test_vreinterpretq_u8_f32(float32x4_t a) {
return vreinterpretq_u8_f32(a);
@@ -20382,8 +20382,8 @@ uint8x16_t test_vreinterpretq_u8_f32(float32x4_t a) {
// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_u8_f64(
// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <16 x i8>
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x double> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[DOTCAST]]
//
uint8x16_t test_vreinterpretq_u8_f64(float64x2_t a) {
return vreinterpretq_u8_f64(a);
@@ -20401,8 +20401,8 @@ uint8x16_t test_vreinterpretq_u8_p8(poly8x16_t a) {
// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_u8_p16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[DOTCAST]]
//
uint8x16_t test_vreinterpretq_u8_p16(poly16x8_t a) {
return vreinterpretq_u8_p16(a);
@@ -20411,8 +20411,8 @@ uint8x16_t test_vreinterpretq_u8_p16(poly16x8_t a) {
// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_u8_p64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[DOTCAST]]
//
uint8x16_t test_vreinterpretq_u8_p64(poly64x2_t a) {
return vreinterpretq_u8_p64(a);
@@ -20421,8 +20421,8 @@ uint8x16_t test_vreinterpretq_u8_p64(poly64x2_t a) {
// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_u16_s8(
// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x i16>
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <16 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[DOTCAST]]
//
uint16x8_t test_vreinterpretq_u16_s8(int8x16_t a) {
return vreinterpretq_u16_s8(a);
@@ -20440,8 +20440,8 @@ uint16x8_t test_vreinterpretq_u16_s16(int16x8_t a) {
// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_u16_s32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16>
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[DOTCAST]]
//
uint16x8_t test_vreinterpretq_u16_s32(int32x4_t a) {
return vreinterpretq_u16_s32(a);
@@ -20450,8 +20450,8 @@ uint16x8_t test_vreinterpretq_u16_s32(int32x4_t a) {
// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_u16_s64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <8 x i16>
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x i64> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[DOTCAST]]
//
uint16x8_t test_vreinterpretq_u16_s64(int64x2_t a) {
return vreinterpretq_u16_s64(a);
@@ -20460,8 +20460,8 @@ uint16x8_t test_vreinterpretq_u16_s64(int64x2_t a) {
// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_u16_u8(
// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x i16>
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <16 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[DOTCAST]]
//
uint16x8_t test_vreinterpretq_u16_u8(uint8x16_t a) {
return vreinterpretq_u16_u8(a);
@@ -20470,8 +20470,8 @@ uint16x8_t test_vreinterpretq_u16_u8(uint8x16_t a) {
// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_u16_u32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16>
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[DOTCAST]]
//
uint16x8_t test_vreinterpretq_u16_u32(uint32x4_t a) {
return vreinterpretq_u16_u32(a);
@@ -20480,8 +20480,8 @@ uint16x8_t test_vreinterpretq_u16_u32(uint32x4_t a) {
// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_u16_u64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <8 x i16>
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x i64> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[DOTCAST]]
//
uint16x8_t test_vreinterpretq_u16_u64(uint64x2_t a) {
return vreinterpretq_u16_u64(a);
@@ -20490,8 +20490,8 @@ uint16x8_t test_vreinterpretq_u16_u64(uint64x2_t a) {
// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_u16_f16(
// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16>
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x half> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[DOTCAST]]
//
uint16x8_t test_vreinterpretq_u16_f16(float16x8_t a) {
return vreinterpretq_u16_f16(a);
@@ -20500,8 +20500,8 @@ uint16x8_t test_vreinterpretq_u16_f16(float16x8_t a) {
// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_u16_f32(
// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <8 x i16>
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x float> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[DOTCAST]]
//
uint16x8_t test_vreinterpretq_u16_f32(float32x4_t a) {
return vreinterpretq_u16_f32(a);
@@ -20510,8 +20510,8 @@ uint16x8_t test_vreinterpretq_u16_f32(float32x4_t a) {
// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_u16_f64(
// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <8 x i16>
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x double> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[DOTCAST]]
//
uint16x8_t test_vreinterpretq_u16_f64(float64x2_t a) {
return vreinterpretq_u16_f64(a);
@@ -20520,8 +20520,8 @@ uint16x8_t test_vreinterpretq_u16_f64(float64x2_t a) {
// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_u16_p8(
// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x i16>
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <16 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[DOTCAST]]
//
uint16x8_t test_vreinterpretq_u16_p8(poly8x16_t a) {
return vreinterpretq_u16_p8(a);
@@ -20539,8 +20539,8 @@ uint16x8_t test_vreinterpretq_u16_p16(poly16x8_t a) {
// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_u16_p64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <8 x i16>
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x i64> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[DOTCAST]]
//
uint16x8_t test_vreinterpretq_u16_p64(poly64x2_t a) {
return vreinterpretq_u16_p64(a);
@@ -20549,8 +20549,8 @@ uint16x8_t test_vreinterpretq_u16_p64(poly64x2_t a) {
// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_u32_s8(
// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <4 x i32>
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <16 x i8> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[DOTCAST]]
//
uint32x4_t test_vreinterpretq_u32_s8(int8x16_t a) {
return vreinterpretq_u32_s8(a);
@@ -20559,8 +20559,8 @@ uint32x4_t test_vreinterpretq_u32_s8(int8x16_t a) {
// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_u32_s16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <4 x i32>
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x i16> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[DOTCAST]]
//
uint32x4_t test_vreinterpretq_u32_s16(int16x8_t a) {
return vreinterpretq_u32_s16(a);
@@ -20578,8 +20578,8 @@ uint32x4_t test_vreinterpretq_u32_s32(int32x4_t a) {
// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_u32_s64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <4 x i32>
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x i64> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[DOTCAST]]
//
uint32x4_t test_vreinterpretq_u32_s64(int64x2_t a) {
return vreinterpretq_u32_s64(a);
@@ -20588,8 +20588,8 @@ uint32x4_t test_vreinterpretq_u32_s64(int64x2_t a) {
// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_u32_u8(
// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <4 x i32>
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <16 x i8> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[DOTCAST]]
//
uint32x4_t test_vreinterpretq_u32_u8(uint8x16_t a) {
return vreinterpretq_u32_u8(a);
@@ -20598,8 +20598,8 @@ uint32x4_t test_vreinterpretq_u32_u8(uint8x16_t a) {
// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_u32_u16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <4 x i32>
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x i16> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[DOTCAST]]
//
uint32x4_t test_vreinterpretq_u32_u16(uint16x8_t a) {
return vreinterpretq_u32_u16(a);
@@ -20608,8 +20608,8 @@ uint32x4_t test_vreinterpretq_u32_u16(uint16x8_t a) {
// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_u32_u64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <4 x i32>
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x i64> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[DOTCAST]]
//
uint32x4_t test_vreinterpretq_u32_u64(uint64x2_t a) {
return vreinterpretq_u32_u64(a);
@@ -20618,8 +20618,8 @@ uint32x4_t test_vreinterpretq_u32_u64(uint64x2_t a) {
// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_u32_f16(
// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <4 x i32>
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x half> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[DOTCAST]]
//
uint32x4_t test_vreinterpretq_u32_f16(float16x8_t a) {
return vreinterpretq_u32_f16(a);
@@ -20628,8 +20628,8 @@ uint32x4_t test_vreinterpretq_u32_f16(float16x8_t a) {
// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_u32_f32(
// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32>
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x float> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[DOTCAST]]
//
uint32x4_t test_vreinterpretq_u32_f32(float32x4_t a) {
return vreinterpretq_u32_f32(a);
@@ -20638,8 +20638,8 @@ uint32x4_t test_vreinterpretq_u32_f32(float32x4_t a) {
// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_u32_f64(
// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <4 x i32>
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x double> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[DOTCAST]]
//
uint32x4_t test_vreinterpretq_u32_f64(float64x2_t a) {
return vreinterpretq_u32_f64(a);
@@ -20648,8 +20648,8 @@ uint32x4_t test_vreinterpretq_u32_f64(float64x2_t a) {
// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_u32_p8(
// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <4 x i32>
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <16 x i8> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[DOTCAST]]
//
uint32x4_t test_vreinterpretq_u32_p8(poly8x16_t a) {
return vreinterpretq_u32_p8(a);
@@ -20658,8 +20658,8 @@ uint32x4_t test_vreinterpretq_u32_p8(poly8x16_t a) {
// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_u32_p16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <4 x i32>
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x i16> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[DOTCAST]]
//
uint32x4_t test_vreinterpretq_u32_p16(poly16x8_t a) {
return vreinterpretq_u32_p16(a);
@@ -20668,8 +20668,8 @@ uint32x4_t test_vreinterpretq_u32_p16(poly16x8_t a) {
// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_u32_p64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <4 x i32>
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x i64> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[DOTCAST]]
//
uint32x4_t test_vreinterpretq_u32_p64(poly64x2_t a) {
return vreinterpretq_u32_p64(a);
@@ -20678,8 +20678,8 @@ uint32x4_t test_vreinterpretq_u32_p64(poly64x2_t a) {
// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_u64_s8(
// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <2 x i64>
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <16 x i8> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[DOTCAST]]
//
uint64x2_t test_vreinterpretq_u64_s8(int8x16_t a) {
return vreinterpretq_u64_s8(a);
@@ -20688,8 +20688,8 @@ uint64x2_t test_vreinterpretq_u64_s8(int8x16_t a) {
// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_u64_s16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <2 x i64>
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x i16> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[DOTCAST]]
//
uint64x2_t test_vreinterpretq_u64_s16(int16x8_t a) {
return vreinterpretq_u64_s16(a);
@@ -20698,8 +20698,8 @@ uint64x2_t test_vreinterpretq_u64_s16(int16x8_t a) {
// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_u64_s32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <2 x i64>
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x i32> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[DOTCAST]]
//
uint64x2_t test_vreinterpretq_u64_s32(int32x4_t a) {
return vreinterpretq_u64_s32(a);
@@ -20717,8 +20717,8 @@ uint64x2_t test_vreinterpretq_u64_s64(int64x2_t a) {
// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_u64_u8(
// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <2 x i64>
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <16 x i8> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[DOTCAST]]
//
uint64x2_t test_vreinterpretq_u64_u8(uint8x16_t a) {
return vreinterpretq_u64_u8(a);
@@ -20727,8 +20727,8 @@ uint64x2_t test_vreinterpretq_u64_u8(uint8x16_t a) {
// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_u64_u16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <2 x i64>
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x i16> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[DOTCAST]]
//
uint64x2_t test_vreinterpretq_u64_u16(uint16x8_t a) {
return vreinterpretq_u64_u16(a);
@@ -20737,8 +20737,8 @@ uint64x2_t test_vreinterpretq_u64_u16(uint16x8_t a) {
// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_u64_u32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <2 x i64>
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x i32> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[DOTCAST]]
//
uint64x2_t test_vreinterpretq_u64_u32(uint32x4_t a) {
return vreinterpretq_u64_u32(a);
@@ -20747,8 +20747,8 @@ uint64x2_t test_vreinterpretq_u64_u32(uint32x4_t a) {
// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_u64_f16(
// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <2 x i64>
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x half> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[DOTCAST]]
//
uint64x2_t test_vreinterpretq_u64_f16(float16x8_t a) {
return vreinterpretq_u64_f16(a);
@@ -20757,8 +20757,8 @@ uint64x2_t test_vreinterpretq_u64_f16(float16x8_t a) {
// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_u64_f32(
// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <2 x i64>
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x float> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[DOTCAST]]
//
uint64x2_t test_vreinterpretq_u64_f32(float32x4_t a) {
return vreinterpretq_u64_f32(a);
@@ -20767,8 +20767,8 @@ uint64x2_t test_vreinterpretq_u64_f32(float32x4_t a) {
// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_u64_f64(
// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <2 x i64>
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x double> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[DOTCAST]]
//
uint64x2_t test_vreinterpretq_u64_f64(float64x2_t a) {
return vreinterpretq_u64_f64(a);
@@ -20777,8 +20777,8 @@ uint64x2_t test_vreinterpretq_u64_f64(float64x2_t a) {
// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_u64_p8(
// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <2 x i64>
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <16 x i8> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[DOTCAST]]
//
uint64x2_t test_vreinterpretq_u64_p8(poly8x16_t a) {
return vreinterpretq_u64_p8(a);
@@ -20787,8 +20787,8 @@ uint64x2_t test_vreinterpretq_u64_p8(poly8x16_t a) {
// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_u64_p16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <2 x i64>
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x i16> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[DOTCAST]]
//
uint64x2_t test_vreinterpretq_u64_p16(poly16x8_t a) {
return vreinterpretq_u64_p16(a);
@@ -20806,8 +20806,8 @@ uint64x2_t test_vreinterpretq_u64_p64(poly64x2_t a) {
// CHECK-LABEL: define dso_local <8 x half> @test_vreinterpretq_f16_s8(
// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x half>
-// CHECK-NEXT: ret <8 x half> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <16 x i8> [[A]] to <8 x half>
+// CHECK-NEXT: ret <8 x half> [[DOTCAST]]
//
float16x8_t test_vreinterpretq_f16_s8(int8x16_t a) {
return vreinterpretq_f16_s8(a);
@@ -20816,8 +20816,8 @@ float16x8_t test_vreinterpretq_f16_s8(int8x16_t a) {
// CHECK-LABEL: define dso_local <8 x half> @test_vreinterpretq_f16_s16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <8 x half>
-// CHECK-NEXT: ret <8 x half> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x i16> [[A]] to <8 x half>
+// CHECK-NEXT: ret <8 x half> [[DOTCAST]]
//
float16x8_t test_vreinterpretq_f16_s16(int16x8_t a) {
return vreinterpretq_f16_s16(a);
@@ -20826,8 +20826,8 @@ float16x8_t test_vreinterpretq_f16_s16(int16x8_t a) {
// CHECK-LABEL: define dso_local <8 x half> @test_vreinterpretq_f16_s32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x half>
-// CHECK-NEXT: ret <8 x half> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x i32> [[A]] to <8 x half>
+// CHECK-NEXT: ret <8 x half> [[DOTCAST]]
//
float16x8_t test_vreinterpretq_f16_s32(int32x4_t a) {
return vreinterpretq_f16_s32(a);
@@ -20836,8 +20836,8 @@ float16x8_t test_vreinterpretq_f16_s32(int32x4_t a) {
// CHECK-LABEL: define dso_local <8 x half> @test_vreinterpretq_f16_s64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <8 x half>
-// CHECK-NEXT: ret <8 x half> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x i64> [[A]] to <8 x half>
+// CHECK-NEXT: ret <8 x half> [[DOTCAST]]
//
float16x8_t test_vreinterpretq_f16_s64(int64x2_t a) {
return vreinterpretq_f16_s64(a);
@@ -20846,8 +20846,8 @@ float16x8_t test_vreinterpretq_f16_s64(int64x2_t a) {
// CHECK-LABEL: define dso_local <8 x half> @test_vreinterpretq_f16_u8(
// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x half>
-// CHECK-NEXT: ret <8 x half> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <16 x i8> [[A]] to <8 x half>
+// CHECK-NEXT: ret <8 x half> [[DOTCAST]]
//
float16x8_t test_vreinterpretq_f16_u8(uint8x16_t a) {
return vreinterpretq_f16_u8(a);
@@ -20856,8 +20856,8 @@ float16x8_t test_vreinterpretq_f16_u8(uint8x16_t a) {
// CHECK-LABEL: define dso_local <8 x half> @test_vreinterpretq_f16_u16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <8 x half>
-// CHECK-NEXT: ret <8 x half> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x i16> [[A]] to <8 x half>
+// CHECK-NEXT: ret <8 x half> [[DOTCAST]]
//
float16x8_t test_vreinterpretq_f16_u16(uint16x8_t a) {
return vreinterpretq_f16_u16(a);
@@ -20866,8 +20866,8 @@ float16x8_t test_vreinterpretq_f16_u16(uint16x8_t a) {
// CHECK-LABEL: define dso_local <8 x half> @test_vreinterpretq_f16_u32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x half>
-// CHECK-NEXT: ret <8 x half> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x i32> [[A]] to <8 x half>
+// CHECK-NEXT: ret <8 x half> [[DOTCAST]]
//
float16x8_t test_vreinterpretq_f16_u32(uint32x4_t a) {
return vreinterpretq_f16_u32(a);
@@ -20876,8 +20876,8 @@ float16x8_t test_vreinterpretq_f16_u32(uint32x4_t a) {
// CHECK-LABEL: define dso_local <8 x half> @test_vreinterpretq_f16_u64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <8 x half>
-// CHECK-NEXT: ret <8 x half> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x i64> [[A]] to <8 x half>
+// CHECK-NEXT: ret <8 x half> [[DOTCAST]]
//
float16x8_t test_vreinterpretq_f16_u64(uint64x2_t a) {
return vreinterpretq_f16_u64(a);
@@ -20886,8 +20886,8 @@ float16x8_t test_vreinterpretq_f16_u64(uint64x2_t a) {
// CHECK-LABEL: define dso_local <8 x half> @test_vreinterpretq_f16_f32(
// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <8 x half>
-// CHECK-NEXT: ret <8 x half> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x float> [[A]] to <8 x half>
+// CHECK-NEXT: ret <8 x half> [[DOTCAST]]
//
float16x8_t test_vreinterpretq_f16_f32(float32x4_t a) {
return vreinterpretq_f16_f32(a);
@@ -20896,8 +20896,8 @@ float16x8_t test_vreinterpretq_f16_f32(float32x4_t a) {
// CHECK-LABEL: define dso_local <8 x half> @test_vreinterpretq_f16_f64(
// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <8 x half>
-// CHECK-NEXT: ret <8 x half> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x double> [[A]] to <8 x half>
+// CHECK-NEXT: ret <8 x half> [[DOTCAST]]
//
float16x8_t test_vreinterpretq_f16_f64(float64x2_t a) {
return vreinterpretq_f16_f64(a);
@@ -20906,8 +20906,8 @@ float16x8_t test_vreinterpretq_f16_f64(float64x2_t a) {
// CHECK-LABEL: define dso_local <8 x half> @test_vreinterpretq_f16_p8(
// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x half>
-// CHECK-NEXT: ret <8 x half> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <16 x i8> [[A]] to <8 x half>
+// CHECK-NEXT: ret <8 x half> [[DOTCAST]]
//
float16x8_t test_vreinterpretq_f16_p8(poly8x16_t a) {
return vreinterpretq_f16_p8(a);
@@ -20916,8 +20916,8 @@ float16x8_t test_vreinterpretq_f16_p8(poly8x16_t a) {
// CHECK-LABEL: define dso_local <8 x half> @test_vreinterpretq_f16_p16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <8 x half>
-// CHECK-NEXT: ret <8 x half> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x i16> [[A]] to <8 x half>
+// CHECK-NEXT: ret <8 x half> [[DOTCAST]]
//
float16x8_t test_vreinterpretq_f16_p16(poly16x8_t a) {
return vreinterpretq_f16_p16(a);
@@ -20926,8 +20926,8 @@ float16x8_t test_vreinterpretq_f16_p16(poly16x8_t a) {
// CHECK-LABEL: define dso_local <8 x half> @test_vreinterpretq_f16_p64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <8 x half>
-// CHECK-NEXT: ret <8 x half> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x i64> [[A]] to <8 x half>
+// CHECK-NEXT: ret <8 x half> [[DOTCAST]]
//
float16x8_t test_vreinterpretq_f16_p64(poly64x2_t a) {
return vreinterpretq_f16_p64(a);
@@ -20936,8 +20936,8 @@ float16x8_t test_vreinterpretq_f16_p64(poly64x2_t a) {
// CHECK-LABEL: define dso_local <4 x float> @test_vreinterpretq_f32_s8(
// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <4 x float>
-// CHECK-NEXT: ret <4 x float> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <16 x i8> [[A]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[DOTCAST]]
//
float32x4_t test_vreinterpretq_f32_s8(int8x16_t a) {
return vreinterpretq_f32_s8(a);
@@ -20946,8 +20946,8 @@ float32x4_t test_vreinterpretq_f32_s8(int8x16_t a) {
// CHECK-LABEL: define dso_local <4 x float> @test_vreinterpretq_f32_s16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <4 x float>
-// CHECK-NEXT: ret <4 x float> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x i16> [[A]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[DOTCAST]]
//
float32x4_t test_vreinterpretq_f32_s16(int16x8_t a) {
return vreinterpretq_f32_s16(a);
@@ -20956,8 +20956,8 @@ float32x4_t test_vreinterpretq_f32_s16(int16x8_t a) {
// CHECK-LABEL: define dso_local <4 x float> @test_vreinterpretq_f32_s32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <4 x float>
-// CHECK-NEXT: ret <4 x float> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x i32> [[A]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[DOTCAST]]
//
float32x4_t test_vreinterpretq_f32_s32(int32x4_t a) {
return vreinterpretq_f32_s32(a);
@@ -20966,8 +20966,8 @@ float32x4_t test_vreinterpretq_f32_s32(int32x4_t a) {
// CHECK-LABEL: define dso_local <4 x float> @test_vreinterpretq_f32_s64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <4 x float>
-// CHECK-NEXT: ret <4 x float> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x i64> [[A]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[DOTCAST]]
//
float32x4_t test_vreinterpretq_f32_s64(int64x2_t a) {
return vreinterpretq_f32_s64(a);
@@ -20976,8 +20976,8 @@ float32x4_t test_vreinterpretq_f32_s64(int64x2_t a) {
// CHECK-LABEL: define dso_local <4 x float> @test_vreinterpretq_f32_u8(
// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <4 x float>
-// CHECK-NEXT: ret <4 x float> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <16 x i8> [[A]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[DOTCAST]]
//
float32x4_t test_vreinterpretq_f32_u8(uint8x16_t a) {
return vreinterpretq_f32_u8(a);
@@ -20986,8 +20986,8 @@ float32x4_t test_vreinterpretq_f32_u8(uint8x16_t a) {
// CHECK-LABEL: define dso_local <4 x float> @test_vreinterpretq_f32_u16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <4 x float>
-// CHECK-NEXT: ret <4 x float> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x i16> [[A]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[DOTCAST]]
//
float32x4_t test_vreinterpretq_f32_u16(uint16x8_t a) {
return vreinterpretq_f32_u16(a);
@@ -20996,8 +20996,8 @@ float32x4_t test_vreinterpretq_f32_u16(uint16x8_t a) {
// CHECK-LABEL: define dso_local <4 x float> @test_vreinterpretq_f32_u32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <4 x float>
-// CHECK-NEXT: ret <4 x float> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x i32> [[A]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[DOTCAST]]
//
float32x4_t test_vreinterpretq_f32_u32(uint32x4_t a) {
return vreinterpretq_f32_u32(a);
@@ -21006,8 +21006,8 @@ float32x4_t test_vreinterpretq_f32_u32(uint32x4_t a) {
// CHECK-LABEL: define dso_local <4 x float> @test_vreinterpretq_f32_u64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <4 x float>
-// CHECK-NEXT: ret <4 x float> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x i64> [[A]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[DOTCAST]]
//
float32x4_t test_vreinterpretq_f32_u64(uint64x2_t a) {
return vreinterpretq_f32_u64(a);
@@ -21016,8 +21016,8 @@ float32x4_t test_vreinterpretq_f32_u64(uint64x2_t a) {
// CHECK-LABEL: define dso_local <4 x float> @test_vreinterpretq_f32_f16(
// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <4 x float>
-// CHECK-NEXT: ret <4 x float> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x half> [[A]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[DOTCAST]]
//
float32x4_t test_vreinterpretq_f32_f16(float16x8_t a) {
return vreinterpretq_f32_f16(a);
@@ -21026,8 +21026,8 @@ float32x4_t test_vreinterpretq_f32_f16(float16x8_t a) {
// CHECK-LABEL: define dso_local <4 x float> @test_vreinterpretq_f32_f64(
// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <4 x float>
-// CHECK-NEXT: ret <4 x float> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x double> [[A]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[DOTCAST]]
//
float32x4_t test_vreinterpretq_f32_f64(float64x2_t a) {
return vreinterpretq_f32_f64(a);
@@ -21036,8 +21036,8 @@ float32x4_t test_vreinterpretq_f32_f64(float64x2_t a) {
// CHECK-LABEL: define dso_local <4 x float> @test_vreinterpretq_f32_p8(
// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <4 x float>
-// CHECK-NEXT: ret <4 x float> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <16 x i8> [[A]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[DOTCAST]]
//
float32x4_t test_vreinterpretq_f32_p8(poly8x16_t a) {
return vreinterpretq_f32_p8(a);
@@ -21046,8 +21046,8 @@ float32x4_t test_vreinterpretq_f32_p8(poly8x16_t a) {
// CHECK-LABEL: define dso_local <4 x float> @test_vreinterpretq_f32_p16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <4 x float>
-// CHECK-NEXT: ret <4 x float> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x i16> [[A]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[DOTCAST]]
//
float32x4_t test_vreinterpretq_f32_p16(poly16x8_t a) {
return vreinterpretq_f32_p16(a);
@@ -21056,8 +21056,8 @@ float32x4_t test_vreinterpretq_f32_p16(poly16x8_t a) {
// CHECK-LABEL: define dso_local <4 x float> @test_vreinterpretq_f32_p64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <4 x float>
-// CHECK-NEXT: ret <4 x float> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x i64> [[A]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[DOTCAST]]
//
float32x4_t test_vreinterpretq_f32_p64(poly64x2_t a) {
return vreinterpretq_f32_p64(a);
@@ -21066,8 +21066,8 @@ float32x4_t test_vreinterpretq_f32_p64(poly64x2_t a) {
// CHECK-LABEL: define dso_local <2 x double> @test_vreinterpretq_f64_s8(
// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <2 x double>
-// CHECK-NEXT: ret <2 x double> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <16 x i8> [[A]] to <2 x double>
+// CHECK-NEXT: ret <2 x double> [[DOTCAST]]
//
float64x2_t test_vreinterpretq_f64_s8(int8x16_t a) {
return vreinterpretq_f64_s8(a);
@@ -21076,8 +21076,8 @@ float64x2_t test_vreinterpretq_f64_s8(int8x16_t a) {
// CHECK-LABEL: define dso_local <2 x double> @test_vreinterpretq_f64_s16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <2 x double>
-// CHECK-NEXT: ret <2 x double> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x i16> [[A]] to <2 x double>
+// CHECK-NEXT: ret <2 x double> [[DOTCAST]]
//
float64x2_t test_vreinterpretq_f64_s16(int16x8_t a) {
return vreinterpretq_f64_s16(a);
@@ -21086,8 +21086,8 @@ float64x2_t test_vreinterpretq_f64_s16(int16x8_t a) {
// CHECK-LABEL: define dso_local <2 x double> @test_vreinterpretq_f64_s32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <2 x double>
-// CHECK-NEXT: ret <2 x double> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x i32> [[A]] to <2 x double>
+// CHECK-NEXT: ret <2 x double> [[DOTCAST]]
//
float64x2_t test_vreinterpretq_f64_s32(int32x4_t a) {
return vreinterpretq_f64_s32(a);
@@ -21096,8 +21096,8 @@ float64x2_t test_vreinterpretq_f64_s32(int32x4_t a) {
// CHECK-LABEL: define dso_local <2 x double> @test_vreinterpretq_f64_s64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <2 x double>
-// CHECK-NEXT: ret <2 x double> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x i64> [[A]] to <2 x double>
+// CHECK-NEXT: ret <2 x double> [[DOTCAST]]
//
float64x2_t test_vreinterpretq_f64_s64(int64x2_t a) {
return vreinterpretq_f64_s64(a);
@@ -21106,8 +21106,8 @@ float64x2_t test_vreinterpretq_f64_s64(int64x2_t a) {
// CHECK-LABEL: define dso_local <2 x double> @test_vreinterpretq_f64_u8(
// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <2 x double>
-// CHECK-NEXT: ret <2 x double> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <16 x i8> [[A]] to <2 x double>
+// CHECK-NEXT: ret <2 x double> [[DOTCAST]]
//
float64x2_t test_vreinterpretq_f64_u8(uint8x16_t a) {
return vreinterpretq_f64_u8(a);
@@ -21116,8 +21116,8 @@ float64x2_t test_vreinterpretq_f64_u8(uint8x16_t a) {
// CHECK-LABEL: define dso_local <2 x double> @test_vreinterpretq_f64_u16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <2 x double>
-// CHECK-NEXT: ret <2 x double> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x i16> [[A]] to <2 x double>
+// CHECK-NEXT: ret <2 x double> [[DOTCAST]]
//
float64x2_t test_vreinterpretq_f64_u16(uint16x8_t a) {
return vreinterpretq_f64_u16(a);
@@ -21126,8 +21126,8 @@ float64x2_t test_vreinterpretq_f64_u16(uint16x8_t a) {
// CHECK-LABEL: define dso_local <2 x double> @test_vreinterpretq_f64_u32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <2 x double>
-// CHECK-NEXT: ret <2 x double> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x i32> [[A]] to <2 x double>
+// CHECK-NEXT: ret <2 x double> [[DOTCAST]]
//
float64x2_t test_vreinterpretq_f64_u32(uint32x4_t a) {
return vreinterpretq_f64_u32(a);
@@ -21136,8 +21136,8 @@ float64x2_t test_vreinterpretq_f64_u32(uint32x4_t a) {
// CHECK-LABEL: define dso_local <2 x double> @test_vreinterpretq_f64_u64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <2 x double>
-// CHECK-NEXT: ret <2 x double> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x i64> [[A]] to <2 x double>
+// CHECK-NEXT: ret <2 x double> [[DOTCAST]]
//
float64x2_t test_vreinterpretq_f64_u64(uint64x2_t a) {
return vreinterpretq_f64_u64(a);
@@ -21146,8 +21146,8 @@ float64x2_t test_vreinterpretq_f64_u64(uint64x2_t a) {
// CHECK-LABEL: define dso_local <2 x double> @test_vreinterpretq_f64_f16(
// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <2 x double>
-// CHECK-NEXT: ret <2 x double> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x half> [[A]] to <2 x double>
+// CHECK-NEXT: ret <2 x double> [[DOTCAST]]
//
float64x2_t test_vreinterpretq_f64_f16(float16x8_t a) {
return vreinterpretq_f64_f16(a);
@@ -21156,8 +21156,8 @@ float64x2_t test_vreinterpretq_f64_f16(float16x8_t a) {
// CHECK-LABEL: define dso_local <2 x double> @test_vreinterpretq_f64_f32(
// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <2 x double>
-// CHECK-NEXT: ret <2 x double> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x float> [[A]] to <2 x double>
+// CHECK-NEXT: ret <2 x double> [[DOTCAST]]
//
float64x2_t test_vreinterpretq_f64_f32(float32x4_t a) {
return vreinterpretq_f64_f32(a);
@@ -21166,8 +21166,8 @@ float64x2_t test_vreinterpretq_f64_f32(float32x4_t a) {
// CHECK-LABEL: define dso_local <2 x double> @test_vreinterpretq_f64_p8(
// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <2 x double>
-// CHECK-NEXT: ret <2 x double> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <16 x i8> [[A]] to <2 x double>
+// CHECK-NEXT: ret <2 x double> [[DOTCAST]]
//
float64x2_t test_vreinterpretq_f64_p8(poly8x16_t a) {
return vreinterpretq_f64_p8(a);
@@ -21176,8 +21176,8 @@ float64x2_t test_vreinterpretq_f64_p8(poly8x16_t a) {
// CHECK-LABEL: define dso_local <2 x double> @test_vreinterpretq_f64_p16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <2 x double>
-// CHECK-NEXT: ret <2 x double> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x i16> [[A]] to <2 x double>
+// CHECK-NEXT: ret <2 x double> [[DOTCAST]]
//
float64x2_t test_vreinterpretq_f64_p16(poly16x8_t a) {
return vreinterpretq_f64_p16(a);
@@ -21186,8 +21186,8 @@ float64x2_t test_vreinterpretq_f64_p16(poly16x8_t a) {
// CHECK-LABEL: define dso_local <2 x double> @test_vreinterpretq_f64_p64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <2 x double>
-// CHECK-NEXT: ret <2 x double> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x i64> [[A]] to <2 x double>
+// CHECK-NEXT: ret <2 x double> [[DOTCAST]]
//
float64x2_t test_vreinterpretq_f64_p64(poly64x2_t a) {
return vreinterpretq_f64_p64(a);
@@ -21205,8 +21205,8 @@ poly8x16_t test_vreinterpretq_p8_s8(int8x16_t a) {
// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_p8_s16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[DOTCAST]]
//
poly8x16_t test_vreinterpretq_p8_s16(int16x8_t a) {
return vreinterpretq_p8_s16(a);
@@ -21215,8 +21215,8 @@ poly8x16_t test_vreinterpretq_p8_s16(int16x8_t a) {
// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_p8_s32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[DOTCAST]]
//
poly8x16_t test_vreinterpretq_p8_s32(int32x4_t a) {
return vreinterpretq_p8_s32(a);
@@ -21225,8 +21225,8 @@ poly8x16_t test_vreinterpretq_p8_s32(int32x4_t a) {
// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_p8_s64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[DOTCAST]]
//
poly8x16_t test_vreinterpretq_p8_s64(int64x2_t a) {
return vreinterpretq_p8_s64(a);
@@ -21244,8 +21244,8 @@ poly8x16_t test_vreinterpretq_p8_u8(uint8x16_t a) {
// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_p8_u16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[DOTCAST]]
//
poly8x16_t test_vreinterpretq_p8_u16(uint16x8_t a) {
return vreinterpretq_p8_u16(a);
@@ -21254,8 +21254,8 @@ poly8x16_t test_vreinterpretq_p8_u16(uint16x8_t a) {
// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_p8_u32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[DOTCAST]]
//
poly8x16_t test_vreinterpretq_p8_u32(uint32x4_t a) {
return vreinterpretq_p8_u32(a);
@@ -21264,8 +21264,8 @@ poly8x16_t test_vreinterpretq_p8_u32(uint32x4_t a) {
// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_p8_u64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[DOTCAST]]
//
poly8x16_t test_vreinterpretq_p8_u64(uint64x2_t a) {
return vreinterpretq_p8_u64(a);
@@ -21274,8 +21274,8 @@ poly8x16_t test_vreinterpretq_p8_u64(uint64x2_t a) {
// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_p8_f16(
// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[DOTCAST]]
//
poly8x16_t test_vreinterpretq_p8_f16(float16x8_t a) {
return vreinterpretq_p8_f16(a);
@@ -21284,8 +21284,8 @@ poly8x16_t test_vreinterpretq_p8_f16(float16x8_t a) {
// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_p8_f32(
// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[DOTCAST]]
//
poly8x16_t test_vreinterpretq_p8_f32(float32x4_t a) {
return vreinterpretq_p8_f32(a);
@@ -21294,8 +21294,8 @@ poly8x16_t test_vreinterpretq_p8_f32(float32x4_t a) {
// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_p8_f64(
// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <16 x i8>
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x double> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[DOTCAST]]
//
poly8x16_t test_vreinterpretq_p8_f64(float64x2_t a) {
return vreinterpretq_p8_f64(a);
@@ -21304,8 +21304,8 @@ poly8x16_t test_vreinterpretq_p8_f64(float64x2_t a) {
// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_p8_p16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[DOTCAST]]
//
poly8x16_t test_vreinterpretq_p8_p16(poly16x8_t a) {
return vreinterpretq_p8_p16(a);
@@ -21314,8 +21314,8 @@ poly8x16_t test_vreinterpretq_p8_p16(poly16x8_t a) {
// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_p8_p64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[DOTCAST]]
//
poly8x16_t test_vreinterpretq_p8_p64(poly64x2_t a) {
return vreinterpretq_p8_p64(a);
@@ -21324,8 +21324,8 @@ poly8x16_t test_vreinterpretq_p8_p64(poly64x2_t a) {
// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_p16_s8(
// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x i16>
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <16 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[DOTCAST]]
//
poly16x8_t test_vreinterpretq_p16_s8(int8x16_t a) {
return vreinterpretq_p16_s8(a);
@@ -21343,8 +21343,8 @@ poly16x8_t test_vreinterpretq_p16_s16(int16x8_t a) {
// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_p16_s32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16>
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[DOTCAST]]
//
poly16x8_t test_vreinterpretq_p16_s32(int32x4_t a) {
return vreinterpretq_p16_s32(a);
@@ -21353,8 +21353,8 @@ poly16x8_t test_vreinterpretq_p16_s32(int32x4_t a) {
// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_p16_s64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <8 x i16>
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x i64> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[DOTCAST]]
//
poly16x8_t test_vreinterpretq_p16_s64(int64x2_t a) {
return vreinterpretq_p16_s64(a);
@@ -21363,8 +21363,8 @@ poly16x8_t test_vreinterpretq_p16_s64(int64x2_t a) {
// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_p16_u8(
// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x i16>
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <16 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[DOTCAST]]
//
poly16x8_t test_vreinterpretq_p16_u8(uint8x16_t a) {
return vreinterpretq_p16_u8(a);
@@ -21382,8 +21382,8 @@ poly16x8_t test_vreinterpretq_p16_u16(uint16x8_t a) {
// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_p16_u32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16>
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[DOTCAST]]
//
poly16x8_t test_vreinterpretq_p16_u32(uint32x4_t a) {
return vreinterpretq_p16_u32(a);
@@ -21392,8 +21392,8 @@ poly16x8_t test_vreinterpretq_p16_u32(uint32x4_t a) {
// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_p16_u64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <8 x i16>
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x i64> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[DOTCAST]]
//
poly16x8_t test_vreinterpretq_p16_u64(uint64x2_t a) {
return vreinterpretq_p16_u64(a);
@@ -21402,8 +21402,8 @@ poly16x8_t test_vreinterpretq_p16_u64(uint64x2_t a) {
// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_p16_f16(
// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16>
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x half> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[DOTCAST]]
//
poly16x8_t test_vreinterpretq_p16_f16(float16x8_t a) {
return vreinterpretq_p16_f16(a);
@@ -21412,8 +21412,8 @@ poly16x8_t test_vreinterpretq_p16_f16(float16x8_t a) {
// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_p16_f32(
// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <8 x i16>
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x float> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[DOTCAST]]
//
poly16x8_t test_vreinterpretq_p16_f32(float32x4_t a) {
return vreinterpretq_p16_f32(a);
@@ -21422,8 +21422,8 @@ poly16x8_t test_vreinterpretq_p16_f32(float32x4_t a) {
// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_p16_f64(
// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <8 x i16>
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x double> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[DOTCAST]]
//
poly16x8_t test_vreinterpretq_p16_f64(float64x2_t a) {
return vreinterpretq_p16_f64(a);
@@ -21432,8 +21432,8 @@ poly16x8_t test_vreinterpretq_p16_f64(float64x2_t a) {
// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_p16_p8(
// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x i16>
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <16 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[DOTCAST]]
//
poly16x8_t test_vreinterpretq_p16_p8(poly8x16_t a) {
return vreinterpretq_p16_p8(a);
@@ -21442,8 +21442,8 @@ poly16x8_t test_vreinterpretq_p16_p8(poly8x16_t a) {
// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_p16_p64(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <8 x i16>
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x i64> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[DOTCAST]]
//
poly16x8_t test_vreinterpretq_p16_p64(poly64x2_t a) {
return vreinterpretq_p16_p64(a);
@@ -21452,8 +21452,8 @@ poly16x8_t test_vreinterpretq_p16_p64(poly64x2_t a) {
// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_p64_s8(
// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <2 x i64>
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <16 x i8> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[DOTCAST]]
//
poly64x2_t test_vreinterpretq_p64_s8(int8x16_t a) {
return vreinterpretq_p64_s8(a);
@@ -21462,8 +21462,8 @@ poly64x2_t test_vreinterpretq_p64_s8(int8x16_t a) {
// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_p64_s16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <2 x i64>
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x i16> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[DOTCAST]]
//
poly64x2_t test_vreinterpretq_p64_s16(int16x8_t a) {
return vreinterpretq_p64_s16(a);
@@ -21472,8 +21472,8 @@ poly64x2_t test_vreinterpretq_p64_s16(int16x8_t a) {
// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_p64_s32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <2 x i64>
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x i32> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[DOTCAST]]
//
poly64x2_t test_vreinterpretq_p64_s32(int32x4_t a) {
return vreinterpretq_p64_s32(a);
@@ -21491,8 +21491,8 @@ poly64x2_t test_vreinterpretq_p64_s64(int64x2_t a) {
// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_p64_u8(
// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <2 x i64>
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <16 x i8> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[DOTCAST]]
//
poly64x2_t test_vreinterpretq_p64_u8(uint8x16_t a) {
return vreinterpretq_p64_u8(a);
@@ -21501,8 +21501,8 @@ poly64x2_t test_vreinterpretq_p64_u8(uint8x16_t a) {
// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_p64_u16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <2 x i64>
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x i16> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[DOTCAST]]
//
poly64x2_t test_vreinterpretq_p64_u16(uint16x8_t a) {
return vreinterpretq_p64_u16(a);
@@ -21511,8 +21511,8 @@ poly64x2_t test_vreinterpretq_p64_u16(uint16x8_t a) {
// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_p64_u32(
// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <2 x i64>
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x i32> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[DOTCAST]]
//
poly64x2_t test_vreinterpretq_p64_u32(uint32x4_t a) {
return vreinterpretq_p64_u32(a);
@@ -21530,8 +21530,8 @@ poly64x2_t test_vreinterpretq_p64_u64(uint64x2_t a) {
// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_p64_f16(
// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <2 x i64>
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x half> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[DOTCAST]]
//
poly64x2_t test_vreinterpretq_p64_f16(float16x8_t a) {
return vreinterpretq_p64_f16(a);
@@ -21540,8 +21540,8 @@ poly64x2_t test_vreinterpretq_p64_f16(float16x8_t a) {
// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_p64_f32(
// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <2 x i64>
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x float> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[DOTCAST]]
//
poly64x2_t test_vreinterpretq_p64_f32(float32x4_t a) {
return vreinterpretq_p64_f32(a);
@@ -21550,8 +21550,8 @@ poly64x2_t test_vreinterpretq_p64_f32(float32x4_t a) {
// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_p64_f64(
// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <2 x i64>
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x double> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[DOTCAST]]
//
poly64x2_t test_vreinterpretq_p64_f64(float64x2_t a) {
return vreinterpretq_p64_f64(a);
@@ -21560,8 +21560,8 @@ poly64x2_t test_vreinterpretq_p64_f64(float64x2_t a) {
// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_p64_p8(
// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <2 x i64>
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <16 x i8> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[DOTCAST]]
//
poly64x2_t test_vreinterpretq_p64_p8(poly8x16_t a) {
return vreinterpretq_p64_p8(a);
@@ -21570,8 +21570,8 @@ poly64x2_t test_vreinterpretq_p64_p8(poly8x16_t a) {
// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_p64_p16(
// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <2 x i64>
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x i16> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[DOTCAST]]
//
poly64x2_t test_vreinterpretq_p64_p16(poly16x8_t a) {
return vreinterpretq_p64_p16(a);
diff --git a/clang/test/CodeGen/AArch64/neon-misc.c b/clang/test/CodeGen/AArch64/neon-misc.c
index c840e4753ab6bc..8bdd2501e9a10d 100644
--- a/clang/test/CodeGen/AArch64/neon-misc.c
+++ b/clang/test/CodeGen/AArch64/neon-misc.c
@@ -2834,8 +2834,8 @@ float32x4_t test_vcvtx_high_f32_f64(float32x2_t a, float64x2_t b) {
// CHECK-LABEL: define dso_local <4 x float> @test_vcvt_f32_f16(
// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VCVT_F32_F16_I:%.*]] = bitcast <4 x half> [[A]] to <4 x i16>
-// CHECK-NEXT: [[VCVT_F32_F161_I:%.*]] = call <4 x float> @llvm.aarch64.neon.vcvthf2fp(<4 x i16> [[VCVT_F32_F16_I]])
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x half> [[A]] to <4 x i16>
+// CHECK-NEXT: [[VCVT_F32_F161_I:%.*]] = call <4 x float> @llvm.aarch64.neon.vcvthf2fp(<4 x i16> [[DOTCAST]])
// CHECK-NEXT: ret <4 x float> [[VCVT_F32_F161_I]]
//
float32x4_t test_vcvt_f32_f16(float16x4_t a) {
@@ -2846,8 +2846,8 @@ float32x4_t test_vcvt_f32_f16(float16x4_t a) {
// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT: [[VCVT_F32_F16_I_I:%.*]] = bitcast <4 x half> [[SHUFFLE_I]] to <4 x i16>
-// CHECK-NEXT: [[VCVT_F32_F161_I_I:%.*]] = call <4 x float> @llvm.aarch64.neon.vcvthf2fp(<4 x i16> [[VCVT_F32_F16_I_I]])
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x half> [[SHUFFLE_I]] to <4 x i16>
+// CHECK-NEXT: [[VCVT_F32_F161_I_I:%.*]] = call <4 x float> @llvm.aarch64.neon.vcvthf2fp(<4 x i16> [[DOTCAST]])
// CHECK-NEXT: ret <4 x float> [[VCVT_F32_F161_I_I]]
//
float32x4_t test_vcvt_high_f32_f16(float16x8_t a) {
diff --git a/clang/test/CodeGen/AArch64/poly128.c b/clang/test/CodeGen/AArch64/poly128.c
index be683b17e0d0b8..a03bb3edadb10c 100644
--- a/clang/test/CodeGen/AArch64/poly128.c
+++ b/clang/test/CodeGen/AArch64/poly128.c
@@ -76,8 +76,8 @@ __attribute__((target("aes"))) poly128_t test_vmull_high_p64(poly64x2_t a, poly6
// CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_p128_s8
// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to i128
-// CHECK-NEXT: ret i128 [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <16 x i8> [[A]] to i128
+// CHECK-NEXT: ret i128 [[DOTCAST]]
//
poly128_t test_vreinterpretq_p128_s8(int8x16_t a) {
return vreinterpretq_p128_s8(a);
@@ -86,8 +86,8 @@ poly128_t test_vreinterpretq_p128_s8(int8x16_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_p128_s16
// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to i128
-// CHECK-NEXT: ret i128 [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x i16> [[A]] to i128
+// CHECK-NEXT: ret i128 [[DOTCAST]]
//
poly128_t test_vreinterpretq_p128_s16(int16x8_t a) {
return vreinterpretq_p128_s16(a);
@@ -96,8 +96,8 @@ poly128_t test_vreinterpretq_p128_s16(int16x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_p128_s32
// CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to i128
-// CHECK-NEXT: ret i128 [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x i32> [[A]] to i128
+// CHECK-NEXT: ret i128 [[DOTCAST]]
//
poly128_t test_vreinterpretq_p128_s32(int32x4_t a) {
return vreinterpretq_p128_s32(a);
@@ -106,8 +106,8 @@ poly128_t test_vreinterpretq_p128_s32(int32x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_p128_s64
// CHECK-SAME: (<2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to i128
-// CHECK-NEXT: ret i128 [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x i64> [[A]] to i128
+// CHECK-NEXT: ret i128 [[DOTCAST]]
//
poly128_t test_vreinterpretq_p128_s64(int64x2_t a) {
return vreinterpretq_p128_s64(a);
@@ -116,8 +116,8 @@ poly128_t test_vreinterpretq_p128_s64(int64x2_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_p128_u8
// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to i128
-// CHECK-NEXT: ret i128 [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <16 x i8> [[A]] to i128
+// CHECK-NEXT: ret i128 [[DOTCAST]]
//
poly128_t test_vreinterpretq_p128_u8(uint8x16_t a) {
return vreinterpretq_p128_u8(a);
@@ -126,8 +126,8 @@ poly128_t test_vreinterpretq_p128_u8(uint8x16_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_p128_u16
// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to i128
-// CHECK-NEXT: ret i128 [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x i16> [[A]] to i128
+// CHECK-NEXT: ret i128 [[DOTCAST]]
//
poly128_t test_vreinterpretq_p128_u16(uint16x8_t a) {
return vreinterpretq_p128_u16(a);
@@ -136,8 +136,8 @@ poly128_t test_vreinterpretq_p128_u16(uint16x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_p128_u32
// CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to i128
-// CHECK-NEXT: ret i128 [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x i32> [[A]] to i128
+// CHECK-NEXT: ret i128 [[DOTCAST]]
//
poly128_t test_vreinterpretq_p128_u32(uint32x4_t a) {
return vreinterpretq_p128_u32(a);
@@ -146,8 +146,8 @@ poly128_t test_vreinterpretq_p128_u32(uint32x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_p128_u64
// CHECK-SAME: (<2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to i128
-// CHECK-NEXT: ret i128 [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x i64> [[A]] to i128
+// CHECK-NEXT: ret i128 [[DOTCAST]]
//
poly128_t test_vreinterpretq_p128_u64(uint64x2_t a) {
return vreinterpretq_p128_u64(a);
@@ -156,8 +156,8 @@ poly128_t test_vreinterpretq_p128_u64(uint64x2_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_p128_f32
// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to i128
-// CHECK-NEXT: ret i128 [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x float> [[A]] to i128
+// CHECK-NEXT: ret i128 [[DOTCAST]]
//
poly128_t test_vreinterpretq_p128_f32(float32x4_t a) {
return vreinterpretq_p128_f32(a);
@@ -166,8 +166,8 @@ poly128_t test_vreinterpretq_p128_f32(float32x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_p128_f64
// CHECK-SAME: (<2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to i128
-// CHECK-NEXT: ret i128 [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x double> [[A]] to i128
+// CHECK-NEXT: ret i128 [[DOTCAST]]
//
poly128_t test_vreinterpretq_p128_f64(float64x2_t a) {
return vreinterpretq_p128_f64(a);
@@ -176,8 +176,8 @@ poly128_t test_vreinterpretq_p128_f64(float64x2_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_p128_p8
// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to i128
-// CHECK-NEXT: ret i128 [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <16 x i8> [[A]] to i128
+// CHECK-NEXT: ret i128 [[DOTCAST]]
//
poly128_t test_vreinterpretq_p128_p8(poly8x16_t a) {
return vreinterpretq_p128_p8(a);
@@ -186,8 +186,8 @@ poly128_t test_vreinterpretq_p128_p8(poly8x16_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_p128_p16
// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to i128
-// CHECK-NEXT: ret i128 [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x i16> [[A]] to i128
+// CHECK-NEXT: ret i128 [[DOTCAST]]
//
poly128_t test_vreinterpretq_p128_p16(poly16x8_t a) {
return vreinterpretq_p128_p16(a);
@@ -196,8 +196,8 @@ poly128_t test_vreinterpretq_p128_p16(poly16x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_p128_p64
// CHECK-SAME: (<2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to i128
-// CHECK-NEXT: ret i128 [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <2 x i64> [[A]] to i128
+// CHECK-NEXT: ret i128 [[DOTCAST]]
//
poly128_t test_vreinterpretq_p128_p64(poly64x2_t a) {
return vreinterpretq_p128_p64(a);
@@ -206,8 +206,8 @@ poly128_t test_vreinterpretq_p128_p64(poly64x2_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_s8_p128
// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[A]] to <16 x i8>
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast i128 [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[DOTCAST]]
//
int8x16_t test_vreinterpretq_s8_p128(poly128_t a) {
return vreinterpretq_s8_p128(a);
@@ -216,8 +216,8 @@ int8x16_t test_vreinterpretq_s8_p128(poly128_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_s16_p128
// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[A]] to <8 x i16>
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast i128 [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[DOTCAST]]
//
int16x8_t test_vreinterpretq_s16_p128(poly128_t a) {
return vreinterpretq_s16_p128(a);
@@ -226,8 +226,8 @@ int16x8_t test_vreinterpretq_s16_p128(poly128_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_s32_p128
// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[A]] to <4 x i32>
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast i128 [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[DOTCAST]]
//
int32x4_t test_vreinterpretq_s32_p128(poly128_t a) {
return vreinterpretq_s32_p128(a);
@@ -236,8 +236,8 @@ int32x4_t test_vreinterpretq_s32_p128(poly128_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_s64_p128
// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[A]] to <2 x i64>
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast i128 [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[DOTCAST]]
//
int64x2_t test_vreinterpretq_s64_p128(poly128_t a) {
return vreinterpretq_s64_p128(a);
@@ -246,8 +246,8 @@ int64x2_t test_vreinterpretq_s64_p128(poly128_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_u8_p128
// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[A]] to <16 x i8>
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast i128 [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[DOTCAST]]
//
uint8x16_t test_vreinterpretq_u8_p128(poly128_t a) {
return vreinterpretq_u8_p128(a);
@@ -256,8 +256,8 @@ uint8x16_t test_vreinterpretq_u8_p128(poly128_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_u16_p128
// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[A]] to <8 x i16>
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast i128 [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[DOTCAST]]
//
uint16x8_t test_vreinterpretq_u16_p128(poly128_t a) {
return vreinterpretq_u16_p128(a);
@@ -266,8 +266,8 @@ uint16x8_t test_vreinterpretq_u16_p128(poly128_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_u32_p128
// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[A]] to <4 x i32>
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast i128 [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[DOTCAST]]
//
uint32x4_t test_vreinterpretq_u32_p128(poly128_t a) {
return vreinterpretq_u32_p128(a);
@@ -276,8 +276,8 @@ uint32x4_t test_vreinterpretq_u32_p128(poly128_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_u64_p128
// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[A]] to <2 x i64>
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast i128 [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[DOTCAST]]
//
uint64x2_t test_vreinterpretq_u64_p128(poly128_t a) {
return vreinterpretq_u64_p128(a);
@@ -286,8 +286,8 @@ uint64x2_t test_vreinterpretq_u64_p128(poly128_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_f32_p128
// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[A]] to <4 x float>
-// CHECK-NEXT: ret <4 x float> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast i128 [[A]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[DOTCAST]]
//
float32x4_t test_vreinterpretq_f32_p128(poly128_t a) {
return vreinterpretq_f32_p128(a);
@@ -296,8 +296,8 @@ float32x4_t test_vreinterpretq_f32_p128(poly128_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_f64_p128
// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[A]] to <2 x double>
-// CHECK-NEXT: ret <2 x double> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast i128 [[A]] to <2 x double>
+// CHECK-NEXT: ret <2 x double> [[DOTCAST]]
//
float64x2_t test_vreinterpretq_f64_p128(poly128_t a) {
return vreinterpretq_f64_p128(a);
@@ -306,8 +306,8 @@ float64x2_t test_vreinterpretq_f64_p128(poly128_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_p8_p128
// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[A]] to <16 x i8>
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast i128 [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[DOTCAST]]
//
poly8x16_t test_vreinterpretq_p8_p128(poly128_t a) {
return vreinterpretq_p8_p128(a);
@@ -316,8 +316,8 @@ poly8x16_t test_vreinterpretq_p8_p128(poly128_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_p16_p128
// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[A]] to <8 x i16>
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast i128 [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[DOTCAST]]
//
poly16x8_t test_vreinterpretq_p16_p128(poly128_t a) {
return vreinterpretq_p16_p128(a);
@@ -326,8 +326,8 @@ poly16x8_t test_vreinterpretq_p16_p128(poly128_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_p64_p128
// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[A]] to <2 x i64>
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast i128 [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[DOTCAST]]
//
poly64x2_t test_vreinterpretq_p64_p128(poly128_t a) {
return vreinterpretq_p64_p128(a);
diff --git a/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics-generic.c b/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics-generic.c
index b616664e395c05..2723e8dc28c1f0 100644
--- a/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics-generic.c
+++ b/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics-generic.c
@@ -15,11 +15,11 @@
// CHECK-LABEL: define {{[^@]+}}@test_vbsl_f16
// CHECK-SAME: (<4 x i16> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VBSL1_I:%.*]] = bitcast <4 x half> [[B]] to <4 x i16>
-// CHECK-NEXT: [[VBSL2_I:%.*]] = bitcast <4 x half> [[C]] to <4 x i16>
-// CHECK-NEXT: [[VBSL3_I:%.*]] = and <4 x i16> [[A]], [[VBSL1_I]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x half> [[B]] to <4 x i16>
+// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <4 x half> [[C]] to <4 x i16>
+// CHECK-NEXT: [[VBSL3_I:%.*]] = and <4 x i16> [[A]], [[DOTCAST]]
// CHECK-NEXT: [[TMP0:%.*]] = xor <4 x i16> [[A]], splat (i16 -1)
-// CHECK-NEXT: [[VBSL4_I:%.*]] = and <4 x i16> [[TMP0]], [[VBSL2_I]]
+// CHECK-NEXT: [[VBSL4_I:%.*]] = and <4 x i16> [[TMP0]], [[DOTCAST1]]
// CHECK-NEXT: [[VBSL5_I:%.*]] = or disjoint <4 x i16> [[VBSL3_I]], [[VBSL4_I]]
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VBSL5_I]] to <4 x half>
// CHECK-NEXT: ret <4 x half> [[TMP1]]
@@ -31,11 +31,11 @@ float16x4_t test_vbsl_f16(uint16x4_t a, float16x4_t b, float16x4_t c) {
// CHECK-LABEL: define {{[^@]+}}@test_vbslq_f16
// CHECK-SAME: (<8 x i16> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VBSL1_I:%.*]] = bitcast <8 x half> [[B]] to <8 x i16>
-// CHECK-NEXT: [[VBSL2_I:%.*]] = bitcast <8 x half> [[C]] to <8 x i16>
-// CHECK-NEXT: [[VBSL3_I:%.*]] = and <8 x i16> [[A]], [[VBSL1_I]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x half> [[B]] to <8 x i16>
+// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <8 x half> [[C]] to <8 x i16>
+// CHECK-NEXT: [[VBSL3_I:%.*]] = and <8 x i16> [[A]], [[DOTCAST]]
// CHECK-NEXT: [[TMP0:%.*]] = xor <8 x i16> [[A]], splat (i16 -1)
-// CHECK-NEXT: [[VBSL4_I:%.*]] = and <8 x i16> [[TMP0]], [[VBSL2_I]]
+// CHECK-NEXT: [[VBSL4_I:%.*]] = and <8 x i16> [[TMP0]], [[DOTCAST1]]
// CHECK-NEXT: [[VBSL5_I:%.*]] = or disjoint <8 x i16> [[VBSL3_I]], [[VBSL4_I]]
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[VBSL5_I]] to <8 x half>
// CHECK-NEXT: ret <8 x half> [[TMP1]]
diff --git a/clang/test/CodeGen/arm-bf16-dotprod-intrinsics.c b/clang/test/CodeGen/arm-bf16-dotprod-intrinsics.c
index b357042843162c..2ae8c0034f1c18 100644
--- a/clang/test/CodeGen/arm-bf16-dotprod-intrinsics.c
+++ b/clang/test/CodeGen/arm-bf16-dotprod-intrinsics.c
@@ -34,8 +34,8 @@ float32x4_t test_vbfdotq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b){
// CHECK-NEXT: entry:
// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x bfloat> [[B:%.*]] to <2 x float>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[DOTCAST]], <2 x float> poison, <2 x i32> zeroinitializer
-// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <2 x float> [[LANE]] to <4 x bfloat>
-// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float> [[R:%.*]], <4 x bfloat> [[A:%.*]], <4 x bfloat> [[DOTCAST1]])
+// CHECK-NEXT: [[DOTCAST2:%.*]] = bitcast <2 x float> [[LANE]] to <4 x bfloat>
+// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float> [[R:%.*]], <4 x bfloat> [[A:%.*]], <4 x bfloat> [[DOTCAST2]])
// CHECK-NEXT: ret <2 x float> [[VBFDOT3_I]]
//
float32x2_t test_vbfdot_lane_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b){
@@ -46,8 +46,8 @@ float32x2_t test_vbfdot_lane_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b){
// CHECK-NEXT: entry:
// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <4 x float>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[DOTCAST]], <4 x float> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <4 x float> [[LANE]] to <8 x bfloat>
-// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[DOTCAST1]])
+// CHECK-NEXT: [[DOTCAST2:%.*]] = bitcast <4 x float> [[LANE]] to <8 x bfloat>
+// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[DOTCAST2]])
// CHECK-NEXT: ret <4 x float> [[VBFDOT3_I]]
//
float32x4_t test_vbfdotq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
@@ -58,8 +58,8 @@ float32x4_t test_vbfdotq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b
// CHECK-NEXT: entry:
// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <4 x float>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[DOTCAST]], <4 x float> poison, <2 x i32> <i32 3, i32 3>
-// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <2 x float> [[LANE]] to <4 x bfloat>
-// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float> [[R:%.*]], <4 x bfloat> [[A:%.*]], <4 x bfloat> [[DOTCAST1]])
+// CHECK-NEXT: [[DOTCAST2:%.*]] = bitcast <2 x float> [[LANE]] to <4 x bfloat>
+// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float> [[R:%.*]], <4 x bfloat> [[A:%.*]], <4 x bfloat> [[DOTCAST2]])
// CHECK-NEXT: ret <2 x float> [[VBFDOT3_I]]
//
float32x2_t test_vbfdot_laneq_f32(float32x2_t r, bfloat16x4_t a, bfloat16x8_t b) {
@@ -70,8 +70,8 @@ float32x2_t test_vbfdot_laneq_f32(float32x2_t r, bfloat16x4_t a, bfloat16x8_t b)
// CHECK-NEXT: entry:
// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x bfloat> [[B:%.*]] to <2 x float>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[DOTCAST]], <2 x float> poison, <4 x i32> zeroinitializer
-// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <4 x float> [[LANE]] to <8 x bfloat>
-// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[DOTCAST1]])
+// CHECK-NEXT: [[DOTCAST2:%.*]] = bitcast <4 x float> [[LANE]] to <8 x bfloat>
+// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[DOTCAST2]])
// CHECK-NEXT: ret <4 x float> [[VBFDOT3_I]]
//
float32x4_t test_vbfdotq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) {
diff --git a/clang/test/CodeGen/arm_neon_intrinsics.c b/clang/test/CodeGen/arm_neon_intrinsics.c
index 6177e01887c469..d6ac68fcf9e7f5 100644
--- a/clang/test/CodeGen/arm_neon_intrinsics.c
+++ b/clang/test/CodeGen/arm_neon_intrinsics.c
@@ -2825,8 +2825,8 @@ int16x4_t test_vcreate_imm(void) {
// CHECK-LABEL: define <4 x i16> @test_vcreate_s16(
// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[A]] to <4 x i16>
-// CHECK-NEXT: [[VCLZ_V1_I:%.*]] = call range(i16 0, 17) <4 x i16> @llvm.ctlz.v4i16(<4 x i16> [[TMP0]], i1 false)
+// CHECK-NEXT: [[VCLZ_V_I:%.*]] = bitcast i64 [[A]] to <4 x i16>
+// CHECK-NEXT: [[VCLZ_V1_I:%.*]] = call range(i16 0, 17) <4 x i16> @llvm.ctlz.v4i16(<4 x i16> [[VCLZ_V_I]], i1 false)
// CHECK-NEXT: ret <4 x i16> [[VCLZ_V1_I]]
//
int16x4_t test_vcreate_s16(uint64_t a) {
@@ -2836,8 +2836,8 @@ int16x4_t test_vcreate_s16(uint64_t a) {
// CHECK-LABEL: define <2 x i32> @test_vcreate_s32(
// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[A]] to <2 x i32>
-// CHECK-NEXT: [[VCLZ_V1_I:%.*]] = call range(i32 0, 33) <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[TMP0]], i1 false)
+// CHECK-NEXT: [[VCLZ_V_I:%.*]] = bitcast i64 [[A]] to <2 x i32>
+// CHECK-NEXT: [[VCLZ_V1_I:%.*]] = call range(i32 0, 33) <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[VCLZ_V_I]], i1 false)
// CHECK-NEXT: ret <2 x i32> [[VCLZ_V1_I]]
//
int32x2_t test_vcreate_s32(uint64_t a) {
@@ -2878,8 +2878,8 @@ int8x8_t test_vcreate_u8(uint64_t a) {
// CHECK-LABEL: define <4 x i16> @test_vcreate_u16(
// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[A]] to <4 x i16>
-// CHECK-NEXT: [[VCLZ_V1_I:%.*]] = call range(i16 0, 17) <4 x i16> @llvm.ctlz.v4i16(<4 x i16> [[TMP0]], i1 false)
+// CHECK-NEXT: [[VCLZ_V_I:%.*]] = bitcast i64 [[A]] to <4 x i16>
+// CHECK-NEXT: [[VCLZ_V1_I:%.*]] = call range(i16 0, 17) <4 x i16> @llvm.ctlz.v4i16(<4 x i16> [[VCLZ_V_I]], i1 false)
// CHECK-NEXT: ret <4 x i16> [[VCLZ_V1_I]]
//
int16x4_t test_vcreate_u16(uint64_t a) {
@@ -2889,8 +2889,8 @@ int16x4_t test_vcreate_u16(uint64_t a) {
// CHECK-LABEL: define <2 x i32> @test_vcreate_u32(
// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[A]] to <2 x i32>
-// CHECK-NEXT: [[VCLZ_V1_I:%.*]] = call range(i32 0, 33) <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[TMP0]], i1 false)
+// CHECK-NEXT: [[VCLZ_V_I:%.*]] = bitcast i64 [[A]] to <2 x i32>
+// CHECK-NEXT: [[VCLZ_V1_I:%.*]] = call range(i32 0, 33) <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[VCLZ_V_I]], i1 false)
// CHECK-NEXT: ret <2 x i32> [[VCLZ_V1_I]]
//
int32x2_t test_vcreate_u32(uint64_t a) {
@@ -13712,8 +13712,9 @@ int32x2_t test_vreinterpret_s32_p16(poly16x4_t a) {
// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_s8(
// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <1 x i64>
-// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to i64
+// CHECK-NEXT: [[__REINT_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> poison, i64 [[TMP0]], i64 0
+// CHECK-NEXT: ret <1 x i64> [[__REINT_I_SROA_0_0_VEC_INSERT]]
//
int64x1_t test_vreinterpret_s64_s8(int8x8_t a) {
return vreinterpret_s64_s8(a);
@@ -13722,8 +13723,9 @@ int64x1_t test_vreinterpret_s64_s8(int8x8_t a) {
// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_s16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <1 x i64>
-// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to i64
+// CHECK-NEXT: [[__REINT_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> poison, i64 [[TMP0]], i64 0
+// CHECK-NEXT: ret <1 x i64> [[__REINT_I_SROA_0_0_VEC_INSERT]]
//
int64x1_t test_vreinterpret_s64_s16(int16x4_t a) {
return vreinterpret_s64_s16(a);
@@ -13732,8 +13734,9 @@ int64x1_t test_vreinterpret_s64_s16(int16x4_t a) {
// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_s32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <1 x i64>
-// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to i64
+// CHECK-NEXT: [[__REINT_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> poison, i64 [[TMP0]], i64 0
+// CHECK-NEXT: ret <1 x i64> [[__REINT_I_SROA_0_0_VEC_INSERT]]
//
int64x1_t test_vreinterpret_s64_s32(int32x2_t a) {
return vreinterpret_s64_s32(a);
@@ -13742,8 +13745,9 @@ int64x1_t test_vreinterpret_s64_s32(int32x2_t a) {
// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_u8(
// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <1 x i64>
-// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to i64
+// CHECK-NEXT: [[__REINT_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> poison, i64 [[TMP0]], i64 0
+// CHECK-NEXT: ret <1 x i64> [[__REINT_I_SROA_0_0_VEC_INSERT]]
//
int64x1_t test_vreinterpret_s64_u8(uint8x8_t a) {
return vreinterpret_s64_u8(a);
@@ -13752,8 +13756,9 @@ int64x1_t test_vreinterpret_s64_u8(uint8x8_t a) {
// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_u16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <1 x i64>
-// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to i64
+// CHECK-NEXT: [[__REINT_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> poison, i64 [[TMP0]], i64 0
+// CHECK-NEXT: ret <1 x i64> [[__REINT_I_SROA_0_0_VEC_INSERT]]
//
int64x1_t test_vreinterpret_s64_u16(uint16x4_t a) {
return vreinterpret_s64_u16(a);
@@ -13762,8 +13767,9 @@ int64x1_t test_vreinterpret_s64_u16(uint16x4_t a) {
// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_u32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <1 x i64>
-// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to i64
+// CHECK-NEXT: [[__REINT_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> poison, i64 [[TMP0]], i64 0
+// CHECK-NEXT: ret <1 x i64> [[__REINT_I_SROA_0_0_VEC_INSERT]]
//
int64x1_t test_vreinterpret_s64_u32(uint32x2_t a) {
return vreinterpret_s64_u32(a);
@@ -13781,8 +13787,9 @@ int64x1_t test_vreinterpret_s64_u64(uint64x1_t a) {
// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_f16(
// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <1 x i64>
-// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to i64
+// CHECK-NEXT: [[__REINT_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> poison, i64 [[TMP0]], i64 0
+// CHECK-NEXT: ret <1 x i64> [[__REINT_I_SROA_0_0_VEC_INSERT]]
//
int64x1_t test_vreinterpret_s64_f16(float16x4_t a) {
return vreinterpret_s64_f16(a);
@@ -13791,8 +13798,9 @@ int64x1_t test_vreinterpret_s64_f16(float16x4_t a) {
// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_f32(
// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <1 x i64>
-// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to i64
+// CHECK-NEXT: [[__REINT_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> poison, i64 [[TMP0]], i64 0
+// CHECK-NEXT: ret <1 x i64> [[__REINT_I_SROA_0_0_VEC_INSERT]]
//
int64x1_t test_vreinterpret_s64_f32(float32x2_t a) {
return vreinterpret_s64_f32(a);
@@ -13801,8 +13809,9 @@ int64x1_t test_vreinterpret_s64_f32(float32x2_t a) {
// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_p8(
// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <1 x i64>
-// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to i64
+// CHECK-NEXT: [[__REINT_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> poison, i64 [[TMP0]], i64 0
+// CHECK-NEXT: ret <1 x i64> [[__REINT_I_SROA_0_0_VEC_INSERT]]
//
int64x1_t test_vreinterpret_s64_p8(poly8x8_t a) {
return vreinterpret_s64_p8(a);
@@ -13811,8 +13820,9 @@ int64x1_t test_vreinterpret_s64_p8(poly8x8_t a) {
// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_p16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <1 x i64>
-// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to i64
+// CHECK-NEXT: [[__REINT_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> poison, i64 [[TMP0]], i64 0
+// CHECK-NEXT: ret <1 x i64> [[__REINT_I_SROA_0_0_VEC_INSERT]]
//
int64x1_t test_vreinterpret_s64_p16(poly16x4_t a) {
return vreinterpret_s64_p16(a);
@@ -14146,8 +14156,9 @@ uint32x2_t test_vreinterpret_u32_p16(poly16x4_t a) {
// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_s8(
// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <1 x i64>
-// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to i64
+// CHECK-NEXT: [[__REINT_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> poison, i64 [[TMP0]], i64 0
+// CHECK-NEXT: ret <1 x i64> [[__REINT_I_SROA_0_0_VEC_INSERT]]
//
uint64x1_t test_vreinterpret_u64_s8(int8x8_t a) {
return vreinterpret_u64_s8(a);
@@ -14156,8 +14167,9 @@ uint64x1_t test_vreinterpret_u64_s8(int8x8_t a) {
// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_s16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <1 x i64>
-// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to i64
+// CHECK-NEXT: [[__REINT_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> poison, i64 [[TMP0]], i64 0
+// CHECK-NEXT: ret <1 x i64> [[__REINT_I_SROA_0_0_VEC_INSERT]]
//
uint64x1_t test_vreinterpret_u64_s16(int16x4_t a) {
return vreinterpret_u64_s16(a);
@@ -14166,8 +14178,9 @@ uint64x1_t test_vreinterpret_u64_s16(int16x4_t a) {
// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_s32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <1 x i64>
-// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to i64
+// CHECK-NEXT: [[__REINT_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> poison, i64 [[TMP0]], i64 0
+// CHECK-NEXT: ret <1 x i64> [[__REINT_I_SROA_0_0_VEC_INSERT]]
//
uint64x1_t test_vreinterpret_u64_s32(int32x2_t a) {
return vreinterpret_u64_s32(a);
@@ -14185,8 +14198,9 @@ uint64x1_t test_vreinterpret_u64_s64(int64x1_t a) {
// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_u8(
// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <1 x i64>
-// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to i64
+// CHECK-NEXT: [[__REINT_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> poison, i64 [[TMP0]], i64 0
+// CHECK-NEXT: ret <1 x i64> [[__REINT_I_SROA_0_0_VEC_INSERT]]
//
uint64x1_t test_vreinterpret_u64_u8(uint8x8_t a) {
return vreinterpret_u64_u8(a);
@@ -14195,8 +14209,9 @@ uint64x1_t test_vreinterpret_u64_u8(uint8x8_t a) {
// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_u16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <1 x i64>
-// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to i64
+// CHECK-NEXT: [[__REINT_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> poison, i64 [[TMP0]], i64 0
+// CHECK-NEXT: ret <1 x i64> [[__REINT_I_SROA_0_0_VEC_INSERT]]
//
uint64x1_t test_vreinterpret_u64_u16(uint16x4_t a) {
return vreinterpret_u64_u16(a);
@@ -14205,8 +14220,9 @@ uint64x1_t test_vreinterpret_u64_u16(uint16x4_t a) {
// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_u32(
// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <1 x i64>
-// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to i64
+// CHECK-NEXT: [[__REINT_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> poison, i64 [[TMP0]], i64 0
+// CHECK-NEXT: ret <1 x i64> [[__REINT_I_SROA_0_0_VEC_INSERT]]
//
uint64x1_t test_vreinterpret_u64_u32(uint32x2_t a) {
return vreinterpret_u64_u32(a);
@@ -14215,8 +14231,9 @@ uint64x1_t test_vreinterpret_u64_u32(uint32x2_t a) {
// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_f16(
// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <1 x i64>
-// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to i64
+// CHECK-NEXT: [[__REINT_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> poison, i64 [[TMP0]], i64 0
+// CHECK-NEXT: ret <1 x i64> [[__REINT_I_SROA_0_0_VEC_INSERT]]
//
uint64x1_t test_vreinterpret_u64_f16(float16x4_t a) {
return vreinterpret_u64_f16(a);
@@ -14225,8 +14242,9 @@ uint64x1_t test_vreinterpret_u64_f16(float16x4_t a) {
// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_f32(
// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <1 x i64>
-// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to i64
+// CHECK-NEXT: [[__REINT_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> poison, i64 [[TMP0]], i64 0
+// CHECK-NEXT: ret <1 x i64> [[__REINT_I_SROA_0_0_VEC_INSERT]]
//
uint64x1_t test_vreinterpret_u64_f32(float32x2_t a) {
return vreinterpret_u64_f32(a);
@@ -14235,8 +14253,9 @@ uint64x1_t test_vreinterpret_u64_f32(float32x2_t a) {
// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_p8(
// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <1 x i64>
-// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to i64
+// CHECK-NEXT: [[__REINT_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> poison, i64 [[TMP0]], i64 0
+// CHECK-NEXT: ret <1 x i64> [[__REINT_I_SROA_0_0_VEC_INSERT]]
//
uint64x1_t test_vreinterpret_u64_p8(poly8x8_t a) {
return vreinterpret_u64_p8(a);
@@ -14245,8 +14264,9 @@ uint64x1_t test_vreinterpret_u64_p8(poly8x8_t a) {
// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_p16(
// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <1 x i64>
-// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to i64
+// CHECK-NEXT: [[__REINT_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> poison, i64 [[TMP0]], i64 0
+// CHECK-NEXT: ret <1 x i64> [[__REINT_I_SROA_0_0_VEC_INSERT]]
//
uint64x1_t test_vreinterpret_u64_p16(poly16x4_t a) {
return vreinterpret_u64_p16(a);
diff --git a/clang/utils/TableGen/NeonEmitter.cpp b/clang/utils/TableGen/NeonEmitter.cpp
index d7d649dd2456d5..b786cbecaf1327 100644
--- a/clang/utils/TableGen/NeonEmitter.cpp
+++ b/clang/utils/TableGen/NeonEmitter.cpp
@@ -1382,14 +1382,15 @@ void Intrinsic::emitBodyAsBuiltinCall() {
Type CastToType = T;
// Check if an explicit cast is needed.
- if (CastToType.isVector() &&
- (LocalCK == ClassB || (T.isHalf() && !T.isScalarForMangling()))) {
- CastToType.makeInteger(8, true);
- Arg = "(" + CastToType.str() + ")" + Arg;
- } else if (CastToType.isVector() && LocalCK == ClassI) {
- if (CastToType.isInteger())
- CastToType.makeSigned();
- Arg = "(" + CastToType.str() + ")" + Arg;
+ if (CastToType.isVector()) {
+ if (LocalCK == ClassB || (T.isHalf() && !T.isScalarForMangling())) {
+ CastToType.makeInteger(8, true);
+ Arg = "__builtin_bit_cast(" + CastToType.str() + ", " + Arg + ")";
+ } else if (LocalCK == ClassI) {
+ if (CastToType.isInteger())
+ CastToType.makeSigned();
+ Arg = "__builtin_bit_cast(" + CastToType.str() + ", " + Arg + ")";
+ }
}
S += Arg + ", ";
@@ -1601,11 +1602,12 @@ Intrinsic::DagEmitter::emitDagCast(const DagInit *DI, bool IsBitCast) {
N = "reint" + utostr(++I);
Intr.Variables[N] = Variable(R.first, N + Intr.VariablePostfix);
- Intr.OS << R.first.str() << " " << Intr.Variables[N].getName() << " = "
- << R.second << ";";
+ Intr.OS << " " << R.first.str() << " " << Intr.Variables[N].getName()
+ << " = " << R.second << ";";
Intr.emitNewLine();
- S = "*(" + castToType.str() + " *) &" + Intr.Variables[N].getName() + "";
+ S = "__builtin_bit_cast(" + castToType.str() + ", " +
+ Intr.Variables[N].getName() + ")";
} else {
// Emit a normal (static) cast.
S = "(" + castToType.str() + ")(" + R.second + ")";
>From 2692deb39488d5b4eaaf56159aa08587128319fe Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov at arm.com>
Date: Fri, 6 Dec 2024 13:09:23 +0000
Subject: [PATCH 04/11] [AArch64] Refactor implementation of FP8 types (NFC)
* The FP8 scalar type (`__mfp8`) was described as a vector type.
* The FP8 vector types were described/assumed to have an
integer element type (the element type ought to be `__mfp8`).
* Add support for the `m` type specifier (denoting `__mfp8`)
in `DecodeTypeFromStr` and create SVE builtin prototypes using
the specifier instead of `int8_t`.
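
A worked example of the last point, based on the `DecodeTypeFromStr` change
below: a builtin prototype string such as "V8m" now decodes to `__MFloat8x8_t`.
The 'm' produces `Context.MFloat8Ty`, and the 8- or 16-element vector case maps
it onto the builtin Neon FP8 vector types instead of a generic `VectorType`.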
---
clang/include/clang/AST/Type.h | 5 +++
.../clang/Basic/AArch64SVEACLETypes.def | 24 +++++++++---
clang/lib/AST/ASTContext.cpp | 37 +++++++++++++++----
clang/lib/AST/ItaniumMangle.cpp | 5 +++
clang/lib/AST/Type.cpp | 4 +-
clang/lib/CodeGen/CodeGenTypes.cpp | 13 +++++--
clang/lib/CodeGen/Targets/AArch64.cpp | 7 +++-
clang/utils/TableGen/SveEmitter.cpp | 4 +-
8 files changed, 76 insertions(+), 23 deletions(-)
diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h
index 09c98f642852fc..aa313719a65755 100644
--- a/clang/include/clang/AST/Type.h
+++ b/clang/include/clang/AST/Type.h
@@ -2518,6 +2518,7 @@ class alignas(TypeAlignment) Type : public ExtQualsTypeCommonBase {
bool isFloat32Type() const;
bool isDoubleType() const;
bool isBFloat16Type() const;
+ bool isMFloat8Type() const;
bool isFloat128Type() const;
bool isIbm128Type() const;
bool isRealType() const; // C99 6.2.5p17 (real floating + integer)
@@ -8532,6 +8533,10 @@ inline bool Type::isBFloat16Type() const {
return isSpecificBuiltinType(BuiltinType::BFloat16);
}
+inline bool Type::isMFloat8Type() const {
+ return isSpecificBuiltinType(BuiltinType::MFloat8);
+}
+
inline bool Type::isFloat128Type() const {
return isSpecificBuiltinType(BuiltinType::Float128);
}
diff --git a/clang/include/clang/Basic/AArch64SVEACLETypes.def b/clang/include/clang/Basic/AArch64SVEACLETypes.def
index 063cac1f4a58ee..6b704b386536c9 100644
--- a/clang/include/clang/Basic/AArch64SVEACLETypes.def
+++ b/clang/include/clang/Basic/AArch64SVEACLETypes.def
@@ -57,6 +57,11 @@
// - IsBF true for vector of brain float elements.
//===----------------------------------------------------------------------===//
+#ifndef SVE_SCALAR_TYPE
+#define SVE_SCALAR_TYPE(Name, MangledName, Id, SingletonId, Bits) \
+ SVE_TYPE(Name, Id, SingletonId)
+#endif
+
#ifndef SVE_VECTOR_TYPE
#define SVE_VECTOR_TYPE(Name, MangledName, Id, SingletonId) \
SVE_TYPE(Name, Id, SingletonId)
@@ -72,6 +77,11 @@
SVE_VECTOR_TYPE_DETAILS(Name, MangledName, Id, SingletonId, NumEls, ElBits, NF, false, false, true)
#endif
+#ifndef SVE_VECTOR_TYPE_MFLOAT
+#define SVE_VECTOR_TYPE_MFLOAT(Name, MangledName, Id, SingletonId, NumEls, ElBits, NF) \
+ SVE_VECTOR_TYPE_DETAILS(Name, MangledName, Id, SingletonId, NumEls, ElBits, NF, false, false, false)
+#endif
+
#ifndef SVE_VECTOR_TYPE_FLOAT
#define SVE_VECTOR_TYPE_FLOAT(Name, MangledName, Id, SingletonId, NumEls, ElBits, NF) \
SVE_VECTOR_TYPE_DETAILS(Name, MangledName, Id, SingletonId, NumEls, ElBits, NF, false, true, false)
@@ -125,8 +135,7 @@ SVE_VECTOR_TYPE_FLOAT("__SVFloat64_t", "__SVFloat64_t", SveFloat64, SveFloat64Ty
SVE_VECTOR_TYPE_BFLOAT("__SVBfloat16_t", "__SVBfloat16_t", SveBFloat16, SveBFloat16Ty, 8, 16, 1)
-// This is a 8 bits opaque type.
-SVE_VECTOR_TYPE_INT("__SVMfloat8_t", "__SVMfloat8_t", SveMFloat8, SveMFloat8Ty, 16, 8, 1, false)
+SVE_VECTOR_TYPE_MFLOAT("__SVMfloat8_t", "__SVMfloat8_t", SveMFloat8, SveMFloat8Ty, 16, 8, 1)
//
// x2
@@ -148,7 +157,7 @@ SVE_VECTOR_TYPE_FLOAT("__clang_svfloat64x2_t", "svfloat64x2_t", SveFloat64x2, Sv
SVE_VECTOR_TYPE_BFLOAT("__clang_svbfloat16x2_t", "svbfloat16x2_t", SveBFloat16x2, SveBFloat16x2Ty, 8, 16, 2)
-SVE_VECTOR_TYPE_INT("__clang_svmfloat8x2_t", "svmfloat8x2_t", SveMFloat8x2, SveMFloat8x2Ty, 16, 8, 2, false)
+SVE_VECTOR_TYPE_MFLOAT("__clang_svmfloat8x2_t", "svmfloat8x2_t", SveMFloat8x2, SveMFloat8x2Ty, 16, 8, 2)
//
// x3
@@ -170,7 +179,7 @@ SVE_VECTOR_TYPE_FLOAT("__clang_svfloat64x3_t", "svfloat64x3_t", SveFloat64x3, Sv
SVE_VECTOR_TYPE_BFLOAT("__clang_svbfloat16x3_t", "svbfloat16x3_t", SveBFloat16x3, SveBFloat16x3Ty, 8, 16, 3)
-SVE_VECTOR_TYPE_INT("__clang_svmfloat8x3_t", "svmfloat8x3_t", SveMFloat8x3, SveMFloat8x3Ty, 16, 8, 3, false)
+SVE_VECTOR_TYPE_MFLOAT("__clang_svmfloat8x3_t", "svmfloat8x3_t", SveMFloat8x3, SveMFloat8x3Ty, 16, 8, 3)
//
// x4
@@ -192,7 +201,7 @@ SVE_VECTOR_TYPE_FLOAT("__clang_svfloat64x4_t", "svfloat64x4_t", SveFloat64x4, Sv
SVE_VECTOR_TYPE_BFLOAT("__clang_svbfloat16x4_t", "svbfloat16x4_t", SveBFloat16x4, SveBFloat16x4Ty, 8, 16, 4)
-SVE_VECTOR_TYPE_INT("__clang_svmfloat8x4_t", "svmfloat8x4_t", SveMFloat8x4, SveMFloat8x4Ty, 16, 8, 4, false)
+SVE_VECTOR_TYPE_MFLOAT("__clang_svmfloat8x4_t", "svmfloat8x4_t", SveMFloat8x4, SveMFloat8x4Ty, 16, 8, 4)
SVE_PREDICATE_TYPE_ALL("__SVBool_t", "__SVBool_t", SveBool, SveBoolTy, 16, 1)
SVE_PREDICATE_TYPE_ALL("__clang_svboolx2_t", "svboolx2_t", SveBoolx2, SveBoolx2Ty, 16, 2)
@@ -200,11 +209,13 @@ SVE_PREDICATE_TYPE_ALL("__clang_svboolx4_t", "svboolx4_t", SveBoolx4, SveBoolx4T
SVE_OPAQUE_TYPE("__SVCount_t", "__SVCount_t", SveCount, SveCountTy)
-AARCH64_VECTOR_TYPE_MFLOAT("__mfp8", "__mfp8", MFloat8, MFloat8Ty, 1, 8, 1)
+SVE_SCALAR_TYPE("__mfp8", "__mfp8", MFloat8, MFloat8Ty, 8)
+
AARCH64_VECTOR_TYPE_MFLOAT("__MFloat8x8_t", "__MFloat8x8_t", MFloat8x8, MFloat8x8Ty, 8, 8, 1)
AARCH64_VECTOR_TYPE_MFLOAT("__MFloat8x16_t", "__MFloat8x16_t", MFloat8x16, MFloat8x16Ty, 16, 8, 1)
#undef SVE_VECTOR_TYPE
+#undef SVE_VECTOR_TYPE_MFLOAT
#undef SVE_VECTOR_TYPE_BFLOAT
#undef SVE_VECTOR_TYPE_FLOAT
#undef SVE_VECTOR_TYPE_INT
@@ -213,4 +224,5 @@ AARCH64_VECTOR_TYPE_MFLOAT("__MFloat8x16_t", "__MFloat8x16_t", MFloat8x16, MFloa
#undef SVE_OPAQUE_TYPE
#undef AARCH64_VECTOR_TYPE_MFLOAT
#undef AARCH64_VECTOR_TYPE
+#undef SVE_SCALAR_TYPE
#undef SVE_TYPE
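
With the defaults above, a consumer of the .def file that only defines
`SVE_TYPE` sees the new entry as `SVE_TYPE("__mfp8", MFloat8, MFloat8Ty)`;
consumers that care about the size override `SVE_SCALAR_TYPE` directly, as
`ASTContext::getTypeInfoImpl` does below with `Width = Align = Bits` (i.e.
8 bits for `__mfp8`).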
diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp
index a9ecb4ee9c76b2..033746f7c933f5 100644
--- a/clang/lib/AST/ASTContext.cpp
+++ b/clang/lib/AST/ASTContext.cpp
@@ -2275,6 +2275,11 @@ TypeInfo ASTContext::getTypeInfoImpl(const Type *T) const {
Width = NumEls * ElBits * NF; \
Align = NumEls * ElBits; \
break;
+#define SVE_SCALAR_TYPE(Name, MangledName, Id, SingletonId, Bits) \
+ case BuiltinType::Id: \
+ Width = Bits; \
+ Align = Bits; \
+ break;
#include "clang/Basic/AArch64SVEACLETypes.def"
#define PPC_VECTOR_TYPE(Name, Id, Size) \
case BuiltinType::Id: \
@@ -4395,15 +4400,18 @@ ASTContext::getBuiltinVectorTypeInfo(const BuiltinType *Ty) const {
ElBits, NF) \
case BuiltinType::Id: \
return {BFloat16Ty, llvm::ElementCount::getScalable(NumEls), NF};
+#define SVE_VECTOR_TYPE_MFLOAT(Name, MangledName, Id, SingletonId, NumEls, \
+ ElBits, NF) \
+ case BuiltinType::Id: \
+ return {MFloat8Ty, llvm::ElementCount::getScalable(NumEls), NF};
#define SVE_PREDICATE_TYPE_ALL(Name, MangledName, Id, SingletonId, NumEls, NF) \
case BuiltinType::Id: \
return {BoolTy, llvm::ElementCount::getScalable(NumEls), NF};
#define AARCH64_VECTOR_TYPE_MFLOAT(Name, MangledName, Id, SingletonId, NumEls, \
ElBits, NF) \
case BuiltinType::Id: \
- return {getIntTypeForBitwidth(ElBits, false), \
- llvm::ElementCount::getFixed(NumEls), NF};
-#define SVE_OPAQUE_TYPE(Name, MangledName, Id, SingletonId)
+ return {MFloat8Ty, llvm::ElementCount::getFixed(NumEls), NF};
+#define SVE_TYPE(Name, Id, SingletonId)
#include "clang/Basic/AArch64SVEACLETypes.def"
#define RVV_VECTOR_TYPE_INT(Name, Id, SingletonId, NumEls, ElBits, NF, \
@@ -4465,11 +4473,16 @@ QualType ASTContext::getScalableVectorType(QualType EltTy, unsigned NumElts,
EltTySize == ElBits && NumElts == (NumEls * NF) && NumFields == 1) { \
return SingletonId; \
}
+#define SVE_VECTOR_TYPE_MFLOAT(Name, MangledName, Id, SingletonId, NumEls, \
+ ElBits, NF) \
+ if (EltTy->isMFloat8Type() && EltTySize == ElBits && \
+ NumElts == (NumEls * NF) && NumFields == 1) { \
+ return SingletonId; \
+ }
#define SVE_PREDICATE_TYPE_ALL(Name, MangledName, Id, SingletonId, NumEls, NF) \
if (EltTy->isBooleanType() && NumElts == (NumEls * NF) && NumFields == 1) \
return SingletonId;
-#define SVE_OPAQUE_TYPE(Name, MangledName, Id, SingletonId)
-#define AARCH64_VECTOR_TYPE(Name, MangledName, Id, SingletonId)
+#define SVE_TYPE(Name, Id, SingletonId)
#include "clang/Basic/AArch64SVEACLETypes.def"
} else if (Target->hasRISCVVTypes()) {
uint64_t EltTySize = getTypeSize(EltTy);
@@ -12216,8 +12229,15 @@ static QualType DecodeTypeFromStr(const char *&Str, const ASTContext &Context,
RequiresICE, false);
assert(!RequiresICE && "Can't require vector ICE");
- // TODO: No way to make AltiVec vectors in builtins yet.
- Type = Context.getVectorType(ElementType, NumElements, VectorKind::Generic);
+ if (ElementType == Context.MFloat8Ty) {
+ assert((NumElements == 8 || NumElements == 16) &&
+ "Invalid number of elements");
+ Type = NumElements == 8 ? Context.MFloat8x8Ty : Context.MFloat8x16Ty;
+ } else {
+ // TODO: No way to make AltiVec vectors in builtins yet.
+ Type =
+ Context.getVectorType(ElementType, NumElements, VectorKind::Generic);
+ }
break;
}
case 'E': {
@@ -12273,6 +12293,9 @@ static QualType DecodeTypeFromStr(const char *&Str, const ASTContext &Context,
case 'p':
Type = Context.getProcessIDType();
break;
+ case 'm':
+ Type = Context.MFloat8Ty;
+ break;
}
// If there are modifiers and if we're allowed to parse them, go for it.
diff --git a/clang/lib/AST/ItaniumMangle.cpp b/clang/lib/AST/ItaniumMangle.cpp
index 47aa9b40dab845..9404f9fd9b151d 100644
--- a/clang/lib/AST/ItaniumMangle.cpp
+++ b/clang/lib/AST/ItaniumMangle.cpp
@@ -3438,6 +3438,11 @@ void CXXNameMangler::mangleType(const BuiltinType *T) {
type_name = MangledName; \
Out << (type_name == Name ? "u" : "") << type_name.size() << type_name; \
break;
+#define SVE_SCALAR_TYPE(Name, MangledName, Id, SingletonId, Bits) \
+ case BuiltinType::Id: \
+ type_name = MangledName; \
+ Out << (type_name == Name ? "u" : "") << type_name.size() << type_name; \
+ break;
#include "clang/Basic/AArch64SVEACLETypes.def"
#define PPC_VECTOR_TYPE(Name, Id, Size) \
case BuiltinType::Id: \
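
As a worked example of the new mangling case: `__mfp8` has Name equal to
MangledName, so the branch emits the vendor-extended form, "u" followed by the
length-prefixed name, giving `u6__mfp8`. A function `void f(__mfp8)` therefore
mangles as `_Z1fu6__mfp8`.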
diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp
index caa0ac858a1bea..fde0746a175705 100644
--- a/clang/lib/AST/Type.cpp
+++ b/clang/lib/AST/Type.cpp
@@ -2527,9 +2527,7 @@ bool Type::isSVESizelessBuiltinType() const {
#define SVE_PREDICATE_TYPE(Name, MangledName, Id, SingletonId) \
case BuiltinType::Id: \
return true;
-#define AARCH64_VECTOR_TYPE(Name, MangledName, Id, SingletonId) \
- case BuiltinType::Id: \
- return false;
+#define SVE_TYPE(Name, Id, SingletonId)
#include "clang/Basic/AArch64SVEACLETypes.def"
default:
return false;
diff --git a/clang/lib/CodeGen/CodeGenTypes.cpp b/clang/lib/CodeGen/CodeGenTypes.cpp
index 09191a4901f493..fd3327cf9acd89 100644
--- a/clang/lib/CodeGen/CodeGenTypes.cpp
+++ b/clang/lib/CodeGen/CodeGenTypes.cpp
@@ -507,13 +507,15 @@ llvm::Type *CodeGenTypes::ConvertType(QualType T) {
case BuiltinType::Id:
#define AARCH64_VECTOR_TYPE(Name, MangledName, Id, SingletonId) \
case BuiltinType::Id:
-#define SVE_OPAQUE_TYPE(Name, MangledName, Id, SingletonId)
+#define SVE_TYPE(Name, Id, SingletonId)
#include "clang/Basic/AArch64SVEACLETypes.def"
{
ASTContext::BuiltinVectorTypeInfo Info =
Context.getBuiltinVectorTypeInfo(cast<BuiltinType>(Ty));
- auto VTy =
- llvm::VectorType::get(ConvertType(Info.ElementType), Info.EC);
+ auto *EltTy = Info.ElementType->isMFloat8Type()
+ ? llvm::Type::getInt8Ty(getLLVMContext())
+ : ConvertType(Info.ElementType);
+ auto *VTy = llvm::VectorType::get(EltTy, Info.EC);
switch (Info.NumVectors) {
default:
llvm_unreachable("Expected 1, 2, 3 or 4 vectors!");
@@ -529,6 +531,9 @@ llvm::Type *CodeGenTypes::ConvertType(QualType T) {
}
case BuiltinType::SveCount:
return llvm::TargetExtType::get(getLLVMContext(), "aarch64.svcount");
+ case BuiltinType::MFloat8:
+ return llvm::VectorType::get(llvm::Type::getInt8Ty(getLLVMContext()), 1,
+ false);
#define PPC_VECTOR_TYPE(Name, Id, Size) \
case BuiltinType::Id: \
ResultType = \
@@ -650,6 +655,8 @@ llvm::Type *CodeGenTypes::ConvertType(QualType T) {
// An ext_vector_type of Bool is really a vector of bits.
llvm::Type *IRElemTy = VT->isExtVectorBoolType()
? llvm::Type::getInt1Ty(getLLVMContext())
+ : VT->getElementType()->isMFloat8Type()
+ ? llvm::Type::getInt8Ty(getLLVMContext())
: ConvertType(VT->getElementType());
ResultType = llvm::FixedVectorType::get(IRElemTy, VT->getNumElements());
break;
diff --git a/clang/lib/CodeGen/Targets/AArch64.cpp b/clang/lib/CodeGen/Targets/AArch64.cpp
index 7db67ecba07c8f..7158b94a2b3d6a 100644
--- a/clang/lib/CodeGen/Targets/AArch64.cpp
+++ b/clang/lib/CodeGen/Targets/AArch64.cpp
@@ -244,6 +244,7 @@ AArch64ABIInfo::convertFixedToScalableVectorType(const VectorType *VT) const {
case BuiltinType::SChar:
case BuiltinType::UChar:
+ case BuiltinType::MFloat8:
return llvm::ScalableVectorType::get(
llvm::Type::getInt8Ty(getVMContext()), 16);
@@ -781,8 +782,10 @@ bool AArch64ABIInfo::passAsPureScalableType(
NPred += Info.NumVectors;
else
NVec += Info.NumVectors;
- auto VTy = llvm::ScalableVectorType::get(CGT.ConvertType(Info.ElementType),
- Info.EC.getKnownMinValue());
+ llvm::Type *EltTy = Info.ElementType->isMFloat8Type()
+ ? llvm::Type::getInt8Ty(getVMContext())
+ : CGT.ConvertType(Info.ElementType);
+ auto *VTy = llvm::ScalableVectorType::get(EltTy, Info.EC.getKnownMinValue());
if (CoerceToSeq.size() + Info.NumVectors > 12)
return false;
diff --git a/clang/utils/TableGen/SveEmitter.cpp b/clang/utils/TableGen/SveEmitter.cpp
index cf7e5a1ee3e008..bea6db287c98e1 100644
--- a/clang/utils/TableGen/SveEmitter.cpp
+++ b/clang/utils/TableGen/SveEmitter.cpp
@@ -448,7 +448,7 @@ std::string SVEType::builtinBaseType() const {
case TypeKind::PredicatePattern:
return "i";
case TypeKind::Fpm:
- return "Wi";
+ return "UWi";
case TypeKind::Predicate:
return "b";
case TypeKind::BFloat16:
@@ -456,7 +456,7 @@ std::string SVEType::builtinBaseType() const {
return "y";
case TypeKind::MFloat8:
assert(ElementBitwidth == 8 && "Invalid MFloat8!");
- return "c";
+ return "m";
case TypeKind::Float:
switch (ElementBitwidth) {
case 16:
>From e845e643f5cb69265a43120d4662a1fd9c440ffa Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov at arm.com>
Date: Fri, 6 Dec 2024 15:44:58 +0000
Subject: [PATCH 05/11] [Clang][AArch64] Allow FP8 Neon vector types to be used
by __builtin_shufflevector
The Neon vector types for FP8 (`__MFloat8x8_t` and `__MFloat8x16_t`) are
implemented as builtin types and need a special case in
`__builtin_shufflevector`.
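
A minimal usage sketch (the typedef mirrors the tests added below; the builtin
type itself is `__MFloat8x8_t`):

  typedef __MFloat8x8_t mfloat8x8_t;

  // Constant-mask shuffle: swap the two 4-element halves.
  mfloat8x8_t swap_halves(mfloat8x8_t x) {
    return __builtin_shufflevector(x, x, 4, 5, 6, 7, 0, 1, 2, 3);
  }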
---
clang/include/clang/AST/Type.h | 4 +
.../clang/Basic/DiagnosticSemaKinds.td | 5 +
clang/lib/AST/Type.cpp | 13 ++
clang/lib/Sema/SemaChecking.cpp | 39 +++++-
.../AArch64/builtin-shufflevector-fp8.c | 123 ++++++++++++++++++
clang/test/Sema/builtin-shufflevector.c | 30 +++++
6 files changed, 208 insertions(+), 6 deletions(-)
create mode 100644 clang/test/CodeGen/AArch64/builtin-shufflevector-fp8.c
create mode 100644 clang/test/Sema/builtin-shufflevector.c
diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h
index aa313719a65755..fbc62f61ad5a55 100644
--- a/clang/include/clang/AST/Type.h
+++ b/clang/include/clang/AST/Type.h
@@ -2404,6 +2404,10 @@ class alignas(TypeAlignment) Type : public ExtQualsTypeCommonBase {
/// SVE vector or predicate, excluding tuple types such as svint32x4_t.
bool isSveVLSBuiltinType() const;
+ /// Determines if this is a *builtin* NEON vector type, i.e. one declared as
+ /// a builtin type rather than via the `neon_vector_type` attribute.
+ bool isNeonVectorBuiltinType() const;
+
/// Returns the representative type for the element of an SVE builtin type.
/// This is used to represent fixed-length SVE vectors created with the
/// 'arm_sve_vector_bits' type attribute as VectorType.
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 03fb7ca9bc3c3b..d3cc1c606c26f3 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -10560,6 +10560,9 @@ def err_vec_builtin_incompatible_vector : Error<
def err_vsx_builtin_nonconstant_argument : Error<
"argument %0 to %1 must be a 2-bit unsigned literal (i.e. 0, 1, 2 or 3)">;
+def err_shufflevector_incompatible_index_vector : Error<
+ "second argument for __builtin_shufflevector must be an integer vector "
+ "with length equal to the length of the first argument">;
def err_shufflevector_nonconstant_argument : Error<
"index for __builtin_shufflevector must be a constant integer">;
def err_shufflevector_argument_too_large : Error<
@@ -10567,6 +10570,8 @@ def err_shufflevector_argument_too_large : Error<
"of vector elements">;
def err_shufflevector_minus_one_is_undefined_behavior_constexpr : Error<
"index for __builtin_shufflevector not within the bounds of the input vectors; index of -1 found at position %0 is not permitted in a constexpr context">;
+def err_shufflevector_unsupported_result_vector_type : Error<
+ "unsupported vector type for the result">;
def err_convertvector_non_vector : Error<
"first argument to __builtin_convertvector must be a vector">;
diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp
index fde0746a175705..fb55bab0a67de6 100644
--- a/clang/lib/AST/Type.cpp
+++ b/clang/lib/AST/Type.cpp
@@ -2576,6 +2576,19 @@ bool Type::isSveVLSBuiltinType() const {
return false;
}
+bool Type::isNeonVectorBuiltinType() const {
+ if (const BuiltinType *BT = getAs<BuiltinType>()) {
+ switch (BT->getKind()) {
+ case BuiltinType::MFloat8x8:
+ case BuiltinType::MFloat8x16:
+ return true;
+ default:
+ return false;
+ }
+ }
+ return false;
+}
+
QualType Type::getSizelessVectorEltType(const ASTContext &Ctx) const {
assert(isSizelessVectorType() && "Must be sizeless vector type");
// Currently supports SVE and RVV
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index ce846ae88c38b4..070e0c80c4f5ca 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -5127,24 +5127,32 @@ ExprResult Sema::BuiltinShuffleVector(CallExpr *TheCall) {
QualType LHSType = TheCall->getArg(0)->getType();
QualType RHSType = TheCall->getArg(1)->getType();
- if (!LHSType->isVectorType() || !RHSType->isVectorType())
+ if (!LHSType->isVectorType() && !LHSType->isNeonVectorBuiltinType())
return ExprError(
- Diag(TheCall->getBeginLoc(), diag::err_vec_builtin_non_vector)
- << TheCall->getDirectCallee() << /*isMorethantwoArgs*/ false
+ Diag(TheCall->getBeginLoc(), diag::err_builtin_non_vector_type)
+ << "first" << TheCall->getDirectCallee()
+ << /*isMorethantwoArgs*/ false
<< SourceRange(TheCall->getArg(0)->getBeginLoc(),
TheCall->getArg(1)->getEndLoc()));
- numElements = LHSType->castAs<VectorType>()->getNumElements();
+ if (auto *Ty = LHSType->getAs<BuiltinType>()) {
+ assert(Ty->getKind() == BuiltinType::MFloat8x8 ||
+ Ty->getKind() == BuiltinType::MFloat8x16);
+ numElements = Ty->getKind() == BuiltinType::MFloat8x8 ? 8 : 16;
+ } else {
+ numElements = LHSType->castAs<VectorType>()->getNumElements();
+ }
+
unsigned numResElements = TheCall->getNumArgs() - 2;
// Check to see if we have a call with 2 vector arguments, the unary shuffle
// with mask. If so, verify that RHS is an integer vector type with the
// same number of elts as lhs.
if (TheCall->getNumArgs() == 2) {
- if (!RHSType->hasIntegerRepresentation() ||
+ if (!RHSType->isVectorType() || !RHSType->hasIntegerRepresentation() ||
RHSType->castAs<VectorType>()->getNumElements() != numElements)
return ExprError(Diag(TheCall->getBeginLoc(),
- diag::err_vec_builtin_incompatible_vector)
+ diag::err_shufflevector_incompatible_index_vector)
<< TheCall->getDirectCallee()
<< /*isMorethantwoArgs*/ false
<< SourceRange(TheCall->getArg(1)->getBeginLoc(),
@@ -5157,6 +5165,25 @@ ExprResult Sema::BuiltinShuffleVector(CallExpr *TheCall) {
<< SourceRange(TheCall->getArg(0)->getBeginLoc(),
TheCall->getArg(1)->getEndLoc()));
} else if (numElements != numResElements) {
+ if (auto *Ty = LHSType->getAs<BuiltinType>()) {
+ assert(Ty->getKind() == BuiltinType::MFloat8x8 ||
+ Ty->getKind() == BuiltinType::MFloat8x16);
+ switch (numResElements) {
+ case 8:
+ resType = Context.MFloat8x8Ty;
+ break;
+ case 16:
+ resType = Context.MFloat8x16Ty;
+ break;
+ default:
+ return ExprError(Diag(TheCall->getBeginLoc(),
+ diag::err_shufflevector_unsupported_result_vector_type)
+ << TheCall->getDirectCallee()
+ << /*isMorethantwoArgs*/ false
+ << SourceRange(TheCall->getArg(0)->getBeginLoc(),
+ TheCall->getArg(1)->getEndLoc()));
+ }
+ }
QualType eltType = LHSType->castAs<VectorType>()->getElementType();
resType =
Context.getVectorType(eltType, numResElements, VectorKind::Generic);
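
One detail worth spelling out for the two-argument (mask) form: each index is
masked to the vector length, so for an 8-element vector the result is
result[i] = x[p[i] & 7]. This shows up in the CHECK lines of the new test below
as the `and <8 x i8> [[P]], splat (i8 7)` preceding the per-element
extract/insert sequence.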
diff --git a/clang/test/CodeGen/AArch64/builtin-shufflevector-fp8.c b/clang/test/CodeGen/AArch64/builtin-shufflevector-fp8.c
new file mode 100644
index 00000000000000..45ea8127509537
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/builtin-shufflevector-fp8.c
@@ -0,0 +1,123 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// RUN: %clang_cc1 -triple aarch64-linux -target-feature +neon -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+
+// REQUIRES: aarch64-registered-target
+
+typedef __attribute__((neon_vector_type(8))) signed char int8x8_t;
+typedef __attribute__((neon_vector_type(16))) signed char int8x16_t;
+
+typedef __MFloat8x8_t mfloat8x8_t;
+typedef __MFloat8x16_t mfloat8x16_t;
+
+// CHECK-LABEL: define dso_local <8 x i8> @f0(
+// CHECK-SAME: <8 x i8> [[X:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i8> [[X]], <8 x i8> [[X]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE]]
+//
+mfloat8x8_t f0(mfloat8x8_t x) {
+ return __builtin_shufflevector(x, x, 3, 2, 1, 0, 3, 2, 1, 0);
+}
+
+// CHECK-LABEL: define dso_local <8 x i8> @f1(
+// CHECK-SAME: <8 x i8> [[X:%.*]], <8 x i8> noundef [[P:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MASK:%.*]] = and <8 x i8> [[P]], splat (i8 7)
+// CHECK-NEXT: [[SHUF_IDX:%.*]] = extractelement <8 x i8> [[MASK]], i64 0
+// CHECK-NEXT: [[SHUF_ELT:%.*]] = extractelement <8 x i8> [[X]], i8 [[SHUF_IDX]]
+// CHECK-NEXT: [[SHUF_INS:%.*]] = insertelement <8 x i8> poison, i8 [[SHUF_ELT]], i64 0
+// CHECK-NEXT: [[SHUF_IDX1:%.*]] = extractelement <8 x i8> [[MASK]], i64 1
+// CHECK-NEXT: [[SHUF_ELT2:%.*]] = extractelement <8 x i8> [[X]], i8 [[SHUF_IDX1]]
+// CHECK-NEXT: [[SHUF_INS3:%.*]] = insertelement <8 x i8> [[SHUF_INS]], i8 [[SHUF_ELT2]], i64 1
+// CHECK-NEXT: [[SHUF_IDX4:%.*]] = extractelement <8 x i8> [[MASK]], i64 2
+// CHECK-NEXT: [[SHUF_ELT5:%.*]] = extractelement <8 x i8> [[X]], i8 [[SHUF_IDX4]]
+// CHECK-NEXT: [[SHUF_INS6:%.*]] = insertelement <8 x i8> [[SHUF_INS3]], i8 [[SHUF_ELT5]], i64 2
+// CHECK-NEXT: [[SHUF_IDX7:%.*]] = extractelement <8 x i8> [[MASK]], i64 3
+// CHECK-NEXT: [[SHUF_ELT8:%.*]] = extractelement <8 x i8> [[X]], i8 [[SHUF_IDX7]]
+// CHECK-NEXT: [[SHUF_INS9:%.*]] = insertelement <8 x i8> [[SHUF_INS6]], i8 [[SHUF_ELT8]], i64 3
+// CHECK-NEXT: [[SHUF_IDX10:%.*]] = extractelement <8 x i8> [[MASK]], i64 4
+// CHECK-NEXT: [[SHUF_ELT11:%.*]] = extractelement <8 x i8> [[X]], i8 [[SHUF_IDX10]]
+// CHECK-NEXT: [[SHUF_INS12:%.*]] = insertelement <8 x i8> [[SHUF_INS9]], i8 [[SHUF_ELT11]], i64 4
+// CHECK-NEXT: [[SHUF_IDX13:%.*]] = extractelement <8 x i8> [[MASK]], i64 5
+// CHECK-NEXT: [[SHUF_ELT14:%.*]] = extractelement <8 x i8> [[X]], i8 [[SHUF_IDX13]]
+// CHECK-NEXT: [[SHUF_INS15:%.*]] = insertelement <8 x i8> [[SHUF_INS12]], i8 [[SHUF_ELT14]], i64 5
+// CHECK-NEXT: [[SHUF_IDX16:%.*]] = extractelement <8 x i8> [[MASK]], i64 6
+// CHECK-NEXT: [[SHUF_ELT17:%.*]] = extractelement <8 x i8> [[X]], i8 [[SHUF_IDX16]]
+// CHECK-NEXT: [[SHUF_INS18:%.*]] = insertelement <8 x i8> [[SHUF_INS15]], i8 [[SHUF_ELT17]], i64 6
+// CHECK-NEXT: [[SHUF_IDX19:%.*]] = extractelement <8 x i8> [[MASK]], i64 7
+// CHECK-NEXT: [[SHUF_ELT20:%.*]] = extractelement <8 x i8> [[X]], i8 [[SHUF_IDX19]]
+// CHECK-NEXT: [[SHUF_INS21:%.*]] = insertelement <8 x i8> [[SHUF_INS18]], i8 [[SHUF_ELT20]], i64 7
+// CHECK-NEXT: ret <8 x i8> [[SHUF_INS21]]
+//
+mfloat8x8_t f1(mfloat8x8_t x, int8x8_t p) {
+ return __builtin_shufflevector(x, p);
+}
+
+// CHECK-LABEL: define dso_local <16 x i8> @f3(
+// CHECK-SAME: <16 x i8> [[X:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <16 x i8> [[X]], <16 x i8> [[X]], <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE]]
+//
+mfloat8x16_t f3(mfloat8x16_t x) {
+ return __builtin_shufflevector(x, x, 7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2,
+ 1, 0);
+}
+
+// CHECK-LABEL: define dso_local <16 x i8> @f4(
+// CHECK-SAME: <16 x i8> [[X:%.*]], <16 x i8> noundef [[P:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MASK:%.*]] = and <16 x i8> [[P]], splat (i8 15)
+// CHECK-NEXT: [[SHUF_IDX:%.*]] = extractelement <16 x i8> [[MASK]], i64 0
+// CHECK-NEXT: [[SHUF_ELT:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX]]
+// CHECK-NEXT: [[SHUF_INS:%.*]] = insertelement <16 x i8> poison, i8 [[SHUF_ELT]], i64 0
+// CHECK-NEXT: [[SHUF_IDX1:%.*]] = extractelement <16 x i8> [[MASK]], i64 1
+// CHECK-NEXT: [[SHUF_ELT2:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX1]]
+// CHECK-NEXT: [[SHUF_INS3:%.*]] = insertelement <16 x i8> [[SHUF_INS]], i8 [[SHUF_ELT2]], i64 1
+// CHECK-NEXT: [[SHUF_IDX4:%.*]] = extractelement <16 x i8> [[MASK]], i64 2
+// CHECK-NEXT: [[SHUF_ELT5:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX4]]
+// CHECK-NEXT: [[SHUF_INS6:%.*]] = insertelement <16 x i8> [[SHUF_INS3]], i8 [[SHUF_ELT5]], i64 2
+// CHECK-NEXT: [[SHUF_IDX7:%.*]] = extractelement <16 x i8> [[MASK]], i64 3
+// CHECK-NEXT: [[SHUF_ELT8:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX7]]
+// CHECK-NEXT: [[SHUF_INS9:%.*]] = insertelement <16 x i8> [[SHUF_INS6]], i8 [[SHUF_ELT8]], i64 3
+// CHECK-NEXT: [[SHUF_IDX10:%.*]] = extractelement <16 x i8> [[MASK]], i64 4
+// CHECK-NEXT: [[SHUF_ELT11:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX10]]
+// CHECK-NEXT: [[SHUF_INS12:%.*]] = insertelement <16 x i8> [[SHUF_INS9]], i8 [[SHUF_ELT11]], i64 4
+// CHECK-NEXT: [[SHUF_IDX13:%.*]] = extractelement <16 x i8> [[MASK]], i64 5
+// CHECK-NEXT: [[SHUF_ELT14:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX13]]
+// CHECK-NEXT: [[SHUF_INS15:%.*]] = insertelement <16 x i8> [[SHUF_INS12]], i8 [[SHUF_ELT14]], i64 5
+// CHECK-NEXT: [[SHUF_IDX16:%.*]] = extractelement <16 x i8> [[MASK]], i64 6
+// CHECK-NEXT: [[SHUF_ELT17:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX16]]
+// CHECK-NEXT: [[SHUF_INS18:%.*]] = insertelement <16 x i8> [[SHUF_INS15]], i8 [[SHUF_ELT17]], i64 6
+// CHECK-NEXT: [[SHUF_IDX19:%.*]] = extractelement <16 x i8> [[MASK]], i64 7
+// CHECK-NEXT: [[SHUF_ELT20:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX19]]
+// CHECK-NEXT: [[SHUF_INS21:%.*]] = insertelement <16 x i8> [[SHUF_INS18]], i8 [[SHUF_ELT20]], i64 7
+// CHECK-NEXT: [[SHUF_IDX22:%.*]] = extractelement <16 x i8> [[MASK]], i64 8
+// CHECK-NEXT: [[SHUF_ELT23:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX22]]
+// CHECK-NEXT: [[SHUF_INS24:%.*]] = insertelement <16 x i8> [[SHUF_INS21]], i8 [[SHUF_ELT23]], i64 8
+// CHECK-NEXT: [[SHUF_IDX25:%.*]] = extractelement <16 x i8> [[MASK]], i64 9
+// CHECK-NEXT: [[SHUF_ELT26:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX25]]
+// CHECK-NEXT: [[SHUF_INS27:%.*]] = insertelement <16 x i8> [[SHUF_INS24]], i8 [[SHUF_ELT26]], i64 9
+// CHECK-NEXT: [[SHUF_IDX28:%.*]] = extractelement <16 x i8> [[MASK]], i64 10
+// CHECK-NEXT: [[SHUF_ELT29:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX28]]
+// CHECK-NEXT: [[SHUF_INS30:%.*]] = insertelement <16 x i8> [[SHUF_INS27]], i8 [[SHUF_ELT29]], i64 10
+// CHECK-NEXT: [[SHUF_IDX31:%.*]] = extractelement <16 x i8> [[MASK]], i64 11
+// CHECK-NEXT: [[SHUF_ELT32:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX31]]
+// CHECK-NEXT: [[SHUF_INS33:%.*]] = insertelement <16 x i8> [[SHUF_INS30]], i8 [[SHUF_ELT32]], i64 11
+// CHECK-NEXT: [[SHUF_IDX34:%.*]] = extractelement <16 x i8> [[MASK]], i64 12
+// CHECK-NEXT: [[SHUF_ELT35:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX34]]
+// CHECK-NEXT: [[SHUF_INS36:%.*]] = insertelement <16 x i8> [[SHUF_INS33]], i8 [[SHUF_ELT35]], i64 12
+// CHECK-NEXT: [[SHUF_IDX37:%.*]] = extractelement <16 x i8> [[MASK]], i64 13
+// CHECK-NEXT: [[SHUF_ELT38:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX37]]
+// CHECK-NEXT: [[SHUF_INS39:%.*]] = insertelement <16 x i8> [[SHUF_INS36]], i8 [[SHUF_ELT38]], i64 13
+// CHECK-NEXT: [[SHUF_IDX40:%.*]] = extractelement <16 x i8> [[MASK]], i64 14
+// CHECK-NEXT: [[SHUF_ELT41:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX40]]
+// CHECK-NEXT: [[SHUF_INS42:%.*]] = insertelement <16 x i8> [[SHUF_INS39]], i8 [[SHUF_ELT41]], i64 14
+// CHECK-NEXT: [[SHUF_IDX43:%.*]] = extractelement <16 x i8> [[MASK]], i64 15
+// CHECK-NEXT: [[SHUF_ELT44:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX43]]
+// CHECK-NEXT: [[SHUF_INS45:%.*]] = insertelement <16 x i8> [[SHUF_INS42]], i8 [[SHUF_ELT44]], i64 15
+// CHECK-NEXT: ret <16 x i8> [[SHUF_INS45]]
+//
+mfloat8x16_t f4(mfloat8x16_t x, int8x16_t p) {
+ return __builtin_shufflevector(x, p);
+}
diff --git a/clang/test/Sema/builtin-shufflevector.c b/clang/test/Sema/builtin-shufflevector.c
new file mode 100644
index 00000000000000..c2dabb9d6585a2
--- /dev/null
+++ b/clang/test/Sema/builtin-shufflevector.c
@@ -0,0 +1,30 @@
+// RUN: %clang_cc1 -triple aarch64 -fsyntax-only -verify %s
+
+// REQUIRES: aarch64-registered-target
+
+typedef __attribute__((neon_vector_type(8))) signed char int8x8_t;
+typedef __attribute__((neon_vector_type(16))) signed char int8x16_t;
+
+typedef __MFloat8x8_t mfloat8x8_t;
+typedef __MFloat8x16_t mfloat8x16_t;
+
+int8x8_t non_vector(int x) {
+ return __builtin_shufflevector(x, x, 3, 2, 1, 0, 3, 2, 1, 0);
+ // expected-error@-1 {{first argument to '__builtin_shufflevector' must be of vector type}}
+}
+
+mfloat8x8_t unsupported_vector(mfloat8x8_t x) {
+ return __builtin_shufflevector(x, x, 3, 2, 1, 0, 3, 2, 1, 0, 0);
+ // expected-error@-1 {{unsupported vector type for the result}}
+}
+
+int8x8_t non_vector_index(int8x8_t x, int p) {
+ return __builtin_shufflevector(x, p);
+ // expected-error@-1 {{second argument for __builtin_shufflevector must be an integer vector with length equal to the length of the first argument}}
+}
+
+int8x8_t bad_vector_index_length(int8x8_t x, int8x16_t p) {
+ return __builtin_shufflevector(x, p);
+ // expected-error@-1 {{second argument for __builtin_shufflevector must be an integer vector with length equal to the length of the first argument}}
+}
+
>From dbfe4c9ed5cceb5936b2353ec65ddb5ac44e40fc Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov at arm.com>
Date: Mon, 16 Dec 2024 09:58:22 +0000
Subject: [PATCH 06/11] [fixup] Fix formatting (NFC)
---
clang/lib/Sema/SemaChecking.cpp | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index 070e0c80c4f5ca..dadc5b8d4d13be 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -5176,12 +5176,12 @@ ExprResult Sema::BuiltinShuffleVector(CallExpr *TheCall) {
resType = Context.MFloat8x16Ty;
break;
default:
- return ExprError(Diag(TheCall->getBeginLoc(),
- diag::err_shufflevector_unsupported_result_vector_type)
- << TheCall->getDirectCallee()
- << /*isMorethantwoArgs*/ false
- << SourceRange(TheCall->getArg(0)->getBeginLoc(),
- TheCall->getArg(1)->getEndLoc()));
+ return ExprError(
+ Diag(TheCall->getBeginLoc(),
+ diag::err_shufflevector_unsupported_result_vector_type)
+ << TheCall->getDirectCallee() << /*isMorethantwoArgs*/ false
+ << SourceRange(TheCall->getArg(0)->getBeginLoc(),
+ TheCall->getArg(1)->getEndLoc()));
}
}
QualType eltType = LHSType->castAs<VectorType>()->getElementType();
>From ed401d77db00e9245e01b20572004ecbe05b5647 Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov at arm.com>
Date: Fri, 6 Dec 2024 19:24:16 +0000
Subject: [PATCH 07/11] [AArch64] Add Neon FP8 conversion intrinsics
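A sketch of the intended C-level usage, with signatures read off the
arm_neon.td prototypes below (so indicative rather than authoritative):

  #include <arm_neon.h>

  // FP8 -> BF16: convert eight FP8 elements, with the source FP8 format
  // taken from the FPMR value passed in fpm.
  bfloat16x8_t cvt1(mfloat8x8_t v, fpm_t fpm) {
    return vcvt1_bf16_mf8_fpm(v, fpm);
  }

  // Two float32x4_t -> one mfloat8x8_t narrowing convert (FCVTN).
  mfloat8x8_t narrow(float32x4_t lo, float32x4_t hi, fpm_t fpm) {
    return vcvt_mf8_f32_fpm(lo, hi, fpm);
  }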
---
clang/include/clang/Basic/arm_neon.td | 25 +++-
clang/include/clang/Basic/arm_neon_incl.td | 2 +
clang/lib/CodeGen/CGBuiltin.cpp | 120 +++++++++++++++++-
clang/lib/CodeGen/CodeGenFunction.h | 3 +
clang/utils/TableGen/NeonEmitter.cpp | 24 +++-
llvm/include/llvm/IR/IntrinsicsAArch64.td | 22 ++++
.../lib/Target/AArch64/AArch64InstrFormats.td | 46 +++++--
llvm/lib/Target/AArch64/AArch64InstrInfo.td | 14 +-
8 files changed, 232 insertions(+), 24 deletions(-)
diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td
index 19cf6f1dbfb692..cf1f8af513bf66 100644
--- a/clang/include/clang/Basic/arm_neon.td
+++ b/clang/include/clang/Basic/arm_neon.td
@@ -2125,6 +2125,29 @@ let ArchGuard = "defined(__aarch64__)", TargetGuard = "lut" in {
}
}
+let ArchGuard = "defined(__aarch64__)", TargetGuard = "fp8,bf16,neon" in {
+ def VBF1CVT_BF16_MF8 : VInst<"vcvt1_bf16_mf8_fpm", "(QB).V", "m">;
+ def VBF1CVT_LOW_BF16_MF8 : VInst<"vcvt1_low_bf16_mf8_fpm", "B.V", "Qm">;
+ def VBF2CVTL_BF16_MF8 : VInst<"vcvt2_bf16_mf8_fpm", "(QB).V", "m">;
+ def VBF2CVTL_LOW_BF16_MF8 : VInst<"vcvt2_low_bf16_mf8_fpm", "B.V", "Qm">;
+ def VBF1CVTL2_HIGH_BF16_MF8 : VInst<"vcvt1_high_bf16_mf8_fpm", "B.V", "Qm">;
+ def VBF2CVTL2_HIGH_BF16_MF8 : VInst<"vcvt2_high_bf16_mf8_fpm", "B.V", "Qm">;
+}
+
+let ArchGuard = "defined(__aarch64__)", TargetGuard = "fp8,neon" in {
+ def VF1CVT_F16_MF8 : VInst<"vcvt1_f16_mf8_fpm", "(>QF).V", "m">;
+ def VF1CVT_LOW_F16_MF8 : VInst<"vcvt1_low_f16_mf8_fpm", "(>F).V", "Qm">;
+ def VF2CVTL_F16_MF8 : VInst<"vcvt2_f16_mf8_fpm", "(>QF).V", "m">;
+ def VF2CVTL_LOW_F16_MF8 : VInst<"vcvt2_low_f16_mf8_fpm", "(>F).V", "Qm">;
+ def VF1CVTL2_HIGH_F16_MF8 : VInst<"vcvt1_high_f16_mf8_fpm", "(>F).V", "Qm">;
+ def VF2CVTL2_HIGH_F16_MF8 : VInst<"vcvt2_high_f16_mf8_fpm", "(>F).V", "Qm">;
+
+ def VCVTN_LOW_F8_F32 : VInst<"vcvt_mf8_f32_fpm", ".(>>QF)(>>QF)V", "m">;
+ def VCVTN_HIGH_F8_F32 : VInst<"vcvt_high_mf8_f32_fpm", ".(q)(>>F)(>>F)V", "Qm">;
+ def VCVTN_F8_F16 : VInst<"vcvt_mf8_f16_fpm", ".(>F)(>F)V", "m">;
+ def VCVTNQ_F8_F16 : VInst<"vcvtq_mf8_f16_fpm", ".(>F)(>F)V", "Qm">;
+}
+
let ArchGuard = "defined(__aarch64__)", TargetGuard = "neon,faminmax" in {
def FAMIN : WInst<"vamin", "...", "fhQdQfQh">;
def FAMAX : WInst<"vamax", "...", "fhQdQfQh">;
@@ -2134,4 +2157,4 @@ let ArchGuard = "defined(__aarch64__)", TargetGuard = "fp8,neon" in {
// fscale
def FSCALE_V128 : WInst<"vscale", "..(.S)", "QdQfQh">;
def FSCALE_V64 : WInst<"vscale", "(.q)(.q)(.qS)", "fh">;
-}
\ No newline at end of file
+}
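
Decoding one of the prototype strings above as a worked example: for
VCVTN_HIGH_F8_F32 the type string is "Qm" (base type mfloat8x16_t) and the
prototype ".(q)(>>F)(>>F)V" reads as: return the base type ("."), then a 64-bit
FP8 vector ("q" forces the 64-bit variant, mfloat8x8_t), two float32x4_t
operands (">>" doubles the 8-bit element width twice and "F" makes it floating
point), and a trailing fpm_t ("V"). That yields
mfloat8x16_t vcvt_high_mf8_f32_fpm(mfloat8x8_t, float32x4_t, float32x4_t, fpm_t).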
diff --git a/clang/include/clang/Basic/arm_neon_incl.td b/clang/include/clang/Basic/arm_neon_incl.td
index fd800e5a6278e4..91a2bf3020b9a3 100644
--- a/clang/include/clang/Basic/arm_neon_incl.td
+++ b/clang/include/clang/Basic/arm_neon_incl.td
@@ -243,6 +243,7 @@ def OP_UNAVAILABLE : Operation {
// B: change to BFloat16
// P: change to polynomial category.
// p: change polynomial to equivalent integer category. Otherwise nop.
+// V: change to fpm_t
//
// >: double element width (vector size unchanged).
// <: half element width (vector size unchanged).
@@ -301,6 +302,7 @@ class Inst <string n, string p, string t, Operation o, list<ImmCheck> ch = []>{
class SInst<string n, string p, string t, list<ImmCheck> ch = []> : Inst<n, p, t, OP_NONE, ch> {}
class IInst<string n, string p, string t, list<ImmCheck> ch = []> : Inst<n, p, t, OP_NONE, ch> {}
class WInst<string n, string p, string t, list<ImmCheck> ch = []> : Inst<n, p, t, OP_NONE, ch> {}
+class VInst<string n, string p, string t> : Inst<n, p, t, OP_NONE> {}
// The following instruction classes are implemented via operators
// instead of builtins. As such these declarations are only used for
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index b3c76986511444..0261ae66dd0581 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -6908,6 +6908,13 @@ Value *CodeGenFunction::EmitNeonCall(Function *F, SmallVectorImpl<Value*> &Ops,
return Builder.CreateCall(F, Ops, name);
}
+Value *CodeGenFunction::EmitFP8NeonCall(Function *F,
+ SmallVectorImpl<Value *> &Ops,
+ Value *FPM, const char *name) {
+ Builder.CreateCall(CGM.getIntrinsic(Intrinsic::aarch64_set_fpmr), FPM);
+ return EmitNeonCall(F, Ops, name);
+}
+
Value *CodeGenFunction::EmitNeonShiftVector(Value *V, llvm::Type *Ty,
bool neg) {
int SV = cast<ConstantInt>(V)->getSExtValue();
@@ -14081,7 +14088,118 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
Int = Intrinsic::aarch64_neon_vluti4q_laneq_x2;
return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vluti4q_laneq_x2");
}
-
+ case NEON::BI__builtin_neon_vcvt1_bf16_mf8_fpm:
+ case NEON::BI__builtin_neon_vcvt1_low_bf16_mf8_fpm:
+ case NEON::BI__builtin_neon_vcvt1_high_bf16_mf8_fpm: {
+ Int = Intrinsic::aarch64_neon_fp8_cvtl1;
+ llvm::Type *Tys[2];
+ Tys[0] = llvm::FixedVectorType::get(BFloatTy, 8);
+ // Op[1] is mfloat8x16_t, but the intrinsic converts only the lower part of
+ // the vector.
+ if (BuiltinID == NEON::BI__builtin_neon_vcvt1_low_bf16_mf8_fpm) {
+ Tys[1] = GetNeonType(this, NeonTypeFlags(Type.getEltType(), false,
+ /*isQuad*/ false));
+ Ops[0] = Builder.CreateExtractVector(Tys[1], Ops[0], Builder.getInt64(0));
+ } else
+ Tys[1] = Ops[0]->getType();
+ llvm::Value *FPM =
+ EmitScalarOrConstFoldImmArg(ICEArguments, E->getNumArgs() - 1, E);
+ return EmitFP8NeonCall(CGM.getIntrinsic(Int, Tys), Ops, FPM, "vbfcvt1");
+ }
+ case NEON::BI__builtin_neon_vcvt2_bf16_mf8_fpm:
+ case NEON::BI__builtin_neon_vcvt2_low_bf16_mf8_fpm:
+ case NEON::BI__builtin_neon_vcvt2_high_bf16_mf8_fpm: {
+ Int = Intrinsic::aarch64_neon_fp8_cvtl2;
+ llvm::Type *Tys[2];
+ Tys[0] = llvm::FixedVectorType::get(BFloatTy, 8);
+ // Op[1] is mfloat8x16_t, but the intrinsic converts only the lower
+ // part of the vector.
+ if (BuiltinID == NEON::BI__builtin_neon_vcvt2_low_bf16_mf8_fpm) {
+ Tys[1] = GetNeonType(this, NeonTypeFlags(Type.getEltType(), false,
+ /*isQuad*/ false));
+ Ops[0] = Builder.CreateExtractVector(Tys[1], Ops[0], Builder.getInt64(0));
+ } else
+ Tys[1] = Ops[0]->getType();
+ llvm::Value *FPM =
+ EmitScalarOrConstFoldImmArg(ICEArguments, E->getNumArgs() - 1, E);
+ return EmitFP8NeonCall(CGM.getIntrinsic(Int, Tys), Ops, FPM, "vbfcvt2");
+ }
+ case NEON::BI__builtin_neon_vcvt1_f16_mf8_fpm:
+ case NEON::BI__builtin_neon_vcvt1_low_f16_mf8_fpm:
+ case NEON::BI__builtin_neon_vcvt1_high_f16_mf8_fpm: {
+ Int = Intrinsic::aarch64_neon_fp8_cvtl1;
+ llvm::Type *Tys[2];
+ Tys[0] = llvm::FixedVectorType::get(HalfTy, 8);
+ // Op[1] is mfloat8x16_t, but the intrinsic converts only the lower
+ // part of the vector.
+ if (BuiltinID == NEON::BI__builtin_neon_vcvt1_low_f16_mf8_fpm) {
+ Tys[1] = GetNeonType(this, NeonTypeFlags(Type.getEltType(), false,
+ /*isQuad*/ false));
+ Ops[0] = Builder.CreateExtractVector(Tys[1], Ops[0], Builder.getInt64(0));
+ } else
+ Tys[1] = Ops[0]->getType();
+ llvm::Value *FPM =
+ EmitScalarOrConstFoldImmArg(ICEArguments, E->getNumArgs() - 1, E);
+ return EmitFP8NeonCall(CGM.getIntrinsic(Int, Tys), Ops, FPM, "vbfcvt1");
+ }
+ case NEON::BI__builtin_neon_vcvt2_f16_mf8_fpm:
+ case NEON::BI__builtin_neon_vcvt2_low_f16_mf8_fpm:
+ case NEON::BI__builtin_neon_vcvt2_high_f16_mf8_fpm: {
+ Int = Intrinsic::aarch64_neon_fp8_cvtl2;
+ llvm::Type *Tys[2];
+ Tys[0] = llvm::FixedVectorType::get(HalfTy, 8);
+ // Op[1] is mfloat8x16_t, but the intrinsic converts only the lower
+ // part of the vector.
+ if (BuiltinID == NEON::BI__builtin_neon_vcvt2_low_f16_mf8_fpm) {
+ Tys[1] = GetNeonType(this, NeonTypeFlags(Type.getEltType(), false,
+ /*isQuad*/ false));
+ Ops[0] = Builder.CreateExtractVector(Tys[1], Ops[0], Builder.getInt64(0));
+ } else
+ Tys[1] = Ops[0]->getType();
+ llvm::Value *FPM =
+ EmitScalarOrConstFoldImmArg(ICEArguments, E->getNumArgs() - 1, E);
+ return EmitFP8NeonCall(CGM.getIntrinsic(Int, Tys), Ops, FPM, "vbfcvt2");
+ }
+ case NEON::BI__builtin_neon_vcvt_mf8_f32_fpm: {
+ Int = Intrinsic::aarch64_neon_fp8_fcvtn;
+ llvm::Type *Tys[2];
+ Tys[0] = llvm::FixedVectorType::get(Int8Ty, 8);
+ Tys[1] = Ops[0]->getType();
+ llvm::Value *FPM =
+ EmitScalarOrConstFoldImmArg(ICEArguments, E->getNumArgs() - 1, E);
+ return EmitFP8NeonCall(CGM.getIntrinsic(Int, Tys), Ops, FPM, "vfcvtn");
+ }
+ case NEON::BI__builtin_neon_vcvt_mf8_f16_fpm: {
+ Int = Intrinsic::aarch64_neon_fp8_fcvtn;
+ llvm::Type *Tys[2];
+ Tys[0] = llvm::FixedVectorType::get(Int8Ty, 8);
+ // Use the expected half type directly, because arm_neon.h casts float16x4_t to int8x8_t.
+ Tys[1] = llvm::FixedVectorType::get(HalfTy, 4);
+ llvm::Value *FPM =
+ EmitScalarOrConstFoldImmArg(ICEArguments, E->getNumArgs() - 1, E);
+ return EmitFP8NeonCall(CGM.getIntrinsic(Int, Tys), Ops, FPM, "vfcvtn");
+ }
+ case NEON::BI__builtin_neon_vcvtq_mf8_f16_fpm: {
+ Int = Intrinsic::aarch64_neon_fp8_fcvtn;
+ llvm::Type *Tys[2];
+ Tys[0] = llvm::FixedVectorType::get(Int8Ty, 16);
+ // Use the expected half type directly, because arm_neon.h casts float16x8_t to int8x16_t.
+ Tys[1] = llvm::FixedVectorType::get(HalfTy, 8);
+ llvm::Value *FPM =
+ EmitScalarOrConstFoldImmArg(ICEArguments, E->getNumArgs() - 1, E);
+ return EmitFP8NeonCall(CGM.getIntrinsic(Int, Tys), Ops, FPM, "vfcvtn");
+ }
+ case NEON::BI__builtin_neon_vcvt_high_mf8_f32_fpm: {
+ Int = Intrinsic::aarch64_neon_fp8_fcvtn2;
+ llvm::Type *Tys[2];
+ Tys[0] = llvm::FixedVectorType::get(Int8Ty, 16);
+ Tys[1] = Ops[1]->getType();
+ Ops[0] = Builder.CreateInsertVector(Tys[0], PoisonValue::get(Tys[0]),
+ Ops[0], Builder.getInt64(0));
+ llvm::Value *FPM =
+ EmitScalarOrConstFoldImmArg(ICEArguments, E->getNumArgs() - 1, E);
+ return EmitFP8NeonCall(CGM.getIntrinsic(Int, Tys), Ops, FPM, "vfcvtn2");
+ }
case NEON::BI__builtin_neon_vamin_f16:
case NEON::BI__builtin_neon_vaminq_f16:
case NEON::BI__builtin_neon_vamin_f32:
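
EmitFP8NeonCall above makes the FPMR dependence explicit in IR: every FP8
builtin expands to a write of FPMR followed by the conversion intrinsic.
Roughly (a sketch of the IR shape, not verbatim test output):

  call void @llvm.aarch64.set.fpmr(i64 %fpm)
  %r = call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl1.v8bf16.v8i8(<8 x i8> %v)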
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index d1bec166a435e2..df4b25e015efd4 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -4663,6 +4663,9 @@ class CodeGenFunction : public CodeGenTypeCache {
SmallVectorImpl<llvm::Value*> &O,
const char *name,
unsigned shift = 0, bool rightshift = false);
+ llvm::Value *EmitFP8NeonCall(llvm::Function *F,
+ SmallVectorImpl<llvm::Value *> &O,
+ llvm::Value *FPM, const char *name);
llvm::Value *EmitNeonSplat(llvm::Value *V, llvm::Constant *Idx,
const llvm::ElementCount &Count);
llvm::Value *EmitNeonSplat(llvm::Value *V, llvm::Constant *Idx);
diff --git a/clang/utils/TableGen/NeonEmitter.cpp b/clang/utils/TableGen/NeonEmitter.cpp
index b786cbecaf1327..fd971a4a532a8d 100644
--- a/clang/utils/TableGen/NeonEmitter.cpp
+++ b/clang/utils/TableGen/NeonEmitter.cpp
@@ -74,6 +74,7 @@ enum ClassKind {
ClassI, // generic integer instruction, e.g., "i8" suffix
ClassS, // signed/unsigned/poly, e.g., "s8", "u8" or "p8" suffix
ClassW, // width-specific instruction, e.g., "8" suffix
+ ClassV, // void-suffix instruction, no suffix
ClassB, // bitcast arguments with enum argument to specify type
ClassL, // Logical instructions which are op instructions
// but we need to not emit any suffix for in our
@@ -144,7 +145,7 @@ class Type {
private:
TypeSpec TS;
- enum TypeKind { Void, Float, SInt, UInt, Poly, BFloat16, MFloat8 };
+ enum TypeKind { Void, Float, SInt, UInt, Poly, BFloat16, MFloat8, FPM };
TypeKind Kind;
bool Immediate, Constant, Pointer;
// ScalarForMangling and NoManglingQ are really not suited to live here as
@@ -198,6 +199,7 @@ class Type {
bool isVoid() const { return Kind == Void; }
bool isBFloat16() const { return Kind == BFloat16; }
bool isMFloat8() const { return Kind == MFloat8; }
+ bool isFPM() const { return Kind == FPM; }
unsigned getNumElements() const { return Bitwidth / ElementBitwidth; }
unsigned getSizeInBits() const { return Bitwidth; }
unsigned getElementSizeInBits() const { return ElementBitwidth; }
@@ -600,6 +602,7 @@ class NeonEmitter {
const Record *SI = R.getClass("SInst");
const Record *II = R.getClass("IInst");
const Record *WI = R.getClass("WInst");
+ const Record *VI = R.getClass("VInst");
const Record *SOpI = R.getClass("SOpInst");
const Record *IOpI = R.getClass("IOpInst");
const Record *WOpI = R.getClass("WOpInst");
@@ -609,6 +612,7 @@ class NeonEmitter {
ClassMap[SI] = ClassS;
ClassMap[II] = ClassI;
ClassMap[WI] = ClassW;
+ ClassMap[VI] = ClassV;
ClassMap[SOpI] = ClassS;
ClassMap[IOpI] = ClassI;
ClassMap[WOpI] = ClassW;
@@ -641,6 +645,9 @@ class NeonEmitter {
std::string Type::str() const {
if (isVoid())
return "void";
+ if (isFPM())
+ return "fpm_t";
+
std::string S;
if (isInteger() && !isSigned())
@@ -699,6 +706,8 @@ std::string Type::builtin_str() const {
} else if (isMFloat8()) {
assert(ElementBitwidth == 8 && "MFloat8 can only be 8 bits");
S += "m";
+ } else if (isFPM()) {
+ S += "UWi";
} else
switch (ElementBitwidth) {
case 16: S += "h"; break;
@@ -888,6 +897,7 @@ void Type::applyTypespec(bool &Quad) {
case 'm':
Kind = MFloat8;
ElementBitwidth = 8;
+ NoManglingQ = true;
break;
default:
llvm_unreachable("Unhandled type code!");
@@ -925,6 +935,13 @@ void Type::applyModifiers(StringRef Mods) {
case 'P':
Kind = Poly;
break;
+ case 'V':
+ Kind = FPM;
+ Bitwidth = ElementBitwidth = 64;
+ NumVectors = 0;
+ Immediate = Constant = Pointer = false;
+ ScalarForMangling = NoManglingQ = true;
+ break;
case '>':
assert(ElementBitwidth < 128);
ElementBitwidth *= 2;
@@ -1000,6 +1017,9 @@ std::string Intrinsic::getInstTypeCode(Type T, ClassKind CK) const {
if (CK == ClassB && TargetGuard == "neon")
return "";
+ if (this->CK == ClassV)
+ return "";
+
if (T.isBFloat16())
return "bf16";
@@ -1349,7 +1369,7 @@ void Intrinsic::emitBodyAsBuiltinCall() {
if (!protoHasScalar())
LocalCK = ClassB;
- if (!getReturnType().isVoid() && !SRet)
+ if (!getReturnType().isVoid() && !SRet && !getReturnType().isMFloat8())
S += "(" + RetVar.getType().str() + ") ";
S += "__builtin_neon_" + mangleName(std::string(N), LocalCK) + "(";
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 53a66099a92bda..c135a95e938a34 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -1004,6 +1004,28 @@ def int_aarch64_st64b: Intrinsic<[], !listconcat([llvm_ptr_ty], data512)>;
def int_aarch64_st64bv: Intrinsic<[llvm_i64_ty], !listconcat([llvm_ptr_ty], data512)>;
def int_aarch64_st64bv0: Intrinsic<[llvm_i64_ty], !listconcat([llvm_ptr_ty], data512)>;
+ //
+ // Neon FP8 intrinsics
+ //
+
+ // Conversions
+ class AdvSIMD_FP8_1VectorArg_Long_Intrinsic
+ : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty], [IntrReadMem, IntrInaccessibleMemOnly]>;
+
+ def int_aarch64_neon_fp8_cvtl1 : AdvSIMD_FP8_1VectorArg_Long_Intrinsic;
+ def int_aarch64_neon_fp8_cvtl2 : AdvSIMD_FP8_1VectorArg_Long_Intrinsic;
+
+ def int_aarch64_neon_fp8_fcvtn
+ : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+ [llvm_anyvector_ty,
+ LLVMMatchType<1>],
+ [IntrReadMem, IntrInaccessibleMemOnly]>;
+ def int_aarch64_neon_fp8_fcvtn2
+ : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+ [LLVMMatchType<0>,
+ llvm_anyvector_ty,
+ LLVMMatchType<1>],
+ [IntrReadMem, IntrInaccessibleMemOnly]>;
}
def llvm_nxv1i1_ty : LLVMType<nxv1i1>;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index 47c4c6c39565f4..f65b828a91aac2 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -6551,17 +6551,30 @@ class BaseSIMDThreeVectors<bit Q, bit U, bits<2> size, bits<4> op,
// FCVTN (FP16 to FP8)
-multiclass SIMDThreeSameSizeVectorCvt<string asm> {
- def v8f8 : BaseSIMDThreeVectors<0b0, 0b0, 0b01, 0b1110, V64, V64, asm, ".8b",".4h">;
- def v16f8 : BaseSIMDThreeVectors<0b1, 0b0, 0b01, 0b1110, V128, V128, asm, ".16b", ".8h">;
+multiclass SIMD_FP8_CVTN_F16<string asm, SDPatternOperator Op> {
+ let Uses = [FPMR, FPCR], mayLoad = 1 in {
+ def v8f8 : BaseSIMDThreeVectors<0b0, 0b0, 0b01, 0b1110, V64, V64, asm, ".8b",".4h">;
+ def v16f8 : BaseSIMDThreeVectors<0b1, 0b0, 0b01, 0b1110, V128, V128, asm, ".16b", ".8h">;
+ }
+ def : Pat<(v8i8 (Op (v4f16 V64:$Rn), (v4f16 V64:$Rm))),
+ (!cast<Instruction>(NAME # v8f8) V64:$Rn, V64:$Rm)>;
+ def : Pat<(v16i8 (Op (v8f16 V128:$Rn), (v8f16 V128:$Rm))),
+ (!cast<Instruction>(NAME # v16f8) V128:$Rn, V128:$Rm)>;
}
-// TODO : Create v16f8 value type
// FCVTN, FCVTN2 (FP32 to FP8)
-multiclass SIMDThreeVectorCvt<string asm> {
- def v8f8 : BaseSIMDThreeVectors<0b0, 0b0, 0b00, 0b1110, V64, V128, asm, ".8b", ".4s">;
- def 2v16f8 : BaseSIMDThreeSameVectorDot<0b1, 0b0, 0b00, 0b1110, asm#2, ".16b", ".4s",
- V128, v16i8, v4f32, null_frag>;
+multiclass SIMD_FP8_CVTN_F32<string asm, SDPatternOperator Op> {
+ let Uses = [FPMR, FPCR], mayLoad = 1 in {
+ def v8f8 : BaseSIMDThreeVectors<0b0, 0b0, 0b00, 0b1110, V64, V128, asm, ".8b", ".4s">;
+ def 2v16f8 : BaseSIMDThreeSameVectorDot<0b1, 0b0, 0b00, 0b1110, asm#2, ".16b", ".4s",
+ V128, v16i8, v4f32, null_frag>;
+ }
+
+ def : Pat<(v8i8 (Op (v4f32 V128:$Rn), (v4f32 V128:$Rm))),
+ (!cast<Instruction>(NAME # v8f8) V128:$Rn, V128:$Rm)>;
+
+ def : Pat<(v16i8 (!cast<SDPatternOperator>(Op # 2) (v16i8 V128:$_Rd), (v4f32 V128:$Rn), (v4f32 V128:$Rm))),
+ (!cast<Instruction>(NAME # 2v16f8) V128:$_Rd, V128:$Rn, V128:$Rm)>;
}
// TODO: Create a new Value Type v8f8 and v16f8
@@ -7025,11 +7038,18 @@ multiclass SIMDMixedTwoVector<bit U, bits<5> opc, string asm,
//----------------------------------------------------------------------------
// FP8 Advanced SIMD two-register miscellaneous
//----------------------------------------------------------------------------
-multiclass SIMDMixedTwoVectorFP8<bits<2>sz, string asm> {
- def v8f16 : BaseSIMDMixedTwoVector<0b0, 0b1, sz, 0b10111, V64, V128,
- asm, ".8h", ".8b", []>;
- def 2v8f16 : BaseSIMDMixedTwoVector<0b1, 0b1, sz, 0b10111, V128, V128,
- asm#2, ".8h", ".16b", []>;
+multiclass SIMD_FP8_CVTL<bits<2>sz, string asm, ValueType dty, SDPatternOperator Op> {
+ let Uses=[FPMR, FPCR], mayLoad = 1 in {
+ def NAME : BaseSIMDMixedTwoVector<0b0, 0b1, sz, 0b10111, V64, V128,
+ asm, ".8h", ".8b", []>;
+ def NAME#2 : BaseSIMDMixedTwoVector<0b1, 0b1, sz, 0b10111, V128, V128,
+ asm#2, ".8h", ".16b", []>;
+ }
+ def : Pat<(dty (Op (v8i8 V64:$Rn))),
+ (!cast<Instruction>(NAME) V64:$Rn)>;
+
+ def : Pat<(dty (Op (v16i8 V128:$Rn))),
+ (!cast<Instruction>(NAME#2) V128:$Rn)>;
}
class BaseSIMDCmpTwoVector<bit Q, bit U, bits<2> size, bits<2> size2,
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index ec891ea4bac85e..3050348c484188 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -10319,13 +10319,13 @@ let Predicates = [HasD128] in {
// 2023 Architecture Extensions:
//===----------------------------===//
-let Uses = [FPMR, FPCR], Predicates = [HasFP8] in {
- defm F1CVTL : SIMDMixedTwoVectorFP8<0b00, "f1cvtl">;
- defm F2CVTL : SIMDMixedTwoVectorFP8<0b01, "f2cvtl">;
- defm BF1CVTL : SIMDMixedTwoVectorFP8<0b10, "bf1cvtl">;
- defm BF2CVTL : SIMDMixedTwoVectorFP8<0b11, "bf2cvtl">;
- defm FCVTN_F16_F8 : SIMDThreeSameSizeVectorCvt<"fcvtn">;
- defm FCVTN_F32_F8 : SIMDThreeVectorCvt<"fcvtn">;
+let Predicates = [HasFP8] in {
+ defm F1CVTL : SIMD_FP8_CVTL<0b00, "f1cvtl", v8f16, int_aarch64_neon_fp8_cvtl1>;
+ defm F2CVTL : SIMD_FP8_CVTL<0b01, "f2cvtl", v8f16, int_aarch64_neon_fp8_cvtl2>;
+ defm BF1CVTL : SIMD_FP8_CVTL<0b10, "bf1cvtl", v8bf16, int_aarch64_neon_fp8_cvtl1>;
+ defm BF2CVTL : SIMD_FP8_CVTL<0b11, "bf2cvtl", v8bf16, int_aarch64_neon_fp8_cvtl2>;
+ defm FCVTN_F16 : SIMD_FP8_CVTN_F16<"fcvtn", int_aarch64_neon_fp8_fcvtn>;
+ defm FCVTN_F32 : SIMD_FP8_CVTN_F32<"fcvtn", int_aarch64_neon_fp8_fcvtn>;
defm FSCALE : SIMDThreeVectorFscale<0b1, 0b1, 0b111, "fscale", int_aarch64_neon_fp8_fscale>;
} // End let Predicates = [HasFP8]
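
For orientation, a minimal sketch (in C; the function names here are invented for illustration) of how the user-facing intrinsics wired up above are invoked. The signatures match the arm_neon.h declarations added by this series, and the fpm_t operand carries the FP8 format configuration that the builtins write to FPMR before converting:

#include <arm_neon.h>

// Widen the low half of an FP8 vector to float16 (selects f1cvtl),
// then narrow two float32 vectors into one FP8 vector (selects fcvtn).
float16x8_t widen_low(mfloat8x16_t v, fpm_t fpm) {
  return vcvt1_low_f16_mf8_fpm(v, fpm);
}

mfloat8x8_t narrow_f32(float32x4_t lo, float32x4_t hi, fpm_t fpm) {
  return vcvt_mf8_f32_fpm(lo, hi, fpm);
}
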
>From af18c3f4b4d901ee5290862d2c74cecdb07f7883 Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov at arm.com>
Date: Tue, 10 Dec 2024 18:14:02 +0000
Subject: [PATCH 08/11] [fixup] Add tests, fix calling the wrong LLVM intrinsic
---
clang/lib/CodeGen/CGBuiltin.cpp | 4 +-
.../fp8-intrinsics/acle_neon_fp8_cvt.c | 308 ++++++++++++++++++
llvm/test/CodeGen/AArch64/neon-fp8-cvt.ll | 112 +++++++
3 files changed, 422 insertions(+), 2 deletions(-)
create mode 100644 clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_cvt.c
create mode 100644 llvm/test/CodeGen/AArch64/neon-fp8-cvt.ll
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 0261ae66dd0581..c7de2026abb44c 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -14132,7 +14132,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
Tys[0] = llvm::FixedVectorType::get(HalfTy, 8);
// Op[1] is mfloat8x16_t, but the intrinsic converts only the lower
// part of the vector.
- if (BuiltinID == NEON::BI__builtin_neon_vcvt1_low_bf16_mf8_fpm) {
+ if (BuiltinID == NEON::BI__builtin_neon_vcvt1_low_f16_mf8_fpm) {
Tys[1] = GetNeonType(this, NeonTypeFlags(Type.getEltType(), false,
/*isQuad*/ false));
Ops[0] = Builder.CreateExtractVector(Tys[1], Ops[0], Builder.getInt64(0));
@@ -14150,7 +14150,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
Tys[0] = llvm::FixedVectorType::get(HalfTy, 8);
// Op[1] is mfloat8x16_t, but the intrinsic converts only the lower
// part of the vector.
- if (BuiltinID == NEON::BI__builtin_neon_vcvt2_low_bf16_mf8_fpm) {
+ if (BuiltinID == NEON::BI__builtin_neon_vcvt2_low_f16_mf8_fpm) {
Tys[1] = GetNeonType(this, NeonTypeFlags(Type.getEltType(), false,
/*isQuad*/ false));
Ops[0] = Builder.CreateExtractVector(Tys[1], Ops[0], Builder.getInt64(0));
diff --git a/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_cvt.c b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_cvt.c
new file mode 100644
index 00000000000000..7543938f487103
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_cvt.c
@@ -0,0 +1,308 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +bf16 -target-feature +fp8 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -x c++ -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +bf16 -target-feature +fp8 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix CHECK-CXX
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +bf16 -target-feature +fp8 -S -O3 -o /dev/null %s
+
+// REQUIRES: aarch64-registered-target
+
+#include <arm_neon.h>
+
+// CHECK-LABEL: define dso_local <8 x bfloat> @test_vcvt1_bf16_mf8_fpm(
+// CHECK-SAME: <8 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: [[VBFCVT1_I:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl1.v8bf16.v8i8(<8 x i8> [[OP]])
+// CHECK-NEXT: ret <8 x bfloat> [[VBFCVT1_I]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <8 x bfloat> @_Z23test_vcvt1_bf16_mf8_fpmu13__MFloat8x8_tm(
+// CHECK-CXX-SAME: <8 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT: [[VBFCVT1_I:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl1.v8bf16.v8i8(<8 x i8> [[OP]])
+// CHECK-CXX-NEXT: ret <8 x bfloat> [[VBFCVT1_I]]
+//
+bfloat16x8_t test_vcvt1_bf16_mf8_fpm(mfloat8x8_t op, fpm_t fpm) {
+ return vcvt1_bf16_mf8_fpm(op, fpm);
+}
+
+// CHECK-LABEL: define dso_local <8 x bfloat> @test_vcvt1_low_bf16_mf8_fpm(
+// CHECK-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <16 x i8> [[OP]], <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: [[VBFCVT1_I:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl1.v8bf16.v8i8(<8 x i8> [[TMP0]])
+// CHECK-NEXT: ret <8 x bfloat> [[VBFCVT1_I]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <8 x bfloat> @_Z27test_vcvt1_low_bf16_mf8_fpmu14__MFloat8x16_tm(
+// CHECK-CXX-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = shufflevector <16 x i8> [[OP]], <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT: [[VBFCVT1_I:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl1.v8bf16.v8i8(<8 x i8> [[TMP0]])
+// CHECK-CXX-NEXT: ret <8 x bfloat> [[VBFCVT1_I]]
+//
+bfloat16x8_t test_vcvt1_low_bf16_mf8_fpm(mfloat8x16_t op, fpm_t fpm) {
+ return vcvt1_low_bf16_mf8_fpm(op, fpm);
+}
+
+// CHECK-LABEL: define dso_local <8 x bfloat> @test_vcvt2_bf16_mf8_fpm(
+// CHECK-SAME: <8 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: [[VBFCVT2_I:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl2.v8bf16.v8i8(<8 x i8> [[OP]])
+// CHECK-NEXT: ret <8 x bfloat> [[VBFCVT2_I]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <8 x bfloat> @_Z23test_vcvt2_bf16_mf8_fpmu13__MFloat8x8_tm(
+// CHECK-CXX-SAME: <8 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT: [[VBFCVT2_I:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl2.v8bf16.v8i8(<8 x i8> [[OP]])
+// CHECK-CXX-NEXT: ret <8 x bfloat> [[VBFCVT2_I]]
+//
+bfloat16x8_t test_vcvt2_bf16_mf8_fpm(mfloat8x8_t op, fpm_t fpm) {
+ return vcvt2_bf16_mf8_fpm(op, fpm);
+}
+
+// CHECK-LABEL: define dso_local <8 x bfloat> @test_vcvt2_low_bf16_mf8_fpm(
+// CHECK-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <16 x i8> [[OP]], <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: [[VBFCVT2_I:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl2.v8bf16.v8i8(<8 x i8> [[TMP0]])
+// CHECK-NEXT: ret <8 x bfloat> [[VBFCVT2_I]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <8 x bfloat> @_Z27test_vcvt2_low_bf16_mf8_fpmu14__MFloat8x16_tm(
+// CHECK-CXX-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = shufflevector <16 x i8> [[OP]], <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT: [[VBFCVT2_I:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl2.v8bf16.v8i8(<8 x i8> [[TMP0]])
+// CHECK-CXX-NEXT: ret <8 x bfloat> [[VBFCVT2_I]]
+//
+bfloat16x8_t test_vcvt2_low_bf16_mf8_fpm(mfloat8x16_t op, fpm_t fpm) {
+ return vcvt2_low_bf16_mf8_fpm(op, fpm);
+}
+
+// CHECK-LABEL: define dso_local <8 x bfloat> @test_vcvt1_high_bf16_mf8_fpm(
+// CHECK-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: [[VBFCVT1_I:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl1.v8bf16.v16i8(<16 x i8> [[OP]])
+// CHECK-NEXT: ret <8 x bfloat> [[VBFCVT1_I]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <8 x bfloat> @_Z28test_vcvt1_high_bf16_mf8_fpmu14__MFloat8x16_tm(
+// CHECK-CXX-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT: [[VBFCVT1_I:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl1.v8bf16.v16i8(<16 x i8> [[OP]])
+// CHECK-CXX-NEXT: ret <8 x bfloat> [[VBFCVT1_I]]
+//
+bfloat16x8_t test_vcvt1_high_bf16_mf8_fpm(mfloat8x16_t op, fpm_t fpm) {
+ return vcvt1_high_bf16_mf8_fpm(op, fpm);
+}
+
+// CHECK-LABEL: define dso_local <8 x bfloat> @test_vcvt2_high_bf16_mf8_fpm(
+// CHECK-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: [[VBFCVT2_I:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl2.v8bf16.v16i8(<16 x i8> [[OP]])
+// CHECK-NEXT: ret <8 x bfloat> [[VBFCVT2_I]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <8 x bfloat> @_Z28test_vcvt2_high_bf16_mf8_fpmu14__MFloat8x16_tm(
+// CHECK-CXX-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT: [[VBFCVT2_I:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl2.v8bf16.v16i8(<16 x i8> [[OP]])
+// CHECK-CXX-NEXT: ret <8 x bfloat> [[VBFCVT2_I]]
+//
+bfloat16x8_t test_vcvt2_high_bf16_mf8_fpm(mfloat8x16_t op, fpm_t fpm) {
+ return vcvt2_high_bf16_mf8_fpm(op, fpm);
+}
+
+// CHECK-LABEL: define dso_local <8 x half> @test_vcvt1_f16_mf8_fpm(
+// CHECK-SAME: <8 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: [[VBFCVT1_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.cvtl1.v8f16.v8i8(<8 x i8> [[OP]])
+// CHECK-NEXT: ret <8 x half> [[VBFCVT1_I]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <8 x half> @_Z22test_vcvt1_f16_mf8_fpmu13__MFloat8x8_tm(
+// CHECK-CXX-SAME: <8 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT: [[VBFCVT1_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.cvtl1.v8f16.v8i8(<8 x i8> [[OP]])
+// CHECK-CXX-NEXT: ret <8 x half> [[VBFCVT1_I]]
+//
+float16x8_t test_vcvt1_f16_mf8_fpm(mfloat8x8_t op, fpm_t fpm) {
+ return vcvt1_f16_mf8_fpm(op, fpm);
+}
+
+// CHECK-LABEL: define dso_local <8 x half> @test_vcvt1_low_f16_mf8_fpm(
+// CHECK-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <16 x i8> [[OP]], <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: [[VBFCVT1_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.cvtl1.v8f16.v8i8(<8 x i8> [[TMP0]])
+// CHECK-NEXT: ret <8 x half> [[VBFCVT1_I]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <8 x half> @_Z26test_vcvt1_low_f16_mf8_fpmu14__MFloat8x16_tm(
+// CHECK-CXX-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = shufflevector <16 x i8> [[OP]], <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT: [[VBFCVT1_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.cvtl1.v8f16.v8i8(<8 x i8> [[TMP0]])
+// CHECK-CXX-NEXT: ret <8 x half> [[VBFCVT1_I]]
+//
+float16x8_t test_vcvt1_low_f16_mf8_fpm(mfloat8x16_t op, fpm_t fpm) {
+ return vcvt1_low_f16_mf8_fpm(op, fpm);
+}
+
+// CHECK-LABEL: define dso_local <8 x half> @test_vcvt2_f16_mf8_fpm(
+// CHECK-SAME: <8 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: [[VBFCVT2_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.cvtl2.v8f16.v8i8(<8 x i8> [[OP]])
+// CHECK-NEXT: ret <8 x half> [[VBFCVT2_I]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <8 x half> @_Z22test_vcvt2_f16_mf8_fpmu13__MFloat8x8_tm(
+// CHECK-CXX-SAME: <8 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT: [[VBFCVT2_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.cvtl2.v8f16.v8i8(<8 x i8> [[OP]])
+// CHECK-CXX-NEXT: ret <8 x half> [[VBFCVT2_I]]
+//
+float16x8_t test_vcvt2_f16_mf8_fpm(mfloat8x8_t op, fpm_t fpm) {
+ return vcvt2_f16_mf8_fpm(op, fpm);
+}
+
+// CHECK-LABEL: define dso_local <8 x half> @test_vcvt2_low_f16_mf8_fpm(
+// CHECK-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <16 x i8> [[OP]], <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: [[VBFCVT2_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.cvtl2.v8f16.v8i8(<8 x i8> [[TMP0]])
+// CHECK-NEXT: ret <8 x half> [[VBFCVT2_I]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <8 x half> @_Z26test_vcvt2_low_f16_mf8_fpmu14__MFloat8x16_tm(
+// CHECK-CXX-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = shufflevector <16 x i8> [[OP]], <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT: [[VBFCVT2_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.cvtl2.v8f16.v8i8(<8 x i8> [[TMP0]])
+// CHECK-CXX-NEXT: ret <8 x half> [[VBFCVT2_I]]
+//
+float16x8_t test_vcvt2_low_f16_mf8_fpm(mfloat8x16_t op, fpm_t fpm) {
+ return vcvt2_low_f16_mf8_fpm(op, fpm);
+}
+
+// CHECK-LABEL: define dso_local <8 x half> @test_vcvt1_high_f16_mf8_fpm(
+// CHECK-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: [[VBFCVT1_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.cvtl1.v8f16.v16i8(<16 x i8> [[OP]])
+// CHECK-NEXT: ret <8 x half> [[VBFCVT1_I]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <8 x half> @_Z27test_vcvt1_high_f16_mf8_fpmu14__MFloat8x16_tm(
+// CHECK-CXX-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT: [[VBFCVT1_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.cvtl1.v8f16.v16i8(<16 x i8> [[OP]])
+// CHECK-CXX-NEXT: ret <8 x half> [[VBFCVT1_I]]
+//
+float16x8_t test_vcvt1_high_f16_mf8_fpm(mfloat8x16_t op, fpm_t fpm) {
+ return vcvt1_high_f16_mf8_fpm(op, fpm);
+}
+
+// CHECK-LABEL: define dso_local <8 x half> @test_vcvt2_high_f16_mf8_fpm(
+// CHECK-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: [[VBFCVT2_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.cvtl2.v8f16.v16i8(<16 x i8> [[OP]])
+// CHECK-NEXT: ret <8 x half> [[VBFCVT2_I]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <8 x half> @_Z27test_vcvt2_high_f16_mf8_fpmu14__MFloat8x16_tm(
+// CHECK-CXX-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT: [[VBFCVT2_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.cvtl2.v8f16.v16i8(<16 x i8> [[OP]])
+// CHECK-CXX-NEXT: ret <8 x half> [[VBFCVT2_I]]
+//
+float16x8_t test_vcvt2_high_f16_mf8_fpm(mfloat8x16_t op, fpm_t fpm) {
+ return vcvt2_high_f16_mf8_fpm(op, fpm);
+}
+
+// CHECK-LABEL: define dso_local <8 x i8> @test_vcvt_mf8_f32_fpm(
+// CHECK-SAME: <4 x float> noundef [[VN:%.*]], <4 x float> noundef [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: [[VFCVTN_I:%.*]] = tail call <8 x i8> @llvm.aarch64.neon.fp8.fcvtn.v8i8.v4f32(<4 x float> [[VN]], <4 x float> [[VM]])
+// CHECK-NEXT: ret <8 x i8> [[VFCVTN_I]]
+//
+// CHECK-CXX-LABEL: define dso_local <8 x i8> @_Z21test_vcvt_mf8_f32_fpm13__Float32x4_tS_m(
+// CHECK-CXX-SAME: <4 x float> noundef [[VN:%.*]], <4 x float> noundef [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT: [[VFCVTN_I:%.*]] = tail call <8 x i8> @llvm.aarch64.neon.fp8.fcvtn.v8i8.v4f32(<4 x float> [[VN]], <4 x float> [[VM]])
+// CHECK-CXX-NEXT: ret <8 x i8> [[VFCVTN_I]]
+//
+mfloat8x8_t test_vcvt_mf8_f32_fpm(float32x4_t vn, float32x4_t vm, fpm_t fpm) {
+ return vcvt_mf8_f32_fpm(vn, vm, fpm);
+}
+
+// CHECK-LABEL: define dso_local <16 x i8> @test_vcvt_high_mf8_f32_fpm(
+// CHECK-SAME: <8 x i8> [[VD:%.*]], <4 x float> noundef [[VN:%.*]], <4 x float> noundef [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <8 x i8> [[VD]], <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: [[VFCVTN2_I:%.*]] = tail call <16 x i8> @llvm.aarch64.neon.fp8.fcvtn2.v16i8.v4f32(<16 x i8> [[TMP0]], <4 x float> [[VN]], <4 x float> [[VM]])
+// CHECK-NEXT: ret <16 x i8> [[VFCVTN2_I]]
+//
+// CHECK-CXX-LABEL: define dso_local <16 x i8> @_Z26test_vcvt_high_mf8_f32_fpmu13__MFloat8x8_t13__Float32x4_tS_m(
+// CHECK-CXX-SAME: <8 x i8> [[VD:%.*]], <4 x float> noundef [[VN:%.*]], <4 x float> noundef [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = shufflevector <8 x i8> [[VD]], <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT: [[VFCVTN2_I:%.*]] = tail call <16 x i8> @llvm.aarch64.neon.fp8.fcvtn2.v16i8.v4f32(<16 x i8> [[TMP0]], <4 x float> [[VN]], <4 x float> [[VM]])
+// CHECK-CXX-NEXT: ret <16 x i8> [[VFCVTN2_I]]
+//
+mfloat8x16_t test_vcvt_high_mf8_f32_fpm(mfloat8x8_t vd, float32x4_t vn,
+ float32x4_t vm, fpm_t fpm) {
+ return vcvt_high_mf8_f32_fpm(vd, vn, vm, fpm);
+}
+
+// CHECK-LABEL: define dso_local <8 x i8> @test_vcvt_mf8_f16_fpm(
+// CHECK-SAME: <4 x half> noundef [[VN:%.*]], <4 x half> noundef [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: [[VFCVTN2_I:%.*]] = tail call <8 x i8> @llvm.aarch64.neon.fp8.fcvtn.v8i8.v4f16(<4 x half> [[VN]], <4 x half> [[VM]])
+// CHECK-NEXT: ret <8 x i8> [[VFCVTN2_I]]
+//
+// CHECK-CXX-LABEL: define dso_local <8 x i8> @_Z21test_vcvt_mf8_f16_fpm13__Float16x4_tS_m(
+// CHECK-CXX-SAME: <4 x half> noundef [[VN:%.*]], <4 x half> noundef [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT: [[VFCVTN2_I:%.*]] = tail call <8 x i8> @llvm.aarch64.neon.fp8.fcvtn.v8i8.v4f16(<4 x half> [[VN]], <4 x half> [[VM]])
+// CHECK-CXX-NEXT: ret <8 x i8> [[VFCVTN2_I]]
+//
+mfloat8x8_t test_vcvt_mf8_f16_fpm(float16x4_t vn, float16x4_t vm, fpm_t fpm) {
+ return vcvt_mf8_f16_fpm(vn, vm, fpm);
+}
+
+// CHECK-LABEL: define dso_local <16 x i8> @test_vcvtq_mf8_f16_fpm(
+// CHECK-SAME: <8 x half> noundef [[VN:%.*]], <8 x half> noundef [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: [[VFCVTN2_I:%.*]] = tail call <16 x i8> @llvm.aarch64.neon.fp8.fcvtn.v16i8.v8f16(<8 x half> [[VN]], <8 x half> [[VM]])
+// CHECK-NEXT: ret <16 x i8> [[VFCVTN2_I]]
+//
+// CHECK-CXX-LABEL: define dso_local <16 x i8> @_Z22test_vcvtq_mf8_f16_fpm13__Float16x8_tS_m(
+// CHECK-CXX-SAME: <8 x half> noundef [[VN:%.*]], <8 x half> noundef [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT: [[VFCVTN2_I:%.*]] = tail call <16 x i8> @llvm.aarch64.neon.fp8.fcvtn.v16i8.v8f16(<8 x half> [[VN]], <8 x half> [[VM]])
+// CHECK-CXX-NEXT: ret <16 x i8> [[VFCVTN2_I]]
+//
+mfloat8x16_t test_vcvtq_mf8_f16_fpm(float16x8_t vn, float16x8_t vm, fpm_t fpm) {
+ return vcvtq_mf8_f16_fpm(vn, vm, fpm);
+}
diff --git a/llvm/test/CodeGen/AArch64/neon-fp8-cvt.ll b/llvm/test/CodeGen/AArch64/neon-fp8-cvt.ll
new file mode 100644
index 00000000000000..6070380d24234b
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/neon-fp8-cvt.ll
@@ -0,0 +1,112 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=aarch64-linux -mattr=+neon,+fp8 < %s | FileCheck %s
+
+define <8 x bfloat> @test_vbfcvtl1_low(<8 x i8> %vn) {
+; CHECK-LABEL: test_vbfcvtl1_low:
+; CHECK: // %bb.0:
+; CHECK-NEXT: bf1cvtl v0.8h, v0.8b
+; CHECK-NEXT: ret
+ %res = call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl1.v8bf16.v8i8(<8 x i8> %vn)
+ ret <8 x bfloat> %res
+}
+
+define <8 x bfloat> @test_vbfcvtl1_high(<16 x i8> %vn) {
+; CHECK-LABEL: test_vbfcvtl1_high:
+; CHECK: // %bb.0:
+; CHECK-NEXT: bf1cvtl2 v0.8h, v0.16b
+; CHECK-NEXT: ret
+ %res = call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl1.v8bf16.v16i8(<16 x i8> %vn)
+ ret <8 x bfloat> %res
+}
+
+define <8 x bfloat> @test_vbfcvtl2_low(<8 x i8> %vn) {
+; CHECK-LABEL: test_vbfcvtl2_low:
+; CHECK: // %bb.0:
+; CHECK-NEXT: bf2cvtl v0.8h, v0.8b
+; CHECK-NEXT: ret
+ %res = call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl2.v8bf16.v8i8(<8 x i8> %vn)
+ ret <8 x bfloat> %res
+}
+
+define <8 x bfloat> @test_vbfcvtl2_high(<16 x i8> %vn) {
+; CHECK-LABEL: test_vbfcvtl2_high:
+; CHECK: // %bb.0:
+; CHECK-NEXT: bf2cvtl2 v0.8h, v0.16b
+; CHECK-NEXT: ret
+ %res = call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl2.v8bf16.v16i8(<16 x i8> %vn)
+ ret <8 x bfloat> %res
+}
+
+
+define <8 x half> @test_vfcvtl1_low(<8 x i8> %vn) {
+; CHECK-LABEL: test_vfcvtl1_low:
+; CHECK: // %bb.0:
+; CHECK-NEXT: f1cvtl v0.8h, v0.8b
+; CHECK-NEXT: ret
+ %res = call <8 x half> @llvm.aarch64.neon.fp8.cvtl1.v8f16.v8i8(<8 x i8> %vn)
+ ret <8 x half> %res
+}
+
+define <8 x half> @test_vfcvtl1_high(<16 x i8> %vn) {
+; CHECK-LABEL: test_vfcvtl1_high:
+; CHECK: // %bb.0:
+; CHECK-NEXT: f1cvtl2 v0.8h, v0.16b
+; CHECK-NEXT: ret
+ %res = call <8 x half> @llvm.aarch64.neon.fp8.cvtl1.v8f16.v16i8(<16 x i8> %vn)
+ ret <8 x half> %res
+}
+
+define <8 x half> @test_vfcvtl2_low(<8 x i8> %vn) {
+; CHECK-LABEL: test_vfcvtl2_low:
+; CHECK: // %bb.0:
+; CHECK-NEXT: f2cvtl v0.8h, v0.8b
+; CHECK-NEXT: ret
+ %res = call <8 x half> @llvm.aarch64.neon.fp8.cvtl2.v8f16.v8i8(<8 x i8> %vn)
+ ret <8 x half> %res
+}
+
+define <8 x half> @test_vfcvtl2_high(<16 x i8> %vn) {
+; CHECK-LABEL: test_vfcvtl2_high:
+; CHECK: // %bb.0:
+; CHECK-NEXT: f2cvtl2 v0.8h, v0.16b
+; CHECK-NEXT: ret
+ %res = call <8 x half> @llvm.aarch64.neon.fp8.cvtl2.v8f16.v16i8(<16 x i8> %vn)
+ ret <8 x half> %res
+}
+
+define <8 x i8> @test_vcvtn_low_f8_f32(<4 x float> %vn, <4 x float> %vm) {
+; CHECK-LABEL: test_vcvtn_low_f8_f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtn v0.8b, v0.4s, v1.4s
+; CHECK-NEXT: ret
+ %res = call <8 x i8> @llvm.aarch64.neon.fp8.fcvtn.v8i8.v4f32(<4 x float> %vn, <4 x float> %vm)
+ ret <8 x i8> %res
+}
+
+define <16 x i8> @test_vcvtn_high_f8_f32(<16 x i8> %vd, <4 x float> %vn, <4 x float> %vm) {
+; CHECK-LABEL: test_vcvtn_high_f8_f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtn2 v0.16b, v1.4s, v2.4s
+; CHECK-NEXT: ret
+ %res = call <16 x i8> @llvm.aarch64.neon.fp8.fcvtn2.v16i8.v4f32(<16 x i8> %vd, <4 x float> %vn, <4 x float> %vm)
+ ret <16 x i8> %res
+}
+
+
+define <8 x i8> @test_vcvtn_f8_f16(<4 x half> %vn, <4 x half> %vm) {
+; CHECK-LABEL: test_vcvtn_f8_f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtn v0.8b, v0.4h, v1.4h
+; CHECK-NEXT: ret
+ %res = call <8 x i8> @llvm.aarch64.neon.fp8.fcvtn.v8i8.v4f16(<4 x half> %vn, <4 x half> %vm)
+ ret <8 x i8> %res
+}
+
+define <16 x i8> @test_vcvtn2_f8_f16(<8 x half> %vn, <8 x half> %vm) {
+; CHECK-LABEL: test_vcvtn2_f8_f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtn v0.16b, v0.8h, v1.8h
+; CHECK-NEXT: ret
+ %res = call <16 x i8> @llvm.aarch64.neon.fp8.fcvtn.v16i8.v8f16(<8 x half> %vn, <8 x half> %vm)
+ ret <16 x i8> %res
+}
>From f8ee48b3b62fb54e3104dc06a6c6f52f295f39b4 Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov at arm.com>
Date: Tue, 17 Dec 2024 10:49:19 +0000
Subject: [PATCH 09/11] [fixup] Refactor much of the common code into a helper

function (NFC)
---
clang/lib/CodeGen/CGBuiltin.cpp | 171 +++++++++++-----------------
clang/lib/CodeGen/CodeGenFunction.h | 4 +
2 files changed, 69 insertions(+), 106 deletions(-)
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index c7de2026abb44c..7497a779cae755 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -6921,6 +6921,23 @@ Value *CodeGenFunction::EmitNeonShiftVector(Value *V, llvm::Type *Ty,
return ConstantInt::get(Ty, neg ? -SV : SV);
}
+Value *CodeGenFunction::EmitFP8NeonCvtCall(unsigned IID, llvm::Type *Ty0,
+ llvm::Type *Ty1, bool Extract,
+ SmallVectorImpl<llvm::Value *> &Ops,
+ const CallExpr *E,
+ const char *name) {
+ llvm::Type *Tys[] = {Ty0, Ty1};
+ if (Extract) {
+ // Op[0] is mfloat8x16_t, but the intrinsic converts only the lower part of
+ // the vector.
+ Tys[1] = llvm::FixedVectorType::get(Int8Ty, 8);
+ Ops[0] = Builder.CreateExtractVector(Tys[1], Ops[0], Builder.getInt64(0));
+ }
+ llvm::Value *FPM =
+ EmitScalarOrConstFoldImmArg(/* ICEArguments */ 0, E->getNumArgs() - 1, E);
+ return EmitFP8NeonCall(CGM.getIntrinsic(IID, Tys), Ops, FPM, name);
+}
+
// Right-shift a vector by a constant.
Value *CodeGenFunction::EmitNeonRShiftImm(Value *Vec, Value *Shift,
llvm::Type *Ty, bool usgn,
@@ -12874,6 +12891,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
return V;
unsigned Int;
+ bool ExtractLow = false;
switch (BuiltinID) {
default: return nullptr;
case NEON::BI__builtin_neon_vbsl_v:
@@ -14088,117 +14106,58 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
Int = Intrinsic::aarch64_neon_vluti4q_laneq_x2;
return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vluti4q_laneq_x2");
}
- case NEON::BI__builtin_neon_vcvt1_bf16_mf8_fpm:
case NEON::BI__builtin_neon_vcvt1_low_bf16_mf8_fpm:
- case NEON::BI__builtin_neon_vcvt1_high_bf16_mf8_fpm: {
- Int = Intrinsic::aarch64_neon_fp8_cvtl1;
- llvm::Type *Tys[2];
- Tys[0] = llvm::FixedVectorType::get(BFloatTy, 8);
- // Op[1] is mfloat8x16_t, but the intrinsic converts only the lower part of
- // the vector.
- if (BuiltinID == NEON::BI__builtin_neon_vcvt1_low_bf16_mf8_fpm) {
- Tys[1] = GetNeonType(this, NeonTypeFlags(Type.getEltType(), false,
- /*isQuad*/ false));
- Ops[0] = Builder.CreateExtractVector(Tys[1], Ops[0], Builder.getInt64(0));
- } else
- Tys[1] = Ops[0]->getType();
- llvm::Value *FPM =
- EmitScalarOrConstFoldImmArg(ICEArguments, E->getNumArgs() - 1, E);
- return EmitFP8NeonCall(CGM.getIntrinsic(Int, Tys), Ops, FPM, "vbfcvt1");
- }
- case NEON::BI__builtin_neon_vcvt2_bf16_mf8_fpm:
+ ExtractLow = true;
+ LLVM_FALLTHROUGH;
+ case NEON::BI__builtin_neon_vcvt1_bf16_mf8_fpm:
+ case NEON::BI__builtin_neon_vcvt1_high_bf16_mf8_fpm:
+ return EmitFP8NeonCvtCall(Intrinsic::aarch64_neon_fp8_cvtl1,
+ llvm::FixedVectorType::get(BFloatTy, 8),
+ Ops[0]->getType(), ExtractLow, Ops, E, "vbfcvt1");
case NEON::BI__builtin_neon_vcvt2_low_bf16_mf8_fpm:
- case NEON::BI__builtin_neon_vcvt2_high_bf16_mf8_fpm: {
- Int = Intrinsic::aarch64_neon_fp8_cvtl2;
- llvm::Type *Tys[2];
- Tys[0] = llvm::FixedVectorType::get(BFloatTy, 8);
- // Op[1] is mfloat8x16_t, but the intrinsic converts only the lower
- // part of the vector.
- if (BuiltinID == NEON::BI__builtin_neon_vcvt2_low_bf16_mf8_fpm) {
- Tys[1] = GetNeonType(this, NeonTypeFlags(Type.getEltType(), false,
- /*isQuad*/ false));
- Ops[0] = Builder.CreateExtractVector(Tys[1], Ops[0], Builder.getInt64(0));
- } else
- Tys[1] = Ops[0]->getType();
- llvm::Value *FPM =
- EmitScalarOrConstFoldImmArg(ICEArguments, E->getNumArgs() - 1, E);
- return EmitFP8NeonCall(CGM.getIntrinsic(Int, Tys), Ops, FPM, "vbfcvt2");
- }
- case NEON::BI__builtin_neon_vcvt1_f16_mf8_fpm:
+ ExtractLow = true;
+ LLVM_FALLTHROUGH;
+ case NEON::BI__builtin_neon_vcvt2_bf16_mf8_fpm:
+ case NEON::BI__builtin_neon_vcvt2_high_bf16_mf8_fpm:
+ return EmitFP8NeonCvtCall(Intrinsic::aarch64_neon_fp8_cvtl2,
+ llvm::FixedVectorType::get(BFloatTy, 8),
+ Ops[0]->getType(), ExtractLow, Ops, E, "vbfcvt2");
case NEON::BI__builtin_neon_vcvt1_low_f16_mf8_fpm:
- case NEON::BI__builtin_neon_vcvt1_high_f16_mf8_fpm: {
- Int = Intrinsic::aarch64_neon_fp8_cvtl1;
- llvm::Type *Tys[2];
- Tys[0] = llvm::FixedVectorType::get(HalfTy, 8);
- // Op[1] is mfloat8x16_t, but the intrinsic converts only the lower
- // part of the vector.
- if (BuiltinID == NEON::BI__builtin_neon_vcvt1_low_f16_mf8_fpm) {
- Tys[1] = GetNeonType(this, NeonTypeFlags(Type.getEltType(), false,
- /*isQuad*/ false));
- Ops[0] = Builder.CreateExtractVector(Tys[1], Ops[0], Builder.getInt64(0));
- } else
- Tys[1] = Ops[0]->getType();
- llvm::Value *FPM =
- EmitScalarOrConstFoldImmArg(ICEArguments, E->getNumArgs() - 1, E);
- return EmitFP8NeonCall(CGM.getIntrinsic(Int, Tys), Ops, FPM, "vbfcvt1");
- }
- case NEON::BI__builtin_neon_vcvt2_f16_mf8_fpm:
+ ExtractLow = true;
+ LLVM_FALLTHROUGH;
+ case NEON::BI__builtin_neon_vcvt1_f16_mf8_fpm:
+ case NEON::BI__builtin_neon_vcvt1_high_f16_mf8_fpm:
+ return EmitFP8NeonCvtCall(Intrinsic::aarch64_neon_fp8_cvtl1,
+ llvm::FixedVectorType::get(HalfTy, 8),
+ Ops[0]->getType(), ExtractLow, Ops, E, "vbfcvt1");
case NEON::BI__builtin_neon_vcvt2_low_f16_mf8_fpm:
- case NEON::BI__builtin_neon_vcvt2_high_f16_mf8_fpm: {
- Int = Intrinsic::aarch64_neon_fp8_cvtl2;
- llvm::Type *Tys[2];
- Tys[0] = llvm::FixedVectorType::get(HalfTy, 8);
- // Op[1] is mfloat8x16_t, but the intrinsic converts only the lower
- // part of the vector.
- if (BuiltinID == NEON::BI__builtin_neon_vcvt2_low_f16_mf8_fpm) {
- Tys[1] = GetNeonType(this, NeonTypeFlags(Type.getEltType(), false,
- /*isQuad*/ false));
- Ops[0] = Builder.CreateExtractVector(Tys[1], Ops[0], Builder.getInt64(0));
- } else
- Tys[1] = Ops[0]->getType();
- llvm::Value *FPM =
- EmitScalarOrConstFoldImmArg(ICEArguments, E->getNumArgs() - 1, E);
- return EmitFP8NeonCall(CGM.getIntrinsic(Int, Tys), Ops, FPM, "vbfcvt2");
- }
- case NEON::BI__builtin_neon_vcvt_mf8_f32_fpm: {
- Int = Intrinsic::aarch64_neon_fp8_fcvtn;
- llvm::Type *Tys[2];
- Tys[0] = llvm::FixedVectorType::get(Int8Ty, 8);
- Tys[1] = Ops[0]->getType();
- llvm::Value *FPM =
- EmitScalarOrConstFoldImmArg(ICEArguments, E->getNumArgs() - 1, E);
- return EmitFP8NeonCall(CGM.getIntrinsic(Int, Tys), Ops, FPM, "vfcvtn");
- }
- case NEON::BI__builtin_neon_vcvt_mf8_f16_fpm: {
- Int = Intrinsic::aarch64_neon_fp8_fcvtn;
- llvm::Type *Tys[2];
- Tys[0] = llvm::FixedVectorType::get(Int8Ty, 8);
- // Gets the expected type, because arm_neon.h casts float16x4_t to int8x8_t
- Tys[1] = llvm::FixedVectorType::get(HalfTy, 4);
- llvm::Value *FPM =
- EmitScalarOrConstFoldImmArg(ICEArguments, E->getNumArgs() - 1, E);
- return EmitFP8NeonCall(CGM.getIntrinsic(Int, Tys), Ops, FPM, "vfcvtn");
- }
- case NEON::BI__builtin_neon_vcvtq_mf8_f16_fpm: {
- Int = Intrinsic::aarch64_neon_fp8_fcvtn;
- llvm::Type *Tys[2];
- Tys[0] = llvm::FixedVectorType::get(Int8Ty, 16);
- // Gets the expected type, because arm_neon.h casts float16x8_t to int8x16_t
- Tys[1] = llvm::FixedVectorType::get(HalfTy, 8);
- llvm::Value *FPM =
- EmitScalarOrConstFoldImmArg(ICEArguments, E->getNumArgs() - 1, E);
- return EmitFP8NeonCall(CGM.getIntrinsic(Int, Tys), Ops, FPM, "vfcvtn");
- }
+ ExtractLow = true;
+ LLVM_FALLTHROUGH;
+ case NEON::BI__builtin_neon_vcvt2_f16_mf8_fpm:
+ case NEON::BI__builtin_neon_vcvt2_high_f16_mf8_fpm:
+ return EmitFP8NeonCvtCall(Intrinsic::aarch64_neon_fp8_cvtl2,
+ llvm::FixedVectorType::get(HalfTy, 8),
+ Ops[0]->getType(), ExtractLow, Ops, E, "vbfcvt2");
+ case NEON::BI__builtin_neon_vcvt_mf8_f32_fpm:
+ return EmitFP8NeonCvtCall(Intrinsic::aarch64_neon_fp8_fcvtn,
+ llvm::FixedVectorType::get(Int8Ty, 8),
+ Ops[0]->getType(), false, Ops, E, "vfcvtn");
+ case NEON::BI__builtin_neon_vcvt_mf8_f16_fpm:
+ return EmitFP8NeonCvtCall(Intrinsic::aarch64_neon_fp8_fcvtn,
+ llvm::FixedVectorType::get(Int8Ty, 8),
+ llvm::FixedVectorType::get(HalfTy, 4), false, Ops,
+ E, "vfcvtn");
+ case NEON::BI__builtin_neon_vcvtq_mf8_f16_fpm:
+ return EmitFP8NeonCvtCall(Intrinsic::aarch64_neon_fp8_fcvtn,
+ llvm::FixedVectorType::get(Int8Ty, 16),
+ llvm::FixedVectorType::get(HalfTy, 8), false, Ops,
+ E, "vfcvtn");
case NEON::BI__builtin_neon_vcvt_high_mf8_f32_fpm: {
- Int = Intrinsic::aarch64_neon_fp8_fcvtn2;
- llvm::Type *Tys[2];
- Tys[0] = llvm::FixedVectorType::get(Int8Ty, 16);
- Tys[1] = Ops[1]->getType();
- Ops[0] = Builder.CreateInsertVector(Tys[0], PoisonValue::get(Tys[0]),
- Ops[0], Builder.getInt64(0));
- llvm::Value *FPM =
- EmitScalarOrConstFoldImmArg(ICEArguments, E->getNumArgs() - 1, E);
- return EmitFP8NeonCall(CGM.getIntrinsic(Int, Tys), Ops, FPM, "vfcvtn2");
+ llvm::Type *Ty = llvm::FixedVectorType::get(Int8Ty, 16);
+ Ops[0] = Builder.CreateInsertVector(Ty, PoisonValue::get(Ty), Ops[0],
+ Builder.getInt64(0));
+ return EmitFP8NeonCvtCall(Intrinsic::aarch64_neon_fp8_fcvtn2,
+ Ty, Ops[1]->getType(), false, Ops, E, "vfcvtn2");
}
case NEON::BI__builtin_neon_vamin_f16:
case NEON::BI__builtin_neon_vaminq_f16:
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index df4b25e015efd4..7c83541e83ba74 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -4666,6 +4666,10 @@ class CodeGenFunction : public CodeGenTypeCache {
llvm::Value *EmitFP8NeonCall(llvm::Function *F,
SmallVectorImpl<llvm::Value *> &O,
llvm::Value *FPM, const char *name);
+ llvm::Value *EmitFP8NeonCvtCall(unsigned IID, llvm::Type *Ty0,
+ llvm::Type *Ty1, bool Extract,
+ SmallVectorImpl<llvm::Value *> &Ops,
+ const CallExpr *E, const char *name);
llvm::Value *EmitNeonSplat(llvm::Value *V, llvm::Constant *Idx,
const llvm::ElementCount &Count);
llvm::Value *EmitNeonSplat(llvm::Value *V, llvm::Constant *Idx);
>From 42e3e848425b50a963d35d390043a8d16950f45e Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov at arm.com>
Date: Tue, 17 Dec 2024 11:28:49 +0000
Subject: [PATCH 10/11] [fixup] Add a target-feature test, remove the redundant bf16
guard
---
clang/include/clang/Basic/arm_neon.td | 2 +-
.../acle_neon_fp8_cvt.c | 43 +++++++++++++++++++
2 files changed, 44 insertions(+), 1 deletion(-)
create mode 100644 clang/test/Sema/aarch64-fp8-intrinsics/acle_neon_fp8_cvt.c
diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td
index cf1f8af513bf66..5fd2bfebd75982 100644
--- a/clang/include/clang/Basic/arm_neon.td
+++ b/clang/include/clang/Basic/arm_neon.td
@@ -2125,7 +2125,7 @@ let ArchGuard = "defined(__aarch64__)", TargetGuard = "lut" in {
}
}
-let ArchGuard = "defined(__aarch64__)", TargetGuard = "fp8,bf16,neon" in {
+let ArchGuard = "defined(__aarch64__)", TargetGuard = "fp8,neon" in {
def VBF1CVT_BF16_MF8 : VInst<"vcvt1_bf16_mf8_fpm", "(QB).V", "m">;
def VBF1CVT_LOW_BF16_MF8 : VInst<"vcvt1_low_bf16_mf8_fpm", "B.V", "Qm">;
def VBF2CVTL_BF16_MF8 : VInst<"vcvt2_bf16_mf8_fpm", "(QB).V", "m">;
diff --git a/clang/test/Sema/aarch64-fp8-intrinsics/acle_neon_fp8_cvt.c b/clang/test/Sema/aarch64-fp8-intrinsics/acle_neon_fp8_cvt.c
new file mode 100644
index 00000000000000..2c7004c7968a46
--- /dev/null
+++ b/clang/test/Sema/aarch64-fp8-intrinsics/acle_neon_fp8_cvt.c
@@ -0,0 +1,43 @@
+// RUN: %clang_cc1 -triple aarch64-linux-gnu -target-feature +neon -target-feature +bf16 -target-feature +faminmax -emit-llvm -verify %s -o /dev/null
+
+// REQUIRES: aarch64-registered-target
+
+#include <arm_neon.h>
+
+void test_features(float16x4_t vd4, float16x8_t vd8, float32x4_t va4,
+ mfloat8x8_t v8, mfloat8x16_t v16, fpm_t fpm) {
+ (void) vcvt1_bf16_mf8_fpm(v8, fpm);
+ // expected-error at -1 {{'vcvt1_bf16_mf8_fpm' requires target feature 'fp8'}}
+ (void) vcvt1_low_bf16_mf8_fpm(v16, fpm);
+ // expected-error at -1 {{'vcvt1_low_bf16_mf8_fpm' requires target feature 'fp8'}}
+ (void) vcvt2_bf16_mf8_fpm(v8, fpm);
+ // expected-error at -1 {{'vcvt2_bf16_mf8_fpm' requires target feature 'fp8'}}
+ (void) vcvt2_low_bf16_mf8_fpm(v16, fpm);
+ // expected-error at -1 {{'vcvt2_low_bf16_mf8_fpm' requires target feature 'fp8'}}
+
+ (void) vcvt1_high_bf16_mf8_fpm(v16, fpm);
+ // expected-error at -1 {{'vcvt1_high_bf16_mf8_fpm' requires target feature 'fp8'}}
+ (void) vcvt2_high_bf16_mf8_fpm(v16, fpm);
+ // expected-error at -1 {{'vcvt2_high_bf16_mf8_fpm' requires target feature 'fp8'}}
+
+ (void) vcvt1_f16_mf8_fpm(v8, fpm);
+ // expected-error at -1 {{'vcvt1_f16_mf8_fpm' requires target feature 'fp8'}}
+ (void) vcvt1_low_f16_mf8_fpm(v16, fpm);
+ // expected-error at -1 {{'vcvt1_low_f16_mf8_fpm' requires target feature 'fp8'}}
+ (void) vcvt2_f16_mf8_fpm(v8, fpm);
+ // expected-error at -1 {{'vcvt2_f16_mf8_fpm' requires target feature 'fp8'}}
+ (void) vcvt2_low_f16_mf8_fpm(v16, fpm);
+ // expected-error at -1 {{'vcvt2_low_f16_mf8_fpm' requires target feature 'fp8'}}
+ (void) vcvt1_high_f16_mf8_fpm(v16, fpm);
+ // expected-error at -1 {{'vcvt1_high_f16_mf8_fpm' requires target feature 'fp8'}}
+ (void) vcvt2_high_f16_mf8_fpm(v16, fpm);
+ // expected-error at -1 {{'vcvt2_high_f16_mf8_fpm' requires target feature 'fp8'}}
+ (void) vcvt_mf8_f32_fpm(va4, va4, fpm);
+ // expected-error at -1 {{'vcvt_mf8_f32_fpm' requires target feature 'fp8'}}
+ (void) vcvt_high_mf8_f32_fpm(v8, va4, va4, fpm);
+ // expected-error at -1 {{'vcvt_high_mf8_f32_fpm' requires target feature 'fp8'}}
+ (void) vcvt_mf8_f16_fpm(vd4, vd4, fpm);
+ // expected-error at -1 {{'vcvt_mf8_f16_fpm' requires target feature 'fp8'}}
+ (void) vcvtq_mf8_f16_fpm(vd8, vd8, fpm);
+ // expected-error at -1 {{'vcvtq_mf8_f16_fpm' requires target feature 'fp8'}}
+}
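
A usage sketch related to the diagnostics exercised above, assuming the compiler defines the ACLE feature macro __ARM_FEATURE_FP8 when the fp8 target feature is enabled (the macro and the helper name here are assumptions, not part of this patch):

#include <arm_neon.h>

#if defined(__ARM_FEATURE_FP8)
// Only compiled when the FP8 conversion intrinsics are available,
// so the "requires target feature 'fp8'" errors above cannot fire.
static inline mfloat8x8_t cvt_f16_to_fp8(float16x4_t a, float16x4_t b,
                                         fpm_t fpm) {
  return vcvt_mf8_f16_fpm(a, b, fpm);
}
#endif
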
>From 997ee58b444f4bc52f8c0334ad9ec4002863c3b2 Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov at arm.com>
Date: Fri, 3 Jan 2025 11:36:36 +0000
Subject: [PATCH 11/11] [fixup] Clear the NoManglingQ flag for FP8
---
clang/include/clang/Basic/arm_neon.td | 21 ++++++++++-----------
clang/utils/TableGen/NeonEmitter.cpp | 1 -
2 files changed, 10 insertions(+), 12 deletions(-)
diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td
index 5fd2bfebd75982..0152482d3e51f1 100644
--- a/clang/include/clang/Basic/arm_neon.td
+++ b/clang/include/clang/Basic/arm_neon.td
@@ -2127,25 +2127,24 @@ let ArchGuard = "defined(__aarch64__)", TargetGuard = "lut" in {
let ArchGuard = "defined(__aarch64__)", TargetGuard = "fp8,neon" in {
def VBF1CVT_BF16_MF8 : VInst<"vcvt1_bf16_mf8_fpm", "(QB).V", "m">;
- def VBF1CVT_LOW_BF16_MF8 : VInst<"vcvt1_low_bf16_mf8_fpm", "B.V", "Qm">;
+ def VBF1CVT_LOW_BF16_MF8 : VInst<"vcvt1_low_bf16_mf8_fpm", "B.V", "Hm">;
def VBF2CVTL_BF16_MF8 : VInst<"vcvt2_bf16_mf8_fpm", "(QB).V", "m">;
- def VBF2CVTL_LOW_BF16_MF8 : VInst<"vcvt2_low_bf16_mf8_fpm", "B.V", "Qm">;
- def VBF1CVTL2_HIGH_BF16_MF8 : VInst<"vcvt1_high_bf16_mf8_fpm", "B.V", "Qm">;
- def VBF2CVTL2_HIGH_BF16_MF8 : VInst<"vcvt2_high_bf16_mf8_fpm", "B.V", "Qm">;
+ def VBF2CVTL_LOW_BF16_MF8 : VInst<"vcvt2_low_bf16_mf8_fpm", "B.V", "Hm">;
+ def VBF1CVTL2_HIGH_BF16_MF8 : VInst<"vcvt1_high_bf16_mf8_fpm", "B.V", "Hm">;
+ def VBF2CVTL2_HIGH_BF16_MF8 : VInst<"vcvt2_high_bf16_mf8_fpm", "B.V", "Hm">;
}
let ArchGuard = "defined(__aarch64__)", TargetGuard = "fp8,neon" in {
def VF1CVT_F16_MF8 : VInst<"vcvt1_f16_mf8_fpm", "(>QF).V", "m">;
- def VF1CVT_LOW_F16_MF8 : VInst<"vcvt1_low_f16_mf8_fpm", "(>F).V", "Qm">;
+ def VF1CVT_LOW_F16_MF8 : VInst<"vcvt1_low_f16_mf8_fpm", "(>F).V", "Hm">;
def VF2CVTL_F16_MF8 : VInst<"vcvt2_f16_mf8_fpm", "(>QF).V", "m">;
- def VF2CVTL_LOW_F16_MF8 : VInst<"vcvt2_low_f16_mf8_fpm", "(>F).V", "Qm">;
- def VF1CVTL2_HIGH_F16_MF8 : VInst<"vcvt1_high_f16_mf8_fpm", "(>F).V", "Qm">;
- def VF2CVTL2_HIGH_F16_MF8 : VInst<"vcvt2_high_f16_mf8_fpm", "(>F).V", "Qm">;
+ def VF2CVTL_LOW_F16_MF8 : VInst<"vcvt2_low_f16_mf8_fpm", "(>F).V", "Hm">;
+ def VF1CVTL2_HIGH_F16_MF8 : VInst<"vcvt1_high_f16_mf8_fpm", "(>F).V", "Hm">;
+ def VF2CVTL2_HIGH_F16_MF8 : VInst<"vcvt2_high_f16_mf8_fpm", "(>F).V", "Hm">;
def VCVTN_LOW_F8_F32 : VInst<"vcvt_mf8_f32_fpm", ".(>>QF)(>>QF)V", "m">;
- def VCVTN_HIGH_F8_F32 : VInst<"vcvt_high_mf8_f32_fpm", ".(q)(>>F)(>>F)V", "Qm">;
- def VCVTN_F8_F16 : VInst<"vcvt_mf8_f16_fpm", ".(>F)(>F)V", "m">;
- def VCVTNQ_F8_F16 : VInst<"vcvtq_mf8_f16_fpm", ".(>F)(>F)V", "Qm">;
+ def VCVTN_HIGH_F8_F32 : VInst<"vcvt_high_mf8_f32_fpm", ".(q)(>>F)(>>F)V", "Hm">;
+ def VCVTN_F8_F16 : VInst<"vcvt_mf8_f16_fpm", ".(>F)(>F)V", "mQm">;
}
let ArchGuard = "defined(__aarch64__)", TargetGuard = "neon,faminmax" in {
diff --git a/clang/utils/TableGen/NeonEmitter.cpp b/clang/utils/TableGen/NeonEmitter.cpp
index fd971a4a532a8d..168960dbb7ecb9 100644
--- a/clang/utils/TableGen/NeonEmitter.cpp
+++ b/clang/utils/TableGen/NeonEmitter.cpp
@@ -897,7 +897,6 @@ void Type::applyTypespec(bool &Quad) {
case 'm':
Kind = MFloat8;
ElementBitwidth = 8;
- NoManglingQ = true;
break;
default:
llvm_unreachable("Unhandled type code!");